From 97c79a38cd454602645f0470ffb444b3b75ce574 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 28 Apr 2016 13:16:33 -0300 Subject: perf core: Per event callchain limit Additionally to being able to control the system wide maximum depth via /proc/sys/kernel/perf_event_max_stack, now we are able to ask for different depths per event, using perf_event_attr.sample_max_stack for that. This uses an u16 hole at the end of perf_event_attr, that, when perf_event_attr.sample_type has the PERF_SAMPLE_CALLCHAIN, if sample_max_stack is zero, means use perf_event_max_stack, otherwise it'll be bounds checked under callchain_mutex. Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: He Kuang Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/bpf/stackmap.c | 2 +- kernel/events/callchain.c | 14 ++++++++++++-- kernel/events/core.c | 5 ++++- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a82d7605db3f..f1de5c1a2af6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (err) goto free_smap; - err = get_callchain_buffers(); + err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 179ef4640964..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -104,7 +104,7 @@ fail: return -ENOMEM; } -int get_callchain_buffers(void) +int get_callchain_buffers(int event_max_stack) { int err = 0; int count; @@ -121,6 +121,15 @@ int get_callchain_buffers(void) /* If the allocation failed, give up */ if (!callchain_cpus_entries) err = -ENOMEM; + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) + err = -EOVERFLOW; goto exit; } @@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); } struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index 050a290c72c7..79363f298445 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8843,7 +8843,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); + err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } @@ -9165,6 +9165,9 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if (!attr.sample_max_stack) + attr.sample_max_stack = sysctl_perf_event_max_stack; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument -- cgit v1.3-8-gc7d7 From 2b4c7afe79a8a0a0e05edeaded5653c190153f9b Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sun, 15 May 2016 22:47:39 -0400 Subject: audit: fixup: log on errors from filter user rules In commit 724e4fcc the intention was to pass any errors back from audit_filter_user_rules() to audit_filter_user(). Add that code. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 94ca7b1e5e7e..8a8aa3fbc8d8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1339,8 +1339,8 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type, break; } - if (!result) - return 0; + if (result <= 0) + return result; } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; -- cgit v1.3-8-gc7d7 From e788892ba3cc71d385b75895f7a375fbc659ce86 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Jun 2016 23:24:15 +0200 Subject: cpufreq: governor: Get rid of governor events The design of the cpufreq governor API is not very straightforward, as struct cpufreq_governor provides only one callback to be invoked from different code paths for different purposes. The purpose it is invoked for is determined by its second "event" argument, causing it to act as a "callback multiplexer" of sorts. Unfortunately, that leads to extra complexity in governors, some of which implement the ->governor() callback as a switch statement that simply checks the event argument and invokes a separate function to handle that specific event. That extra complexity can be eliminated by replacing the all-purpose ->governor() callback with a family of callbacks to carry out specific governor operations: initialization and exit, start and stop and policy limits updates. That also turns out to reduce the code size too, so do it. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- arch/powerpc/platforms/cell/cpufreq_spudemand.c | 72 ++++++++-------- drivers/cpufreq/cpufreq.c | 31 ++++--- drivers/cpufreq/cpufreq_conservative.c | 7 +- drivers/cpufreq/cpufreq_governor.c | 39 +++------ drivers/cpufreq/cpufreq_governor.h | 20 ++++- drivers/cpufreq/cpufreq_ondemand.c | 7 +- drivers/cpufreq/cpufreq_performance.c | 17 +--- drivers/cpufreq/cpufreq_powersave.c | 17 +--- drivers/cpufreq/cpufreq_userspace.c | 104 ++++++++++++------------ include/linux/cpufreq.h | 14 ++-- kernel/sched/cpufreq_schedutil.c | 34 ++------ 11 files changed, 158 insertions(+), 204 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 82607d621aca..88301e53f085 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -85,61 +85,57 @@ static void spu_gov_cancel_work(struct spu_gov_info_struct *info) cancel_delayed_work_sync(&info->work); } -static int spu_gov_govern(struct cpufreq_policy *policy, unsigned int event) +static int spu_gov_start(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; - struct spu_gov_info_struct *info, *affected_info; + struct spu_gov_info_struct *info = &per_cpu(spu_gov_info, cpu); + struct spu_gov_info_struct *affected_info; int i; - int ret = 0; - info = &per_cpu(spu_gov_info, cpu); - - switch (event) { - case CPUFREQ_GOV_START: - if (!cpu_online(cpu)) { - printk(KERN_ERR "cpu %d is not online\n", cpu); - ret = -EINVAL; - break; - } + if (!cpu_online(cpu)) { + printk(KERN_ERR "cpu %d is not online\n", cpu); + return -EINVAL; + } - if (!policy->cur) { - printk(KERN_ERR "no cpu specified in policy\n"); - ret = -EINVAL; - break; - } + if (!policy->cur) { + printk(KERN_ERR "no cpu specified in policy\n"); + return -EINVAL; + } - /* initialize spu_gov_info for all affected cpus */ - for_each_cpu(i, policy->cpus) { - affected_info = &per_cpu(spu_gov_info, i); - affected_info->policy = policy; - } + /* initialize spu_gov_info for all affected cpus */ + for_each_cpu(i, policy->cpus) { + affected_info = &per_cpu(spu_gov_info, i); + affected_info->policy = policy; + } - info->poll_int = POLL_TIME; + info->poll_int = POLL_TIME; - /* setup timer */ - spu_gov_init_work(info); + /* setup timer */ + spu_gov_init_work(info); - break; + return 0; +} - case CPUFREQ_GOV_STOP: - /* cancel timer */ - spu_gov_cancel_work(info); +static void spu_gov_stop(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct spu_gov_info_struct *info = &per_cpu(spu_gov_info, cpu); + int i; - /* clean spu_gov_info for all affected cpus */ - for_each_cpu (i, policy->cpus) { - info = &per_cpu(spu_gov_info, i); - info->policy = NULL; - } + /* cancel timer */ + spu_gov_cancel_work(info); - break; + /* clean spu_gov_info for all affected cpus */ + for_each_cpu (i, policy->cpus) { + info = &per_cpu(spu_gov_info, i); + info->policy = NULL; } - - return ret; } static struct cpufreq_governor spu_governor = { .name = "spudemand", - .governor = spu_gov_govern, + .start = spu_gov_start, + .stop = spu_gov_stop, .owner = THIS_MODULE, }; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 268566e40684..d98ff688b2f1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2023,10 +2023,12 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) pr_debug("%s: for CPU %u\n", __func__, policy->cpu); - ret = policy->governor->governor(policy, CPUFREQ_GOV_POLICY_INIT); - if (ret) { - module_put(policy->governor->owner); - return ret; + if (policy->governor->init) { + ret = policy->governor->init(policy); + if (ret) { + module_put(policy->governor->owner); + return ret; + } } policy->governor->initialized++; @@ -2040,7 +2042,8 @@ static void cpufreq_exit_governor(struct cpufreq_policy *policy) pr_debug("%s: for CPU %u\n", __func__, policy->cpu); - policy->governor->governor(policy, CPUFREQ_GOV_POLICY_EXIT); + if (policy->governor->exit) + policy->governor->exit(policy); policy->governor->initialized--; module_put(policy->governor->owner); @@ -2061,11 +2064,15 @@ static int cpufreq_start_governor(struct cpufreq_policy *policy) if (cpufreq_driver->get && !cpufreq_driver->setpolicy) cpufreq_update_current_freq(policy); - ret = policy->governor->governor(policy, CPUFREQ_GOV_START); - if (ret) - return ret; + if (policy->governor->start) { + ret = policy->governor->start(policy); + if (ret) + return ret; + } + + if (policy->governor->limits) + policy->governor->limits(policy); - policy->governor->governor(policy, CPUFREQ_GOV_LIMITS); return 0; } @@ -2076,7 +2083,8 @@ static void cpufreq_stop_governor(struct cpufreq_policy *policy) pr_debug("%s: for CPU %u\n", __func__, policy->cpu); - policy->governor->governor(policy, CPUFREQ_GOV_STOP); + if (policy->governor->stop) + policy->governor->stop(policy); } static void cpufreq_governor_limits(struct cpufreq_policy *policy) @@ -2086,7 +2094,8 @@ static void cpufreq_governor_limits(struct cpufreq_policy *policy) pr_debug("%s: for CPU %u\n", __func__, policy->cpu); - policy->governor->governor(policy, CPUFREQ_GOV_LIMITS); + if (policy->governor->limits) + policy->governor->limits(policy); } int cpufreq_register_governor(struct cpufreq_governor *governor) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 316df247e00d..2568508fca38 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -313,12 +313,7 @@ static void cs_start(struct cpufreq_policy *policy) } static struct dbs_governor cs_dbs_gov = { - .gov = { - .name = "conservative", - .governor = cpufreq_governor_dbs, - .max_transition_latency = TRANSITION_LATENCY_LIMIT, - .owner = THIS_MODULE, - }, + .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"), .kobj_type = { .default_attrs = cs_attributes }, .gov_dbs_timer = cs_dbs_timer, .alloc = cs_alloc, diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index be498d56dd69..ca9927c15a71 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -389,7 +389,7 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs, gov->free(policy_dbs); } -static int cpufreq_governor_init(struct cpufreq_policy *policy) +int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); struct dbs_data *dbs_data; @@ -474,8 +474,9 @@ out: mutex_unlock(&gov_dbs_data_mutex); return ret; } +EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_init); -static int cpufreq_governor_exit(struct cpufreq_policy *policy) +void cpufreq_dbs_governor_exit(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); struct policy_dbs_info *policy_dbs = policy->governor_data; @@ -500,10 +501,10 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy) free_policy_dbs_info(policy_dbs, gov); mutex_unlock(&gov_dbs_data_mutex); - return 0; } +EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_exit); -static int cpufreq_governor_start(struct cpufreq_policy *policy) +int cpufreq_dbs_governor_start(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); struct policy_dbs_info *policy_dbs = policy->governor_data; @@ -539,14 +540,15 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy) gov_set_update_util(policy_dbs, sampling_rate); return 0; } +EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_start); -static int cpufreq_governor_stop(struct cpufreq_policy *policy) +void cpufreq_dbs_governor_stop(struct cpufreq_policy *policy) { gov_cancel_work(policy); - return 0; } +EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_stop); -static int cpufreq_governor_limits(struct cpufreq_policy *policy) +void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; @@ -560,26 +562,5 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy) gov_update_sample_delay(policy_dbs, 0); mutex_unlock(&policy_dbs->timer_mutex); - - return 0; -} - -int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) -{ - if (event == CPUFREQ_GOV_POLICY_INIT) { - return cpufreq_governor_init(policy); - } else if (policy->governor_data) { - switch (event) { - case CPUFREQ_GOV_POLICY_EXIT: - return cpufreq_governor_exit(policy); - case CPUFREQ_GOV_START: - return cpufreq_governor_start(policy); - case CPUFREQ_GOV_STOP: - return cpufreq_governor_stop(policy); - case CPUFREQ_GOV_LIMITS: - return cpufreq_governor_limits(policy); - } - } - return -EINVAL; } -EXPORT_SYMBOL_GPL(cpufreq_governor_dbs); +EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 34eb214b6d57..36f0d19dd869 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -148,6 +148,25 @@ static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy return container_of(policy->governor, struct dbs_governor, gov); } +/* Governor callback routines */ +int cpufreq_dbs_governor_init(struct cpufreq_policy *policy); +void cpufreq_dbs_governor_exit(struct cpufreq_policy *policy); +int cpufreq_dbs_governor_start(struct cpufreq_policy *policy); +void cpufreq_dbs_governor_stop(struct cpufreq_policy *policy); +void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy); + +#define CPUFREQ_DBS_GOVERNOR_INITIALIZER(_name_) \ + { \ + .name = _name_, \ + .max_transition_latency = TRANSITION_LATENCY_LIMIT, \ + .owner = THIS_MODULE, \ + .init = cpufreq_dbs_governor_init, \ + .exit = cpufreq_dbs_governor_exit, \ + .start = cpufreq_dbs_governor_start, \ + .stop = cpufreq_dbs_governor_stop, \ + .limits = cpufreq_dbs_governor_limits, \ + } + /* Governor specific operations */ struct od_ops { unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy, @@ -155,7 +174,6 @@ struct od_ops { }; unsigned int dbs_update(struct cpufreq_policy *policy); -int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); void od_register_powersave_bias_handler(unsigned int (*f) (struct cpufreq_policy *, unsigned int, unsigned int), unsigned int powersave_bias); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 300163430516..09d6c0c405e4 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -420,12 +420,7 @@ static struct od_ops od_ops = { }; static struct dbs_governor od_dbs_gov = { - .gov = { - .name = "ondemand", - .governor = cpufreq_governor_dbs, - .max_transition_latency = TRANSITION_LATENCY_LIMIT, - .owner = THIS_MODULE, - }, + .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"), .kobj_type = { .default_attrs = od_attributes }, .gov_dbs_timer = od_dbs_timer, .alloc = od_alloc, diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index dd4dca3ab844..dafb679adc58 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -16,25 +16,16 @@ #include #include -static int cpufreq_governor_performance(struct cpufreq_policy *policy, - unsigned int event) +static void cpufreq_gov_performance_limits(struct cpufreq_policy *policy) { - switch (event) { - case CPUFREQ_GOV_LIMITS: - pr_debug("setting to %u kHz\n", policy->max); - __cpufreq_driver_target(policy, policy->max, - CPUFREQ_RELATION_H); - break; - default: - break; - } - return 0; + pr_debug("setting to %u kHz\n", policy->max); + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); } static struct cpufreq_governor cpufreq_gov_performance = { .name = "performance", - .governor = cpufreq_governor_performance, .owner = THIS_MODULE, + .limits = cpufreq_gov_performance_limits, }; static int __init cpufreq_gov_performance_init(void) diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index 691e573e46ee..78a651038faf 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -16,24 +16,15 @@ #include #include -static int cpufreq_governor_powersave(struct cpufreq_policy *policy, - unsigned int event) +static void cpufreq_gov_powersave_limits(struct cpufreq_policy *policy) { - switch (event) { - case CPUFREQ_GOV_LIMITS: - pr_debug("setting to %u kHz\n", policy->min); - __cpufreq_driver_target(policy, policy->min, - CPUFREQ_RELATION_L); - break; - default: - break; - } - return 0; + pr_debug("setting to %u kHz\n", policy->min); + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); } static struct cpufreq_governor cpufreq_gov_powersave = { .name = "powersave", - .governor = cpufreq_governor_powersave, + .limits = cpufreq_gov_powersave_limits, .owner = THIS_MODULE, }; diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 9f3dec9a3f36..bd897e3e134d 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -65,66 +65,66 @@ static int cpufreq_userspace_policy_init(struct cpufreq_policy *policy) return 0; } -static int cpufreq_governor_userspace(struct cpufreq_policy *policy, - unsigned int event) +static void cpufreq_userspace_policy_exit(struct cpufreq_policy *policy) +{ + mutex_lock(&userspace_mutex); + kfree(policy->governor_data); + policy->governor_data = NULL; + mutex_unlock(&userspace_mutex); +} + +static int cpufreq_userspace_policy_start(struct cpufreq_policy *policy) { unsigned int *setspeed = policy->governor_data; - unsigned int cpu = policy->cpu; - int rc = 0; - if (event == CPUFREQ_GOV_POLICY_INIT) - return cpufreq_userspace_policy_init(policy); + BUG_ON(!policy->cur); + pr_debug("started managing cpu %u\n", policy->cpu); - if (!setspeed) - return -EINVAL; - - switch (event) { - case CPUFREQ_GOV_POLICY_EXIT: - mutex_lock(&userspace_mutex); - policy->governor_data = NULL; - kfree(setspeed); - mutex_unlock(&userspace_mutex); - break; - case CPUFREQ_GOV_START: - BUG_ON(!policy->cur); - pr_debug("started managing cpu %u\n", cpu); - - mutex_lock(&userspace_mutex); - per_cpu(cpu_is_managed, cpu) = 1; - *setspeed = policy->cur; - mutex_unlock(&userspace_mutex); - break; - case CPUFREQ_GOV_STOP: - pr_debug("managing cpu %u stopped\n", cpu); - - mutex_lock(&userspace_mutex); - per_cpu(cpu_is_managed, cpu) = 0; - *setspeed = 0; - mutex_unlock(&userspace_mutex); - break; - case CPUFREQ_GOV_LIMITS: - mutex_lock(&userspace_mutex); - pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz, last set to %u kHz\n", - cpu, policy->min, policy->max, policy->cur, *setspeed); - - if (policy->max < *setspeed) - __cpufreq_driver_target(policy, policy->max, - CPUFREQ_RELATION_H); - else if (policy->min > *setspeed) - __cpufreq_driver_target(policy, policy->min, - CPUFREQ_RELATION_L); - else - __cpufreq_driver_target(policy, *setspeed, - CPUFREQ_RELATION_L); - mutex_unlock(&userspace_mutex); - break; - } - return rc; + mutex_lock(&userspace_mutex); + per_cpu(cpu_is_managed, policy->cpu) = 1; + *setspeed = policy->cur; + mutex_unlock(&userspace_mutex); + return 0; +} + +static void cpufreq_userspace_policy_stop(struct cpufreq_policy *policy) +{ + unsigned int *setspeed = policy->governor_data; + + pr_debug("managing cpu %u stopped\n", policy->cpu); + + mutex_lock(&userspace_mutex); + per_cpu(cpu_is_managed, policy->cpu) = 0; + *setspeed = 0; + mutex_unlock(&userspace_mutex); +} + +static void cpufreq_userspace_policy_limits(struct cpufreq_policy *policy) +{ + unsigned int *setspeed = policy->governor_data; + + mutex_lock(&userspace_mutex); + + pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz, last set to %u kHz\n", + policy->cpu, policy->min, policy->max, policy->cur, *setspeed); + + if (policy->max < *setspeed) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > *setspeed) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + else + __cpufreq_driver_target(policy, *setspeed, CPUFREQ_RELATION_L); + + mutex_unlock(&userspace_mutex); } static struct cpufreq_governor cpufreq_gov_userspace = { .name = "userspace", - .governor = cpufreq_governor_userspace, + .init = cpufreq_userspace_policy_init, + .exit = cpufreq_userspace_policy_exit, + .start = cpufreq_userspace_policy_start, + .stop = cpufreq_userspace_policy_stop, + .limits = cpufreq_userspace_policy_limits, .store_setspeed = cpufreq_set, .show_setspeed = show_speed, .owner = THIS_MODULE, diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 4e81e08db752..5bcda3e6aa9e 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -455,18 +455,14 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, #define MIN_LATENCY_MULTIPLIER (20) #define TRANSITION_LATENCY_LIMIT (10 * 1000 * 1000) -/* Governor Events */ -#define CPUFREQ_GOV_START 1 -#define CPUFREQ_GOV_STOP 2 -#define CPUFREQ_GOV_LIMITS 3 -#define CPUFREQ_GOV_POLICY_INIT 4 -#define CPUFREQ_GOV_POLICY_EXIT 5 - struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; int initialized; - int (*governor) (struct cpufreq_policy *policy, - unsigned int event); + int (*init)(struct cpufreq_policy *policy); + void (*exit)(struct cpufreq_policy *policy); + int (*start)(struct cpufreq_policy *policy); + void (*stop)(struct cpufreq_policy *policy); + void (*limits)(struct cpufreq_policy *policy); ssize_t (*show_setspeed) (struct cpufreq_policy *policy, char *buf); int (*store_setspeed) (struct cpufreq_policy *policy, diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 14c4aa25cc45..fdcee3cf38fc 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -394,7 +394,7 @@ static int sugov_init(struct cpufreq_policy *policy) return ret; } -static int sugov_exit(struct cpufreq_policy *policy) +static void sugov_exit(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; struct sugov_tunables *tunables = sg_policy->tunables; @@ -412,7 +412,6 @@ static int sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); - return 0; } static int sugov_start(struct cpufreq_policy *policy) @@ -444,7 +443,7 @@ static int sugov_start(struct cpufreq_policy *policy) return 0; } -static int sugov_stop(struct cpufreq_policy *policy) +static void sugov_stop(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; @@ -456,10 +455,9 @@ static int sugov_stop(struct cpufreq_policy *policy) irq_work_sync(&sg_policy->irq_work); cancel_work_sync(&sg_policy->work); - return 0; } -static int sugov_limits(struct cpufreq_policy *policy) +static void sugov_limits(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; @@ -477,32 +475,16 @@ static int sugov_limits(struct cpufreq_policy *policy) } sg_policy->need_freq_update = true; - return 0; -} - -int sugov_governor(struct cpufreq_policy *policy, unsigned int event) -{ - if (event == CPUFREQ_GOV_POLICY_INIT) { - return sugov_init(policy); - } else if (policy->governor_data) { - switch (event) { - case CPUFREQ_GOV_POLICY_EXIT: - return sugov_exit(policy); - case CPUFREQ_GOV_START: - return sugov_start(policy); - case CPUFREQ_GOV_STOP: - return sugov_stop(policy); - case CPUFREQ_GOV_LIMITS: - return sugov_limits(policy); - } - } - return -EINVAL; } static struct cpufreq_governor schedutil_gov = { .name = "schedutil", - .governor = sugov_governor, .owner = THIS_MODULE, + .init = sugov_init, + .exit = sugov_exit, + .start = sugov_start, + .stop = sugov_stop, + .limits = sugov_limits, }; static int __init sugov_module_init(void) -- cgit v1.3-8-gc7d7 From bf2be2de8493dd5f86d6e0f0d4eecb5810ad035b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 18 May 2016 17:55:31 +0530 Subject: cpufreq: governor: Create cpufreq_policy_apply_limits() Create a new helper to avoid code duplication across governors. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 7 +------ include/linux/cpufreq.h | 8 ++++++++ kernel/sched/cpufreq_schedutil.c | 9 +-------- 3 files changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 18eab7f4ba4c..4b88b5bec4ef 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -553,12 +553,7 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; mutex_lock(&policy_dbs->timer_mutex); - - if (policy->max < policy->cur) - __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); - else if (policy->min > policy->cur) - __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); - + cpufreq_policy_apply_limits(policy); gov_update_sample_delay(policy_dbs, 0); mutex_unlock(&policy_dbs->timer_mutex); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 5bcda3e6aa9e..3be54a0b4373 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -489,6 +489,14 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor); struct cpufreq_governor *cpufreq_default_governor(void); struct cpufreq_governor *cpufreq_fallback_governor(void); +static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy) +{ + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); +} + /* Governor attribute set */ struct gov_attr_set { struct kobject kobj; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index fdcee3cf38fc..758efd7f3abe 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -463,14 +463,7 @@ static void sugov_limits(struct cpufreq_policy *policy) if (!policy->fast_switch_enabled) { mutex_lock(&sg_policy->work_lock); - - if (policy->max < policy->cur) - __cpufreq_driver_target(policy, policy->max, - CPUFREQ_RELATION_H); - else if (policy->min > policy->cur) - __cpufreq_driver_target(policy, policy->min, - CPUFREQ_RELATION_L); - + cpufreq_policy_apply_limits(policy); mutex_unlock(&sg_policy->work_lock); } -- cgit v1.3-8-gc7d7 From 1a99ae3f00d3c7c7885ee529ac9a874b19caa0cf Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Tue, 10 May 2016 21:03:18 +0800 Subject: sched/fair: Fix the wrong throttled clock time for cfs_rq_clock_task() Two minor fixes for cfs_rq_clock_task(): 1) If cfs_rq is currently being throttled, we need to subtract the cfs throttled clock time. 2) Make "throttled_clock_task_time" update SMP unrelated. Now UP cases need it as well. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1462885398-14724-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..1e87bb633d43 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3688,7 +3688,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task; + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } @@ -3826,13 +3826,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; cfs_rq->throttle_count--; -#ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } -#endif return 0; } -- cgit v1.3-8-gc7d7 From df55f462b905f3b2d40ec3fb865891382a6ebfb1 Mon Sep 17 00:00:00 2001 From: "Gaurav Jindal (Gaurav Jindal)" Date: Thu, 12 May 2016 10:13:33 +0000 Subject: sched/idle: Optimize the generic idle loop Currently, smp_processor_id() is used to fetch the current CPU in cpu_idle_loop(). Every time the idle thread runs, it fetches the current CPU using smp_processor_id(). Since the idle thread is per CPU, the current CPU is constant, so we can lift the load out of the loop, saving execution cycles/time in the loop. x86-64: Before patch (execution in loop): 148: 0f ae e8 lfence 14b: 65 8b 04 25 00 00 00 00 mov %gs:0x0,%eax 152: 00 153: 89 c0 mov %eax,%eax 155: 49 0f a3 04 24 bt %rax,(%r12) After patch (execution in loop): 150: 0f ae e8 lfence 153: 4d 0f a3 34 24 bt %r14,(%r12) ARM64: Before patch (execution in loop): 168: d5033d9f dsb ld 16c: b9405661 ldr w1,[x19,#84] 170: 1100fc20 add w0,w1,#0x3f 174: 6b1f003f cmp w1,wzr 178: 1a81b000 csel w0,w0,w1,lt 17c: 130c7000 asr w0,w0,#6 180: 937d7c00 sbfiz x0,x0,#3,#32 184: f8606aa0 ldr x0,[x21,x0] 188: 9ac12401 lsr x1,x0,x1 18c: 36000e61 tbz w1,#0,358 After patch (execution in loop): 1a8: d50339df dsb ld 1ac: f8776ac0 ldr x0,[x22,x23] ab0: ea18001f tst x0,x24 1b4: 54000ea0 b.eq 388 Further observance on ARM64 for 4 seconds shows that cpu_idle_loop is called 8672 times. Shifting the code will save instructions executed in loop and eventually time as well. Signed-off-by: Gaurav Jindal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sanjeev Yadav Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160512101330.GA488@gauravjindalubtnb.del.spreadtrum.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..db4ff7c100b9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -201,6 +201,8 @@ exit_idle: */ static void cpu_idle_loop(void) { + int cpu = smp_processor_id(); + while (1) { /* * If the arch has a polling bit, we maintain an invariant: @@ -219,7 +221,7 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit v1.3-8-gc7d7 From 150593bf869393d10a79f6bd3df2585ecc20a9bb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 18 May 2016 19:02:18 +0200 Subject: sched/api: Introduce task_rcu_dereference() and try_get_task_struct() Generally task_struct is only protected by RCU if it was found on a RCU protected list (say, for_each_process() or find_task_by_vpid()). As Kirill pointed out rq->curr isn't protected by RCU, the scheduler drops the (potentially) last reference without RCU gp, this means that we need to fix the code which uses foreign_rq->curr under rcu_read_lock(). Add a new helper which can be used to dereference rq->curr or any other pointer to task_struct assuming that it should be cleared or updated before the final put_task_struct(). It returns non-NULL only if this task can't go away before rcu_read_unlock(). ( Also add try_get_task_struct() to make it easier to use this API correctly. ) Suggested-by: Kirill Tkhai Signed-off-by: Oleg Nesterov [ Updated comments; added try_get_task_struct()] Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/20160518170218.GY3192@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++ kernel/exit.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..dee41bf59e6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } +struct task_struct *task_rcu_dereference(struct task_struct **ptask); +struct task_struct *try_get_task_struct(struct task_struct **ptask); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime); diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..2fb4d44c51b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -210,6 +210,82 @@ repeat: goto repeat; } +/* + * Note that if this function returns a valid task_struct pointer (!NULL) + * task->usage must remain >0 for the duration of the RCU critical section. + */ +struct task_struct *task_rcu_dereference(struct task_struct **ptask) +{ + struct sighand_struct *sighand; + struct task_struct *task; + + /* + * We need to verify that release_task() was not called and thus + * delayed_put_task_struct() can't run and drop the last reference + * before rcu_read_unlock(). We check task->sighand != NULL, + * but we can read the already freed and reused memory. + */ +retry: + task = rcu_dereference(*ptask); + if (!task) + return NULL; + + probe_kernel_address(&task->sighand, sighand); + + /* + * Pairs with atomic_dec_and_test() in put_task_struct(). If this task + * was already freed we can not miss the preceding update of this + * pointer. + */ + smp_rmb(); + if (unlikely(task != READ_ONCE(*ptask))) + goto retry; + + /* + * We've re-checked that "task == *ptask", now we have two different + * cases: + * + * 1. This is actually the same task/task_struct. In this case + * sighand != NULL tells us it is still alive. + * + * 2. This is another task which got the same memory for task_struct. + * We can't know this of course, and we can not trust + * sighand != NULL. + * + * In this case we actually return a random value, but this is + * correct. + * + * If we return NULL - we can pretend that we actually noticed that + * *ptask was updated when the previous task has exited. Or pretend + * that probe_slab_address(&sighand) reads NULL. + * + * If we return the new task (because sighand is not NULL for any + * reason) - this is fine too. This (new) task can't go away before + * another gp pass. + * + * And note: We could even eliminate the false positive if re-read + * task->sighand once again to avoid the falsely NULL. But this case + * is very unlikely so we don't care. + */ + if (!sighand) + return NULL; + + return task; +} + +struct task_struct *try_get_task_struct(struct task_struct **ptask) +{ + struct task_struct *task; + + rcu_read_lock(); + task = task_rcu_dereference(ptask); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected -- cgit v1.3-8-gc7d7 From bac7857319bcf7fed329a10bb760053e761115c0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 18 May 2016 21:57:33 +0200 Subject: sched/fair: Use task_rcu_dereference() Simplify task_numa_compare()'s task reference magic by using task_rcu_dereference(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Kirill Tkhai Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/20160518195733.GA15914@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e87bb633d43..c6dd8bab010c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1305,6 +1305,8 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); + if (p) + get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1372,31 +1374,11 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; - bool assigned = false; rcu_read_lock(); - - raw_spin_lock_irq(&dst_rq->lock); - cur = dst_rq->curr; - /* - * No need to move the exiting task or idle task. - */ - if ((cur->flags & PF_EXITING) || is_idle_task(cur)) + cur = task_rcu_dereference(&dst_rq->curr); + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; - else { - /* - * The task_struct must be protected here to protect the - * p->numa_faults access in the task_weight since the - * numa_faults could already be freed in the following path: - * finish_task_switch() - * --> put_task_struct() - * --> __put_task_struct() - * --> task_numa_free() - */ - get_task_struct(cur); - } - - raw_spin_unlock_irq(&dst_rq->lock); /* * Because we have preemption enabled we can get migrated around and @@ -1479,7 +1461,6 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; - put_task_struct(cur); cur = NULL; goto assign; } @@ -1505,16 +1486,9 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: - assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); - /* - * The dst_rq->curr isn't assigned. The protection for task_struct is - * finished. - */ - if (cur && !assigned) - put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, -- cgit v1.3-8-gc7d7 From f2fb6bef92514432398a653df1c2f1041d79ac46 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 23 Mar 2016 11:24:37 -0700 Subject: perf/core: Optimize side-band event delivery The perf_event_aux() function iterates all PMUs and all events in their respective per-CPU contexts to find the events to deliver side-band records to. For example, the brk test case in lkp triggers many mmap() operations, which, if we're also running perf, results in many perf_event_aux() invocations. If we enable uncore PMU support (even when uncore events are not used), dozens of uncore PMUs will be iterated, which can significantly decrease brk_test's throughput. For example, the brk throughput: without uncore PMUs: 2647573 ops_per_sec with uncore PMUs: 1768444 ops_per_sec ... a 33% reduction. To get at the per-CPU events that need side-band records, this patch puts these events on a per-CPU list, this avoids iterating the PMUs and any events that do not need side-band records. Per task events are unchanged to avoid extra overhead on the context switch paths. Suggested-by: Peter Zijlstra (Intel) Reported-by: Huang, Ying Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1458757477-3781-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ++++ kernel/events/core.c | 85 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0e43355c7aad..92e9ce737432 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -517,6 +517,11 @@ struct swevent_hlist { struct perf_cgroup; struct ring_buffer; +struct pmu_event_list { + raw_spinlock_t lock; + struct list_head list; +}; + /** * struct perf_event - performance event kernel representation: */ @@ -675,6 +680,7 @@ struct perf_event { int cgrp_defer_enabled; #endif + struct list_head sb_list; #endif /* CONFIG_PERF_EVENTS */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 79363f298445..6615c8922ee3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -335,6 +335,7 @@ static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -3665,6 +3666,26 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); +static void detach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_del_rcu(&event->sb_list); + raw_spin_unlock(&pel->lock); +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + return; + + detach_sb_event(event); +} + static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -3728,6 +3749,8 @@ static void unaccount_event(struct perf_event *event) } unaccount_event_cpu(event, event->cpu); + + unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) @@ -5888,13 +5911,25 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, rcu_read_unlock(); } +static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data) +{ + struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); + struct perf_event *event; + + list_for_each_entry_rcu(event, &pel->list, sb_list) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + output(event, data); + } +} + static void perf_event_aux(perf_event_aux_output_cb output, void *data, struct perf_event_context *task_ctx) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; int ctxn; /* @@ -5909,20 +5944,15 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, } rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data, false); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; + preempt_disable(); + perf_event_sb_iterate(output, data); + + for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) perf_event_aux_ctx(ctx, output, data, false); -next: - put_cpu_ptr(pmu->pmu_cpu_context); } + preempt_enable(); rcu_read_unlock(); } @@ -8615,6 +8645,32 @@ unlock: return pmu; } +static void attach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_add_rcu(&event->sb_list, &pel->list); + raw_spin_unlock(&pel->lock); +} + +static void account_pmu_sb_event(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + return; + + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + attach_sb_event(event); +} + static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -8695,6 +8751,8 @@ static void account_event(struct perf_event *event) enabled: account_event_cpu(event, event->cpu); + + account_pmu_sb_event(event); } /* @@ -10203,6 +10261,9 @@ static void __init perf_event_init_all_cpus(void) swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + + INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); + raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); } } -- cgit v1.3-8-gc7d7 From aab5b71ef2b5c62323b9abe397e2db57b18e1f78 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 May 2016 17:26:46 +0200 Subject: perf/core: Rename the perf_event_aux*() APIs to perf_event_sb*(), to separate them from AUX ring-buffer records There are now two different things called AUX in perf, the infrastructure to deliver the mmap/comm/task records and the AUX part in the mmap buffer (with associated AUX_RECORD). Since the former is internal, rename it to side-band to reduce the confusion factor. No change in functionality. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 69 +++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6615c8922ee3..f54454ea5f31 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5879,11 +5879,11 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); +typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void -perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_output_cb output, +perf_iterate_ctx(struct perf_event_context *ctx, + perf_iterate_f output, void *data, bool all) { struct perf_event *event; @@ -5900,18 +5900,7 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } } -static void -perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) -{ - rcu_read_lock(); - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data, false); - preempt_enable(); - rcu_read_unlock(); -} - -static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data) +static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); struct perf_event *event; @@ -5925,33 +5914,40 @@ static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data) } } +/* + * Iterate all events that need to receive side-band events. + * + * For new callers; ensure that account_pmu_sb_event() includes + * your event, otherwise it might not get delivered. + */ static void -perf_event_aux(perf_event_aux_output_cb output, void *data, +perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; int ctxn; + rcu_read_lock(); + preempt_disable(); + /* - * If we have task_ctx != NULL we only notify - * the task context itself. The task_ctx is set - * only for EXIT events before releasing task + * If we have task_ctx != NULL we only notify the task context itself. + * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { - perf_event_aux_task_ctx(output, data, task_ctx); - return; + perf_iterate_ctx(task_ctx, output, data, false); + goto done; } - rcu_read_lock(); - preempt_disable(); - perf_event_sb_iterate(output, data); + perf_iterate_sb_cpu(output, data); for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data, false); + perf_iterate_ctx(ctx, output, data, false); } +done: preempt_enable(); rcu_read_unlock(); } @@ -6001,7 +5997,7 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctxn); - perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); } rcu_read_unlock(); @@ -6045,9 +6041,9 @@ static int __perf_pmu_output_stop(void *info) }; rcu_read_lock(); - perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) - perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); @@ -6176,7 +6172,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_output, + perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } @@ -6255,7 +6251,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_output, + perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } @@ -6486,7 +6482,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_output, + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -6569,7 +6565,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) if (!ctx) continue; - perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); } rcu_read_unlock(); } @@ -6756,7 +6752,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - perf_event_aux(perf_event_switch_output, + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } @@ -8654,6 +8650,13 @@ static void attach_sb_event(struct perf_event *event) raw_spin_unlock(&pel->lock); } +/* + * We keep a list of all !task (and therefore per-cpu) events + * that need to receive side-band records. + * + * This avoids having to scan all the various PMU per-cpu contexts + * looking for them. + */ static void account_pmu_sb_event(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; -- cgit v1.3-8-gc7d7 From ab7fdefba68f66c8523571c3b3a940635d781824 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 3 May 2016 00:26:06 -0700 Subject: perf/core: Fix implicitly enable dynamic interrupt throttle This patch fixes an issue which was introduced by commit: 91a612eea9a3 ("perf/core: Fix dynamic interrupt throttle") ... which commit unconditionally sets the perf_sample_allowed_ns value to !0. But that could trigger a bug in the following corner case: The user can disable the dynamic interrupt throttle mechanism by setting perf_cpu_time_max_percent to 0. Then they change perf_event_max_sample_rate. For this case, the mechanism will be enabled implicitly, because perf_sample_allowed_ns becomes !0 - which is not what we want. This patch only updates perf_sample_allowed_ns when the dynamic interrupt throttle mechanism is enabled. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Link: http://lkml.kernel.org/r/1462260366-3160-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f54454ea5f31..f94f164b5054 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -397,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* + * If throttling is disabled don't allow the write: + */ + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) + return -EINVAL; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); -- cgit v1.3-8-gc7d7 From a1396555abff9ff9b74c2e4da13e27e81fd094b2 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 9 May 2016 15:07:40 +0530 Subject: perf/abi: Change the errno for sampling event not supported in hardware Change the return code for sampling event not supported from -ENOTSUPP to -EOPNOTSUPP. This allows userspace to identify this case specifically, instead of printing the catch-all error message it did previously. Technically this is an ABI change, but we think we can get away with it. Old behavior: ------- | # perf record ls | Error: | The sys_perf_event_open() syscall returned with 524 (Unknown error 524) | for event (cycles:ppp). | /bin/dmesg may provide additional information. | No CONFIG_PERF_EVENTS=y kernel support configured? New behavior: ------- | # perf record ls | Error: | PMU Hardware doesn't support sampling/overflow-interrupts. Signed-off-by: Vineet Gupta Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vineet Gupta Link: http://lkml.kernel.org/r/1462786660-2900-3-git-send-email-vgupta@synopsys.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f94f164b5054..5d48306879d5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9309,7 +9309,7 @@ SYSCALL_DEFINE5(perf_event_open, if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto err_alloc; } } -- cgit v1.3-8-gc7d7 From 133e89ef5ef338e1358b16246521ba17d935c396 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 13 May 2016 11:56:26 -0700 Subject: locking/rwsem: Enable lockless waiter wakeup(s) As wake_qs gain users, we can teach rwsems about them such that waiters can be awoken without the wait_lock. This is for both readers and writer, the former being the most ideal candidate as we can batch the wakeups shortening the critical region that much more -- ie writer task blocking a bunch of tasks waiting to service page-faults (mmap_sem readers). In general applying wake_qs to rwsem (xadd) is not difficult as the wait_lock is intended to be released soon _anyways_, with the exception of when a writer slowpath will proactively wakeup any queued readers if it sees that the lock is owned by a reader, in which we simply do the wakeups with the lock held (see comment in __rwsem_down_write_failed_common()). Similar to other locking primitives, delaying the waiter being awoken does allow, at least in theory, the lock to be stolen in the case of writers, however no harm was seen in this (in fact lock stealing tends to be a _good_ thing in most workloads), and this is a tiny window anyways. Some page-fault (pft) and mmap_sem intensive benchmarks show some pretty constant reduction in systime (by up to ~8 and ~10%) on a 2-socket, 12 core AMD box. In addition, on an 8-core Westmere doing page allocations (page_test) aim9: 4.6-rc6 4.6-rc6 rwsemv2 Min page_test 378167.89 ( 0.00%) 382613.33 ( 1.18%) Min exec_test 499.00 ( 0.00%) 502.67 ( 0.74%) Min fork_test 3395.47 ( 0.00%) 3537.64 ( 4.19%) Hmean page_test 395433.06 ( 0.00%) 414693.68 ( 4.87%) Hmean exec_test 499.67 ( 0.00%) 505.30 ( 1.13%) Hmean fork_test 3504.22 ( 0.00%) 3594.95 ( 2.59%) Stddev page_test 17426.57 ( 0.00%) 26649.92 (-52.93%) Stddev exec_test 0.47 ( 0.00%) 1.41 (-199.05%) Stddev fork_test 63.74 ( 0.00%) 32.59 ( 48.86%) Max page_test 429873.33 ( 0.00%) 456960.00 ( 6.30%) Max exec_test 500.33 ( 0.00%) 507.66 ( 1.47%) Max fork_test 3653.33 ( 0.00%) 3650.90 ( -0.07%) 4.6-rc6 4.6-rc6 rwsemv2 User 1.12 0.04 System 0.23 0.04 Elapsed 727.27 721.98 Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Cc: dave@stgolabs.net Cc: jason.low2@hp.com Cc: peter@hurleysoftware.com Link: http://lkml.kernel.org/r/1463165787-25937-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09e30c6225e5..80b05ac0f015 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -114,12 +114,16 @@ enum rwsem_wake_type { * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - there must be someone on the queue - * - the spinlock must be held by the caller + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false + * - writers are only marked woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +__rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. */ - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); + } goto out; } @@ -196,7 +203,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) */ smp_mb(); waiter->task = NULL; - wake_up_process(tsk); + wake_q_add(wake_q, tsk); put_task_struct(tsk); } while (--loop); @@ -216,6 +223,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; + WAKE_Q(wake_q); /* set up my own style of waitqueue */ waiter.task = tsk; @@ -238,9 +246,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); /* wait to be given the lock */ while (true) { @@ -440,6 +449,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; + WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -472,8 +482,19 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * no active writers, the lock must be read owned; so we try to * wake any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + if (count > RWSEM_WAITING_BIAS) { + WAKE_Q(wake_q); + + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + } } else count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); @@ -509,8 +530,9 @@ out_nolock: if (list_empty(&sem->wait_list)) rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); else - __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); return ERR_PTR(-EINTR); } @@ -537,6 +559,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. @@ -573,9 +596,10 @@ locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } @@ -590,14 +614,16 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } -- cgit v1.3-8-gc7d7 From e38513905eeaae59056eac2c9ac55a43b1fc41b2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 13 May 2016 11:56:27 -0700 Subject: locking/rwsem: Rework zeroing reader waiter->task Readers that are awoken will expect a nil ->task indicating that a wakeup has occurred. Because of the way readers are implemented, there's a small chance that the waiter will never block in the slowpath (rwsem_down_read_failed), and therefore requires some form of reference counting to avoid the following scenario: rwsem_down_read_failed() rwsem_wake() get_task_struct(); spin_lock_irq(&wait_lock); list_add_tail(&waiter.list) spin_unlock_irq(&wait_lock); raw_spin_lock_irqsave(&wait_lock) __rwsem_do_wake() while (1) { set_task_state(TASK_UNINTERRUPTIBLE); waiter->task = NULL if (!waiter.task) // true break; schedule() // never reached __set_task_state(TASK_RUNNING); do_exit(); wake_up_process(tsk); // boom ... and therefore race with do_exit() when the caller returns. There is also a mismatch between the smp_mb() and its documentation, in that the serialization is done between reading the task and the nil store. Furthermore, in addition to having the overlapping of loads and stores to waiter->task guaranteed to be ordered within that CPU, both wake_up_process() originally and now wake_q_add() already imply barriers upon successful calls, which serves the comment. Now, as an alternative to perhaps inverting the checks in the blocker side (which has its own penalty in that schedule is unavoidable), with lockless wakeups this situation is naturally addressed and we can just use the refcount held by wake_q_add(), instead doing so explicitly. Of course, we must guarantee that the nil store is done as the _last_ operation in that the task must already be marked for deletion to not fall into the race above. Spurious wakeups are also handled transparently in that the task's reference is only removed when wake_up_q() is actually called _after_ the nil store. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Cc: dave@stgolabs.net Cc: jason.low2@hp.com Cc: peter@hurleysoftware.com Link: http://lkml.kernel.org/r/1463165787-25937-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 80b05ac0f015..fcbf75ac3dcb 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -194,17 +194,15 @@ __rwsem_mark_wake(struct rw_semaphore *sem, waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + + wake_q_add(wake_q, tsk); /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). + * Ensure that the last operation is setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. */ - smp_mb(); - waiter->task = NULL; - wake_q_add(wake_q, tsk); - put_task_struct(tsk); + smp_store_release(&waiter->task, NULL); } while (--loop); sem->wait_list.next = next; @@ -228,7 +226,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) -- cgit v1.3-8-gc7d7 From c0fcb6c2d332041256dc55d8a1ec3c0a2d0befb8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 16 May 2016 17:38:00 -0700 Subject: locking/rwsem: Optimize write lock by reducing operations in slowpath When acquiring the rwsem write lock in the slowpath, we first try to set count to RWSEM_WAITING_BIAS. When that is successful, we then atomically add the RWSEM_WAITING_BIAS in cases where there are other tasks on the wait list. This causes write lock operations to often issue multiple atomic operations. We can instead make the list_is_singular() check first, and then set the count accordingly, so that we issue at most 1 atomic operation when acquiring the write lock and reduce unnecessary cacheline contention. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Fenghua Yu Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matt Turner Cc: Paul E. McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Terry Rudd Cc: Thomas Gleixner Cc: Tim Chen Cc: Tony Luck Link: http://lkml.kernel.org/r/1463445486-16078-2-git-send-email-jason.low2@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index fcbf75ac3dcb..b957da7fcb19 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -261,17 +261,28 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } EXPORT_SYMBOL(rwsem_down_read_failed); +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { /* - * Try acquiring the write lock. Check count first in order - * to reduce unnecessary expensive cmpxchg() operations. + * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. */ - if (count == RWSEM_WAITING_BIAS && - cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + if (count != RWSEM_WAITING_BIAS) + return false; + + /* + * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there + * are other tasks on the wait list, we need to add on WAITING_BIAS. + */ + count = list_is_singular(&sem->wait_list) ? + RWSEM_ACTIVE_WRITE_BIAS : + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + + if (cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } -- cgit v1.3-8-gc7d7 From 6e2814745c67ab422b86262b05e6f23a56f28aa3 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 20 May 2016 15:19:36 -0700 Subject: locking/mutex: Set and clear owner using WRITE_ONCE() The mutex owner can get read and written to locklessly. Use WRITE_ONCE when setting and clearing the owner field in order to avoid optimizations such as store tearing. This avoids situations where the owner field gets written to with multiple stores and another thread could concurrently read and use a partially written owner value. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Davidlohr Bueso Acked-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Terry Rudd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463782776.2479.9.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.h | 4 ++-- kernel/locking/mutex.h | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..372e6530180d 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #define spin_lock_mutex(lock, flags) \ diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..12f96199441c 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -17,14 +17,20 @@ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * The mutex owner can get read and written to locklessly. + * We should use WRITE_ONCE when writing the owner value to + * avoid store tearing, otherwise, a thread could potentially + * read a partially written and incomplete owner value. + */ static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #else static inline void mutex_set_owner(struct mutex *lock) -- cgit v1.3-8-gc7d7 From 98f368e9e2630a3ce3e80fb10fb2e02038cf9578 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 2 Jun 2016 23:43:21 -0500 Subject: kernel: Add noaudit variant of ns_capable() When checking the current cred for a capability in a specific user namespace, it isn't always desirable to have the LSMs audit the check. This patch adds a noaudit variant of ns_capable() for when those situations arise. The common logic between ns_capable() and the new ns_capable_noaudit() is moved into a single, shared function to keep duplicated code to a minimum and ease maintainability. Signed-off-by: Tyler Hicks Acked-by: Serge E. Hallyn Signed-off-by: James Morris --- include/linux/capability.h | 5 +++++ kernel/capability.c | 46 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 00690ff92edf..5f3c63dde2d5 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -206,6 +206,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); +extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); #else static inline bool has_capability(struct task_struct *t, int cap) { @@ -233,6 +234,10 @@ static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } +static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) +{ + return true; +} #endif /* CONFIG_MULTIUSER */ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); diff --git a/kernel/capability.c b/kernel/capability.c index 45432b54d5c6..00411c82dac5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } +static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +{ + int capable; + + if (unlikely(!cap_valid(cap))) { + pr_crit("capable() called with invalid cap=%u\n", cap); + BUG(); + } + + capable = audit ? security_capable(current_cred(), ns, cap) : + security_capable_noaudit(current_cred(), ns, cap); + if (capable == 0) { + current->flags |= PF_SUPERPRIV; + return true; + } + return false; +} + /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in @@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap) */ bool ns_capable(struct user_namespace *ns, int cap) { - if (unlikely(!cap_valid(cap))) { - pr_crit("capable() called with invalid cap=%u\n", cap); - BUG(); - } - - if (security_capable(current_cred(), ns, cap) == 0) { - current->flags |= PF_SUPERPRIV; - return true; - } - return false; + return ns_capable_common(ns, cap, true); } EXPORT_SYMBOL(ns_capable); +/** + * ns_capable_noaudit - Determine if the current task has a superior capability + * (unaudited) in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable_noaudit(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, false); +} +EXPORT_SYMBOL(ns_capable_noaudit); /** * capable - Determine if the current task has a superior capability in effect -- cgit v1.3-8-gc7d7 From 4e49ea4a3d276365bf7396c9b77b4d1d5923835a Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sun, 5 Jun 2016 14:31:41 -0500 Subject: block/fs/drivers: remove rw argument from submit_bio This has callers of submit_bio/submit_bio_wait set the bio->bi_rw instead of passing it in. This makes that use the same as generic_make_request and how we set the other bio fields. Signed-off-by: Mike Christie Fixed up fs/ext4/crypto.c Signed-off-by: Jens Axboe --- block/bio.c | 7 +++---- block/blk-core.c | 11 ++++------- block/blk-flush.c | 3 ++- block/blk-lib.c | 20 +++++++++++--------- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 4 ++-- drivers/block/floppy.c | 3 ++- drivers/block/xen-blkback/blkback.c | 4 +++- drivers/block/xen-blkfront.c | 4 ++-- drivers/md/bcache/debug.c | 6 ++++-- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/super.c | 4 ++-- drivers/md/dm-bufio.c | 3 ++- drivers/md/dm-io.c | 3 ++- drivers/md/dm-log-writes.c | 9 ++++++--- drivers/md/dm-thin.c | 3 ++- drivers/md/md.c | 10 +++++++--- drivers/md/raid1.c | 3 ++- drivers/md/raid10.c | 4 +++- drivers/md/raid5-cache.c | 7 ++++--- drivers/target/target_core_iblock.c | 24 +++++++++++++----------- fs/btrfs/check-integrity.c | 18 ++++++++++-------- fs/btrfs/check-integrity.h | 4 ++-- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/extent_io.c | 7 ++++--- fs/btrfs/raid56.c | 17 ++++++++++++----- fs/btrfs/scrub.c | 15 ++++++++++----- fs/btrfs/volumes.c | 14 +++++++------- fs/buffer.c | 3 ++- fs/crypto/crypto.c | 3 ++- fs/direct-io.c | 3 ++- fs/ext4/crypto.c | 3 ++- fs/ext4/page-io.c | 3 ++- fs/ext4/readpage.c | 9 +++++---- fs/f2fs/data.c | 4 +++- fs/f2fs/segment.c | 6 ++++-- fs/gfs2/lops.c | 3 ++- fs/gfs2/meta_io.c | 3 ++- fs/gfs2/ops_fstype.c | 3 ++- fs/hfsplus/wrapper.c | 3 ++- fs/jfs/jfs_logmgr.c | 6 ++++-- fs/jfs/jfs_metapage.c | 10 ++++++---- fs/logfs/dev_bdev.c | 15 ++++++++++----- fs/mpage.c | 3 ++- fs/nfs/blocklayout/blocklayout.c | 22 ++++++++++++---------- fs/nilfs2/segbuf.c | 3 ++- fs/ocfs2/cluster/heartbeat.c | 12 +++++++----- fs/xfs/xfs_aops.c | 15 ++++++++++----- fs/xfs/xfs_buf.c | 4 ++-- include/linux/bio.h | 2 +- include/linux/fs.h | 2 +- kernel/power/swap.c | 5 +++-- mm/page_io.c | 10 ++++++---- 53 files changed, 221 insertions(+), 148 deletions(-) (limited to 'kernel') diff --git a/block/bio.c b/block/bio.c index 0e4aa42bc30d..fc779eba0b95 100644 --- a/block/bio.c +++ b/block/bio.c @@ -854,21 +854,20 @@ static void submit_bio_wait_endio(struct bio *bio) /** * submit_bio_wait - submit a bio, and wait until it completes - * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) * @bio: The &struct bio which describes the I/O * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. */ -int submit_bio_wait(int rw, struct bio *bio) +int submit_bio_wait(struct bio *bio) { struct submit_bio_ret ret; - rw |= REQ_SYNC; init_completion(&ret.event); bio->bi_private = &ret; bio->bi_end_io = submit_bio_wait_endio; - submit_bio(rw, bio); + bio->bi_rw |= REQ_SYNC; + submit_bio(bio); wait_for_completion_io(&ret.event); return ret.error; diff --git a/block/blk-core.c b/block/blk-core.c index 2475b1c72773..e95340789592 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2094,7 +2094,6 @@ EXPORT_SYMBOL(generic_make_request); /** * submit_bio - submit a bio to the block device layer for I/O - * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) * @bio: The &struct bio which describes the I/O * * submit_bio() is very similar in purpose to generic_make_request(), and @@ -2102,10 +2101,8 @@ EXPORT_SYMBOL(generic_make_request); * interfaces; @bio must be presetup and ready for I/O. * */ -blk_qc_t submit_bio(int rw, struct bio *bio) +blk_qc_t submit_bio(struct bio *bio) { - bio->bi_rw |= rw; - /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. @@ -2113,12 +2110,12 @@ blk_qc_t submit_bio(int rw, struct bio *bio) if (bio_has_data(bio)) { unsigned int count; - if (unlikely(rw & REQ_WRITE_SAME)) + if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) count = bdev_logical_block_size(bio->bi_bdev) >> 9; else count = bio_sectors(bio); - if (rw & WRITE) { + if (bio->bi_rw & WRITE) { count_vm_events(PGPGOUT, count); } else { task_io_account_read(bio->bi_iter.bi_size); @@ -2129,7 +2126,7 @@ blk_qc_t submit_bio(int rw, struct bio *bio) char b[BDEVNAME_SIZE]; printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), - (rw & WRITE) ? "WRITE" : "READ", + (bio->bi_rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_iter.bi_sector, bdevname(bio->bi_bdev, b), count); diff --git a/block/blk-flush.c b/block/blk-flush.c index b1c91d229e5e..3af4a5ad46f5 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -485,8 +485,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_bdev = bdev; + bio->bi_rw = WRITE_FLUSH; - ret = submit_bio_wait(WRITE_FLUSH, bio); + ret = submit_bio_wait(bio); /* * The driver must store the error location in ->bi_sector, if diff --git a/block/blk-lib.c b/block/blk-lib.c index 23d7f301a196..1f6dec5e0975 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -9,14 +9,14 @@ #include "blk.h" -static struct bio *next_bio(struct bio *bio, int rw, unsigned int nr_pages, +static struct bio *next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) { struct bio *new = bio_alloc(gfp, nr_pages); if (bio) { bio_chain(bio, new); - submit_bio(rw, bio); + submit_bio(bio); } return new; @@ -62,9 +62,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, req_sects = end_sect - sector; } - bio = next_bio(bio, type, 1, gfp_mask); + bio = next_bio(bio, 1, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; + bio->bi_rw = type; bio->bi_iter.bi_size = req_sects << 9; nr_sects -= req_sects; @@ -110,7 +111,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, type, &bio); if (!ret && bio) { - ret = submit_bio_wait(type, bio); + ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) ret = 0; } @@ -147,13 +148,14 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, max_write_same_sectors = UINT_MAX >> 9; while (nr_sects) { - bio = next_bio(bio, REQ_WRITE | REQ_WRITE_SAME, 1, gfp_mask); + bio = next_bio(bio, 1, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio->bi_vcnt = 1; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); + bio->bi_rw = REQ_WRITE | REQ_WRITE_SAME; if (nr_sects > max_write_same_sectors) { bio->bi_iter.bi_size = max_write_same_sectors << 9; @@ -166,7 +168,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, } if (bio) - ret = submit_bio_wait(REQ_WRITE | REQ_WRITE_SAME, bio); + ret = submit_bio_wait(bio); return ret != -EOPNOTSUPP ? ret : 0; } EXPORT_SYMBOL(blkdev_issue_write_same); @@ -190,11 +192,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, unsigned int sz; while (nr_sects != 0) { - bio = next_bio(bio, WRITE, - min(nr_sects, (sector_t)BIO_MAX_PAGES), + bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES), gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; + bio->bi_rw = REQ_WRITE; while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); @@ -207,7 +209,7 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, } if (bio) - return submit_bio_wait(WRITE, bio); + return submit_bio_wait(bio); return 0; } diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 10459a145062..6069e152c32f 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -177,7 +177,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_io_error(bio); else - submit_bio(rw, bio); + submit_bio(bio); wait_until_done_or_force_detached(device, bdev, &device->md_io.done); if (!bio->bi_error) err = device->md_io.error; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 92d6fc020a65..e8959fee1fa3 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1011,12 +1011,12 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = drbd_bm_endio; + bio->bi_rw = rw; if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { - bio->bi_rw |= rw; bio_io_error(bio); } else { - submit_bio(rw, bio); + submit_bio(bio); /* this should not count as user activity and cause the * resync to throttle -- see drbd_rs_should_slow_down(). */ atomic_add(len >> 9, &device->rs_sect_ev); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 84708a5f8c52..73ded25b82c4 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3822,8 +3822,9 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) bio.bi_flags |= (1 << BIO_QUIET); bio.bi_private = &cbdata; bio.bi_end_io = floppy_rb0_cb; + bio.bi_rw = READ; - submit_bio(READ, &bio); + submit_bio(&bio); process_fd_request(); init_completion(&cbdata.complete); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4809c1501d7e..79fe4934f18b 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1369,6 +1369,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio->bi_iter.bi_sector = preq.sector_number; + bio->bi_rw = operation; } preq.sector_number += seg[i].nsec; @@ -1386,13 +1387,14 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, bio->bi_bdev = preq.bdev; bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; + bio->bi_rw = operation; } atomic_set(&pending_req->pendcnt, nbio); blk_start_plug(&plug); for (i = 0; i < nbio; i++) - submit_bio(operation, biolist[i]); + submit_bio(biolist[i]); /* Let the I/Os go.. */ blk_finish_plug(&plug); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ca13df854639..52963a26660a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2114,7 +2114,7 @@ static int blkif_recover(struct blkfront_info *info) bio_trim(cloned_bio, offset, size); cloned_bio->bi_private = split_bio; cloned_bio->bi_end_io = split_bio_end; - submit_bio(cloned_bio->bi_rw, cloned_bio); + submit_bio(cloned_bio); } /* * Now we have to wait for all those smaller bios to @@ -2123,7 +2123,7 @@ static int blkif_recover(struct blkfront_info *info) continue; } /* We don't need to split this bio */ - submit_bio(bio->bi_rw, bio); + submit_bio(bio); } return 0; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8b1f1d5c1819..52b6bcf6b6c8 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -52,9 +52,10 @@ void bch_btree_verify(struct btree *b) bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; + bio->bi_rw = REQ_META|READ_SYNC; bch_bio_map(bio, sorted); - submit_bio_wait(REQ_META|READ_SYNC, bio); + submit_bio_wait(bio); bch_bbio_free(bio, b->c); memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); @@ -113,11 +114,12 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_clone(bio, GFP_NOIO); if (!check) return; + check->bi_rw |= READ_SYNC; if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; - submit_bio_wait(READ_SYNC, check); + submit_bio_wait(check); bio_for_each_segment(bv, bio, iter) { void *p1 = kmap_atomic(bv.bv_page); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 29eba7219b01..af3f9f7f20e2 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -418,7 +418,7 @@ static void journal_discard_work(struct work_struct *work) struct journal_device *ja = container_of(work, struct journal_device, discard_work); - submit_bio(0, &ja->discard_bio); + submit_bio(&ja->discard_bio); } static void do_journal_discard(struct cache *ca) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f5dbb4e884d8..1eb526a7b8b3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -212,7 +212,7 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) unsigned i; bio->bi_iter.bi_sector = SB_SECTOR; - bio->bi_rw = REQ_SYNC|REQ_META; + bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META; bio->bi_iter.bi_size = SB_SIZE; bch_bio_map(bio, NULL); @@ -238,7 +238,7 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) pr_debug("ver %llu, flags %llu, seq %llu", sb->version, sb->flags, sb->seq); - submit_bio(REQ_WRITE, bio); + submit_bio(bio); } static void bch_write_bdev_super_unlock(struct closure *cl) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cd77216beff1..9d3ee7f6c6c7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -634,6 +634,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, * the dm_buffer's inline bio is local to bufio. */ b->bio.bi_private = end_io; + b->bio.bi_rw = rw; /* * We assume that if len >= PAGE_SIZE ptr is page-aligned. @@ -660,7 +661,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, ptr += PAGE_SIZE; } while (len > 0); - submit_bio(rw, &b->bio); + submit_bio(&b->bio); } static void submit_io(struct dm_buffer *b, int rw, sector_t block, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 06d426eb5a30..50f17e32951a 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -322,6 +322,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, bio->bi_iter.bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; + bio->bi_rw = rw; store_io_and_region_in_bio(bio, io, region); if (rw & REQ_DISCARD) { @@ -355,7 +356,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, } atomic_inc(&io->count); - submit_bio(rw, bio); + submit_bio(bio); } while (remaining); } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 608302e222af..addcc4be00b6 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -205,6 +205,7 @@ static int write_metadata(struct log_writes_c *lc, void *entry, bio->bi_bdev = lc->logdev->bdev; bio->bi_end_io = log_end_io; bio->bi_private = lc; + bio->bi_rw = WRITE; page = alloc_page(GFP_KERNEL); if (!page) { @@ -226,7 +227,7 @@ static int write_metadata(struct log_writes_c *lc, void *entry, DMERR("Couldn't add page to the log block"); goto error_bio; } - submit_bio(WRITE, bio); + submit_bio(bio); return 0; error_bio: bio_put(bio); @@ -269,6 +270,7 @@ static int log_one_block(struct log_writes_c *lc, bio->bi_bdev = lc->logdev->bdev; bio->bi_end_io = log_end_io; bio->bi_private = lc; + bio->bi_rw = WRITE; for (i = 0; i < block->vec_cnt; i++) { /* @@ -279,7 +281,7 @@ static int log_one_block(struct log_writes_c *lc, block->vecs[i].bv_len, 0); if (ret != block->vecs[i].bv_len) { atomic_inc(&lc->io_blocks); - submit_bio(WRITE, bio); + submit_bio(bio); bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i); if (!bio) { DMERR("Couldn't alloc log bio"); @@ -290,6 +292,7 @@ static int log_one_block(struct log_writes_c *lc, bio->bi_bdev = lc->logdev->bdev; bio->bi_end_io = log_end_io; bio->bi_private = lc; + bio->bi_rw = WRITE; ret = bio_add_page(bio, block->vecs[i].bv_page, block->vecs[i].bv_len, 0); @@ -301,7 +304,7 @@ static int log_one_block(struct log_writes_c *lc, } sector += block->vecs[i].bv_len >> SECTOR_SHIFT; } - submit_bio(WRITE, bio); + submit_bio(bio); out: kfree(block->data); kfree(block); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fc803d50f9f0..8c070eed982c 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -371,7 +371,8 @@ static void end_discard(struct discard_op *op, int r) * need to wait for the chain to complete. */ bio_chain(op->bio, op->parent_bio); - submit_bio(REQ_WRITE | REQ_DISCARD, op->bio); + op->bio->bi_rw = REQ_WRITE | REQ_DISCARD; + submit_bio(op->bio); } blk_finish_plug(&op->plug); diff --git a/drivers/md/md.c b/drivers/md/md.c index 866825f10b4c..fb3950be3070 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -394,8 +394,9 @@ static void submit_flushes(struct work_struct *ws) bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; + bi->bi_rw = WRITE_FLUSH; atomic_inc(&mddev->flush_pending); - submit_bio(WRITE_FLUSH, bi); + submit_bio(bi); rcu_read_lock(); rdev_dec_pending(rdev, mddev); } @@ -742,9 +743,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; + bio->bi_rw = WRITE_FLUSH_FUA; atomic_inc(&mddev->pending_writes); - submit_bio(WRITE_FLUSH_FUA, bio); + submit_bio(bio); } void md_super_wait(struct mddev *mddev) @@ -761,6 +763,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + bio->bi_rw = rw; if (metadata_op) bio->bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && @@ -770,7 +773,8 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, else bio->bi_iter.bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - submit_bio_wait(rw, bio); + + submit_bio_wait(bio); ret = !bio->bi_error; bio_put(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c7c8cde0ab21..bc95d4dd600b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2208,7 +2208,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_iter.bi_sector += rdev->data_offset; wbio->bi_bdev = rdev->bdev; - if (submit_bio_wait(WRITE, wbio) < 0) + + if (submit_bio_wait(wbio) < 0) /* failure! */ ok = rdev_set_badblocks(rdev, sector, sectors, 0) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c7de2a53e625..0be6497d8e34 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2474,7 +2474,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) choose_data_offset(r10_bio, rdev) + (sector - r10_bio->sector)); wbio->bi_bdev = rdev->bdev; - if (submit_bio_wait(WRITE, wbio) < 0) + wbio->bi_rw = WRITE; + + if (submit_bio_wait(wbio) < 0) /* Failure! */ ok = rdev_set_badblocks(rdev, sector, sectors, 0) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index e889e2deb7b3..0b014535f5dd 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -254,7 +254,7 @@ static void r5l_submit_current_io(struct r5l_log *log) __r5l_set_io_unit_state(io, IO_UNIT_IO_START); spin_unlock_irqrestore(&log->io_list_lock, flags); - submit_bio(WRITE, io->current_bio); + submit_bio(io->current_bio); } static struct bio *r5l_bio_alloc(struct r5l_log *log) @@ -373,7 +373,7 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page) io->current_bio = r5l_bio_alloc(log); bio_chain(io->current_bio, prev); - submit_bio(WRITE, prev); + submit_bio(prev); } if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) @@ -686,7 +686,8 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) bio_reset(&log->flush_bio); log->flush_bio.bi_bdev = log->rdev->bdev; log->flush_bio.bi_end_io = r5l_log_flush_endio; - submit_bio(WRITE_FLUSH, &log->flush_bio); + log->flush_bio.bi_rw = WRITE_FLUSH; + submit_bio(&log->flush_bio); } static void r5l_write_super(struct r5l_log *log, sector_t cp); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 7c4efb4417b0..c25109cc0329 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -312,7 +312,7 @@ static void iblock_bio_done(struct bio *bio) } static struct bio * -iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num) +iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int rw) { struct iblock_dev *ib_dev = IBLOCK_DEV(cmd->se_dev); struct bio *bio; @@ -334,18 +334,19 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num) bio->bi_private = cmd; bio->bi_end_io = &iblock_bio_done; bio->bi_iter.bi_sector = lba; + bio->bi_rw = rw; return bio; } -static void iblock_submit_bios(struct bio_list *list, int rw) +static void iblock_submit_bios(struct bio_list *list) { struct blk_plug plug; struct bio *bio; blk_start_plug(&plug); while ((bio = bio_list_pop(list))) - submit_bio(rw, bio); + submit_bio(bio); blk_finish_plug(&plug); } @@ -387,9 +388,10 @@ iblock_execute_sync_cache(struct se_cmd *cmd) bio = bio_alloc(GFP_KERNEL, 0); bio->bi_end_io = iblock_end_io_flush; bio->bi_bdev = ib_dev->ibd_bd; + bio->bi_rw = WRITE_FLUSH; if (!immed) bio->bi_private = cmd; - submit_bio(WRITE_FLUSH, bio); + submit_bio(bio); return 0; } @@ -478,7 +480,7 @@ iblock_execute_write_same(struct se_cmd *cmd) goto fail; cmd->priv = ibr; - bio = iblock_get_bio(cmd, block_lba, 1); + bio = iblock_get_bio(cmd, block_lba, 1, WRITE); if (!bio) goto fail_free_ibr; @@ -491,7 +493,7 @@ iblock_execute_write_same(struct se_cmd *cmd) while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) != sg->length) { - bio = iblock_get_bio(cmd, block_lba, 1); + bio = iblock_get_bio(cmd, block_lba, 1, WRITE); if (!bio) goto fail_put_bios; @@ -504,7 +506,7 @@ iblock_execute_write_same(struct se_cmd *cmd) sectors -= 1; } - iblock_submit_bios(&list, WRITE); + iblock_submit_bios(&list); return 0; fail_put_bios: @@ -712,7 +714,7 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, return 0; } - bio = iblock_get_bio(cmd, block_lba, sgl_nents); + bio = iblock_get_bio(cmd, block_lba, sgl_nents, rw); if (!bio) goto fail_free_ibr; @@ -732,11 +734,11 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) != sg->length) { if (bio_cnt >= IBLOCK_MAX_BIO_PER_TASK) { - iblock_submit_bios(&list, rw); + iblock_submit_bios(&list); bio_cnt = 0; } - bio = iblock_get_bio(cmd, block_lba, sg_num); + bio = iblock_get_bio(cmd, block_lba, sg_num, rw); if (!bio) goto fail_put_bios; @@ -756,7 +758,7 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, goto fail_put_bios; } - iblock_submit_bios(&list, rw); + iblock_submit_bios(&list); iblock_complete_cmd(cmd); return 0; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b677a6ea6001..50f81916663b 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1673,6 +1673,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, } bio->bi_bdev = block_ctx->dev->bdev; bio->bi_iter.bi_sector = dev_bytenr >> 9; + bio->bi_rw = READ; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -1685,7 +1686,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, "btrfsic: error, failed to add a single page!\n"); return -1; } - if (submit_bio_wait(READ, bio)) { + if (submit_bio_wait(bio)) { printk(KERN_INFO "btrfsic: read error at logical %llu dev %s!\n", block_ctx->start, block_ctx->dev->name); @@ -2918,9 +2919,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) return submit_bh(rw, bh); } -static void __btrfsic_submit_bio(int rw, struct bio *bio) +static void __btrfsic_submit_bio(struct bio *bio) { struct btrfsic_dev_state *dev_state; + int rw = bio->bi_rw; if (!btrfsic_is_initialized) return; @@ -3016,16 +3018,16 @@ leave: mutex_unlock(&btrfsic_mutex); } -void btrfsic_submit_bio(int rw, struct bio *bio) +void btrfsic_submit_bio(struct bio *bio) { - __btrfsic_submit_bio(rw, bio); - submit_bio(rw, bio); + __btrfsic_submit_bio(bio); + submit_bio(bio); } -int btrfsic_submit_bio_wait(int rw, struct bio *bio) +int btrfsic_submit_bio_wait(struct bio *bio) { - __btrfsic_submit_bio(rw, bio); - return submit_bio_wait(rw, bio); + __btrfsic_submit_bio(bio); + return submit_bio_wait(bio); } int btrfsic_mount(struct btrfs_root *root, diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 13b8566c97ab..c04e249058c3 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -21,8 +21,8 @@ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY int btrfsic_submit_bh(int rw, struct buffer_head *bh); -void btrfsic_submit_bio(int rw, struct bio *bio); -int btrfsic_submit_bio_wait(int rw, struct bio *bio); +void btrfsic_submit_bio(struct bio *bio); +int btrfsic_submit_bio_wait(struct bio *bio); #else #define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6628fca9f4ed..3666558a2375 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3486,12 +3486,13 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; + bio->bi_rw = WRITE_FLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(bio); return 0; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6e953de83f08..40e8dcc319d4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2049,9 +2049,10 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; + bio->bi_rw = WRITE_SYNC; bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { + if (btrfsic_submit_bio_wait(bio)) { /* try to remap that extent elsewhere? */ btrfs_bio_counter_dec(fs_info); bio_put(bio); @@ -2735,14 +2736,14 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; - + bio->bi_rw = rw; bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return ret; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f8b6d411a034..36d5bf7bbb59 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1320,7 +1320,9 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio->bi_rw = WRITE; + + submit_bio(bio); } return; @@ -1573,11 +1575,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; + bio->bi_rw = READ; btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return 0; @@ -2097,11 +2100,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; + bio->bi_rw = READ; btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } out: return 0; @@ -2433,7 +2437,9 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio->bi_rw = WRITE; + + submit_bio(bio); } return; @@ -2610,11 +2616,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; + bio->bi_rw = READ; btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 70427ef66b04..9899f3e44a56 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1504,8 +1504,9 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, sblock->no_io_error_seen = 0; } else { bio->bi_iter.bi_sector = page->physical >> 9; + bio->bi_rw = READ; - if (btrfsic_submit_bio_wait(READ, bio)) + if (btrfsic_submit_bio_wait(bio)) sblock->no_io_error_seen = 0; } @@ -1583,6 +1584,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; + bio->bi_rw = WRITE; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1590,7 +1592,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - if (btrfsic_submit_bio_wait(WRITE, bio)) { + if (btrfsic_submit_bio_wait(bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -1684,6 +1686,7 @@ again: bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio->bi_rw = WRITE; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1731,7 +1734,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(WRITE, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static void scrub_wr_bio_end_io(struct bio *bio) @@ -2041,7 +2044,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(READ, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, @@ -2088,6 +2091,7 @@ again: bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio->bi_rw = READ; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -4436,6 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; + bio->bi_rw = WRITE_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: @@ -4444,7 +4449,7 @@ leave_with_eio: return -EIO; } - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) + if (btrfsic_submit_bio_wait(bio)) goto leave_with_eio; bio_put(bio); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index da9e0036a864..037f3ab841d8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -462,7 +462,7 @@ loop_lock: sync_pending = 0; } - btrfsic_submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur); num_run++; batch_run++; @@ -5996,7 +5996,7 @@ static void btrfs_end_bio(struct bio *bio) */ static noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_device *device, - int rw, struct bio *bio) + struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; @@ -6007,9 +6007,9 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, } /* don't bother with additional async steps for reads, right now */ - if (!(rw & REQ_WRITE)) { + if (!(bio->bi_rw & REQ_WRITE)) { bio_get(bio); - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return; } @@ -6023,7 +6023,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, atomic_inc(&root->fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; - bio->bi_rw |= rw; spin_lock(&device->io_lock); if (bio->bi_rw & REQ_SYNC) @@ -6057,6 +6056,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; + bio->bi_rw |= rw; #ifdef DEBUG { struct rcu_string *name; @@ -6075,9 +6075,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfs_bio_counter_inc_noblocked(root->fs_info); if (async) - btrfs_schedule_bio(root, dev, rw, bio); + btrfs_schedule_bio(root, dev, bio); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); } static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) diff --git a/fs/buffer.c b/fs/buffer.c index 754813a6962b..9a55e7f8b25c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3030,8 +3030,9 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, rw |= REQ_META; if (buffer_prio(bh)) rw |= REQ_PRIO; + bio->bi_rw = rw; - submit_bio(rw, bio); + submit_bio(bio); return 0; } diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2fc8c43ce531..5b758566199e 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -318,6 +318,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, bio->bi_bdev = inode->i_sb->s_bdev; bio->bi_iter.bi_sector = pblk << (inode->i_sb->s_blocksize_bits - 9); + bio->bi_rw = WRITE; ret = bio_add_page(bio, ciphertext_page, inode->i_sb->s_blocksize, 0); if (ret != inode->i_sb->s_blocksize) { @@ -327,7 +328,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, err = -EIO; goto errout; } - err = submit_bio_wait(WRITE, bio); + err = submit_bio_wait(bio); if ((err == 0) && bio->bi_error) err = -EIO; bio_put(bio); diff --git a/fs/direct-io.c b/fs/direct-io.c index f3b4408be590..1bcdd5dde00d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -375,6 +375,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_bdev = bdev; bio->bi_iter.bi_sector = first_sector; + bio->bi_rw = dio->rw; if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else @@ -412,7 +413,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) sdio->logical_offset_in_bio); dio->bio_cookie = BLK_QC_T_NONE; } else - dio->bio_cookie = submit_bio(dio->rw, bio); + dio->bio_cookie = submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 6a6c27373b54..811bd5de8099 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -428,6 +428,7 @@ int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, bio->bi_bdev = inode->i_sb->s_bdev; bio->bi_iter.bi_sector = pblk << (inode->i_sb->s_blocksize_bits - 9); + bio->bi_rw = WRITE; ret = bio_add_page(bio, ciphertext_page, inode->i_sb->s_blocksize, 0); if (ret != inode->i_sb->s_blocksize) { @@ -439,7 +440,7 @@ int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, err = -EIO; goto errout; } - err = submit_bio_wait(WRITE, bio); + err = submit_bio_wait(bio); if ((err == 0) && bio->bi_error) err = -EIO; bio_put(bio); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2a01df9cc1c3..a72dbcc7a43e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -342,7 +342,8 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE; - submit_bio(io_op, io->io_bio); + io->io_bio->bi_rw = io_op; + submit_bio(io->io_bio); } io->io_bio = NULL; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index dc54a4b60eba..130bd45f4a99 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -271,7 +271,7 @@ int ext4_mpage_readpages(struct address_space *mapping, */ if (bio && (last_block_in_bio != blocks[0] - 1)) { submit_and_realloc: - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } if (bio == NULL) { @@ -294,6 +294,7 @@ int ext4_mpage_readpages(struct address_space *mapping, bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; + bio->bi_rw = READ; } length = first_hole << blkbits; @@ -303,14 +304,14 @@ int ext4_mpage_readpages(struct address_space *mapping, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; goto next_page; confused: if (bio) { - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } if (!PageUptodate(page)) @@ -323,6 +324,6 @@ int ext4_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + submit_bio(bio); return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9a8bbc1fb1fa..c595c8f84c48 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -102,7 +102,8 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, { if (!is_read_io(rw)) atomic_inc(&sbi->nr_wb_bios); - submit_bio(rw, bio); + bio->bi_rw = rw; + submit_bio(bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -1080,6 +1081,7 @@ submit_and_realloc: bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; + bio->bi_rw = READ; } if (bio_add_page(bio, page, blocksize, 0) < blocksize) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2e6f537a0e7d..3fe2faba842a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -406,7 +406,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + bio->bi_rw = WRITE_FLUSH; + ret = submit_bio_wait(bio); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { @@ -438,7 +439,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) int ret; bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + bio->bi_rw = WRITE_FLUSH; + ret = submit_bio_wait(bio); bio_put(bio); return ret; } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index d5369a109781..ce282423a853 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -240,7 +240,8 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) { if (sdp->sd_log_bio) { atomic_inc(&sdp->sd_log_in_flight); - submit_bio(rw, sdp->sd_log_bio); + sdp->sd_log_bio->bi_rw = rw; + submit_bio(sdp->sd_log_bio); sdp->sd_log_bio = NULL; } } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 8eaadabbc771..66670e14f654 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -230,7 +230,8 @@ static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); } bio->bi_end_io = gfs2_meta_read_endio; - submit_bio(rw, bio); + bio->bi_rw = rw; + submit_bio(bio); } /** diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 45463600fb81..d9e19f61077b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -246,7 +246,8 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | REQ_META, bio); + bio->bi_rw = READ_SYNC | REQ_META; + submit_bio(bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index cc6235671437..d026bb3fb36f 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -65,6 +65,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = sb->s_bdev; + bio->bi_rw = rw; if (!(rw & WRITE) && data) *data = (u8 *)buf + offset; @@ -83,7 +84,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, buf = (u8 *)buf + len; } - ret = submit_bio_wait(rw, bio); + ret = submit_bio_wait(bio); out: bio_put(bio); return ret < 0 ? ret : 0; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 63759d723920..3ee3f32562f9 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2002,12 +2002,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; + bio->bi_rw = READ_SYNC; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { - submit_bio(READ_SYNC, bio); + submit_bio(bio); } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); @@ -2145,13 +2146,14 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; + bio->bi_rw = WRITE_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { - submit_bio(WRITE_SYNC, bio); + submit_bio(bio); INCREMENT(lmStat.submitted); } } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index b60e015cc757..97254436a4ed 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -411,7 +411,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) inc_io(page); if (!bio->bi_iter.bi_size) goto dump_bio; - submit_bio(WRITE, bio); + submit_bio(bio); nr_underway++; bio = NULL; } else @@ -434,6 +434,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; + bio->bi_rw = WRITE; /* Don't call bio_add_page yet, we may add to this vec */ bio_offset = offset; @@ -448,7 +449,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) if (!bio->bi_iter.bi_size) goto dump_bio; - submit_bio(WRITE, bio); + submit_bio(bio); nr_underway++; } if (redirty) @@ -506,7 +507,7 @@ static int metapage_readpage(struct file *fp, struct page *page) insert_metapage(page, NULL); inc_io(page); if (bio) - submit_bio(READ, bio); + submit_bio(bio); bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; @@ -514,6 +515,7 @@ static int metapage_readpage(struct file *fp, struct page *page) pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; + bio->bi_rw = READ; len = xlen << inode->i_blkbits; offset = block_offset << inode->i_blkbits; if (bio_add_page(bio, page, len, offset) < len) @@ -523,7 +525,7 @@ static int metapage_readpage(struct file *fp, struct page *page) block_offset++; } if (bio) - submit_bio(READ, bio); + submit_bio(bio); else unlock_page(page); diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index cc26f8f215f5..29704bda6394 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -29,8 +29,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio.bi_bdev = bdev; bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); bio.bi_iter.bi_size = PAGE_SIZE; + bio.bi_rw = rw; - return submit_bio_wait(rw, &bio); + return submit_bio_wait(&bio); } static int bdev_readpage(void *_sb, struct page *page) @@ -95,8 +96,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; + bio->bi_rw = WRITE; atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); ofs += i * PAGE_SIZE; index += i; @@ -122,8 +124,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; + bio->bi_rw = WRITE; atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); return 0; } @@ -185,8 +188,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; + bio->bi_rw = WRITE; atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); ofs += i * PAGE_SIZE; index += i; @@ -206,8 +210,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; + bio->bi_rw = WRITE; atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index eedc644b78d7..2c251ecfeeea 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -59,8 +59,9 @@ static void mpage_end_io(struct bio *bio) static struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io; + bio->bi_rw = rw; guard_bio_eod(rw, bio); - submit_bio(rw, bio); + submit_bio(bio); return NULL; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 17a42e4eb872..4c79f4ddb052 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -102,14 +102,15 @@ static inline void put_parallel(struct parallel_io *p) } static struct bio * -bl_submit_bio(int rw, struct bio *bio) +bl_submit_bio(struct bio *bio) { if (bio) { get_parallel(bio->bi_private); dprintk("%s submitting %s bio %u@%llu\n", __func__, - rw == READ ? "read" : "write", bio->bi_iter.bi_size, + bio->bi_rw == READ ? "read" : "write", + bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector); - submit_bio(rw, bio); + submit_bio(bio); } return NULL; } @@ -158,7 +159,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, if (disk_addr < map->start || disk_addr >= map->start + map->len) { if (!dev->map(dev, disk_addr, map)) return ERR_PTR(-EIO); - bio = bl_submit_bio(rw, bio); + bio = bl_submit_bio(bio); } disk_addr += map->disk_offset; disk_addr -= map->start; @@ -174,9 +175,10 @@ retry: disk_addr >> SECTOR_SHIFT, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); + bio->bi_rw = rw; } if (bio_add_page(bio, page, *len, offset) < *len) { - bio = bl_submit_bio(rw, bio); + bio = bl_submit_bio(bio); goto retry; } return bio; @@ -252,7 +254,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) for (i = pg_index; i < header->page_array.npages; i++) { if (extent_length <= 0) { /* We've used up the previous extent */ - bio = bl_submit_bio(READ, bio); + bio = bl_submit_bio(bio); /* Get the next one */ if (!ext_tree_lookup(bl, isect, &be, false)) { @@ -273,7 +275,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) } if (is_hole(&be)) { - bio = bl_submit_bio(READ, bio); + bio = bl_submit_bio(bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); zero_user_segment(pages[i], pg_offset, pg_len); @@ -306,7 +308,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) header->res.count = (isect << SECTOR_SHIFT) - header->args.offset; } out: - bl_submit_bio(READ, bio); + bl_submit_bio(bio); blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; @@ -398,7 +400,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) for (i = pg_index; i < header->page_array.npages; i++) { if (extent_length <= 0) { /* We've used up the previous extent */ - bio = bl_submit_bio(WRITE, bio); + bio = bl_submit_bio(bio); /* Get the next one */ if (!ext_tree_lookup(bl, isect, &be, true)) { header->pnfs_error = -EINVAL; @@ -427,7 +429,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) header->res.count = header->args.count; out: - bl_submit_bio(WRITE, bio); + bl_submit_bio(bio); blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index bf36df10540b..0f62909557e7 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -364,7 +364,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - submit_bio(mode, bio); + bio->bi_rw = mode; + submit_bio(bio); segbuf->sb_nbio++; wi->bio = NULL; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6aaf3e351391..8b1d86ebd8d1 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -530,7 +530,7 @@ static void o2hb_bio_end_io(struct bio *bio) static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, unsigned int *current_slot, - unsigned int max_slots) + unsigned int max_slots, int rw) { int len, current_page; unsigned int vec_len, vec_start; @@ -556,6 +556,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; + bio->bi_rw = rw; vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { @@ -591,7 +592,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, o2hb_bio_wait_init(&wc); while(current_slot < max_slots) { - bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots, + READ); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -599,7 +601,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, } atomic_inc(&wc.wc_num_reqs); - submit_bio(READ, bio); + submit_bio(bio); } status = 0; @@ -623,7 +625,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, WRITE_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -631,7 +633,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, } atomic_inc(&write_wc->wc_num_reqs); - submit_bio(WRITE_SYNC, bio); + submit_bio(bio); status = 0; bail: diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4c463b99fe57..0cd1603856b4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -438,7 +438,10 @@ xfs_submit_ioend( ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; - + if (wbc->sync_mode == WB_SYNC_ALL) + ioend->io_bio->bi_rw = WRITE_SYNC; + else + ioend->io_bio->bi_rw = WRITE; /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -451,8 +454,7 @@ xfs_submit_ioend( return status; } - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, - ioend->io_bio); + submit_bio(ioend->io_bio); return 0; } @@ -510,8 +512,11 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, - ioend->io_bio); + if (wbc->sync_mode == WB_SYNC_ALL) + ioend->io_bio->bi_rw = WRITE_SYNC; + else + ioend->io_bio->bi_rw = WRITE; + submit_bio(ioend->io_bio); ioend->io_bio = new; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e71cfbd5acb3..0777c67f169b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1166,7 +1166,7 @@ next_chunk: bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - + bio->bi_rw = rw; for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1190,7 +1190,7 @@ next_chunk: flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); } - submit_bio(rw, bio); + submit_bio(bio); if (size) goto next_chunk; } else { diff --git a/include/linux/bio.h b/include/linux/bio.h index 9faebf7f9a33..3bde94234eea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -473,7 +473,7 @@ static inline void bio_io_error(struct bio *bio) struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); -extern int submit_bio_wait(int rw, struct bio *bio); +extern int submit_bio_wait(struct bio *bio); extern void bio_advance(struct bio *, unsigned); extern void bio_init(struct bio *); diff --git a/include/linux/fs.h b/include/linux/fs.h index dd288148a6b1..65e4c51ecb3d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2747,7 +2747,7 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK -extern blk_qc_t submit_bio(int, struct bio *); +extern blk_qc_t submit_bio(struct bio *); extern int bdev_read_only(struct block_device *); #endif extern int set_blocksize(struct block_device *, int); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 160e1006640d..be227f5aa9dc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; + bio->bi_rw = rw; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", @@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, bio->bi_end_io = hib_end_io; bio->bi_private = hb; atomic_inc(&hb->count); - submit_bio(rw, bio); + submit_bio(bio); } else { - error = submit_bio_wait(rw, bio); + error = submit_bio_wait(bio); bio_put(bio); } diff --git a/mm/page_io.c b/mm/page_io.c index 242dba07545b..5a5fd66d7bd5 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -259,7 +259,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { struct bio *bio; - int ret, rw = WRITE; + int ret; struct swap_info_struct *sis = page_swap_info(page); if (sis->flags & SWP_FILE) { @@ -317,12 +317,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = -ENOMEM; goto out; } + bio->bi_rw = WRITE; if (wbc->sync_mode == WB_SYNC_ALL) - rw |= REQ_SYNC; + bio->bi_rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); - submit_bio(rw, bio); + submit_bio(bio); out: return ret; } @@ -369,8 +370,9 @@ int swap_readpage(struct page *page) ret = -ENOMEM; goto out; } + bio->bi_rw = READ; count_vm_event(PSWPIN); - submit_bio(READ, bio); + submit_bio(bio); out: return ret; } -- cgit v1.3-8-gc7d7 From 162b99e3119767cb6478c55a5aed70469389df88 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sun, 5 Jun 2016 14:32:02 -0500 Subject: pm: use bio op accessors Separate the op from the rq_flag_bits and have the pm code set/get the bio using bio_set_op_attrs/bio_op. Signed-off-by: Mike Christie Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- kernel/power/swap.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index be227f5aa9dc..c1aaac431055 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(int rw, pgoff_t page_off, void *addr, +static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct hib_bio_batch *hb) { struct page *page = virt_to_page(addr); @@ -271,7 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; - bio->bi_rw = rw; + bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", @@ -307,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -316,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); error = -ENODEV; @@ -390,7 +391,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(WRITE_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -993,7 +994,8 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, + tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -1017,7 +1019,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(READ_SYNC, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1526,7 +1528,8 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(READ_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_READ, READ_SYNC, + swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -1534,7 +1537,8 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -1578,10 +1582,12 @@ int swsusp_unmark(void) { int error; - hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); -- cgit v1.3-8-gc7d7 From 1b9a9ab78b0ab79dc1f0ddd5fbed7833ec7de3a4 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sun, 5 Jun 2016 14:32:18 -0500 Subject: blktrace: use op accessors Have blktrace use the req/bio op accessor to get the REQ_OP. Signed-off-by: Mike Christie Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 2 +- include/trace/events/bcache.h | 12 ++++++--- include/trace/events/block.h | 31 ++++++++++++++-------- kernel/trace/blktrace.c | 62 +++++++++++++++++++++++++------------------ 4 files changed, 65 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 0f3172b8b225..cceb72f9e29f 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -118,7 +118,7 @@ static inline int blk_cmd_buf_len(struct request *rq) } extern void blk_dump_cmd(char *buf, struct request *rq); -extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes); +extern void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes); #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 981acf74b14f..65673d8b81ac 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -27,7 +27,8 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->sector = bio->bi_iter.bi_sector; __entry->orig_sector = bio->bi_iter.bi_sector - 16; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", @@ -101,7 +102,8 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u", @@ -136,7 +138,8 @@ TRACE_EVENT(bcache_read, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); __entry->cache_hit = hit; __entry->bypass = bypass; ), @@ -167,7 +170,8 @@ TRACE_EVENT(bcache_write, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); __entry->writeback = writeback; __entry->bypass = bypass; ), diff --git a/include/trace/events/block.h b/include/trace/events/block.h index e8a5eca1dbe5..5a2a7592068f 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -84,7 +84,8 @@ DECLARE_EVENT_CLASS(block_rq_with_error, 0 : blk_rq_sectors(rq); __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, + blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); ), @@ -162,7 +163,7 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector = nr_bytes >> 9; __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); + blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, nr_bytes); blk_dump_cmd(__get_str(cmd), rq); ), @@ -198,7 +199,8 @@ DECLARE_EVENT_CLASS(block_rq, __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? blk_rq_bytes(rq) : 0; - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, + blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -272,7 +274,8 @@ TRACE_EVENT(block_bio_bounce, bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -310,7 +313,8 @@ TRACE_EVENT(block_bio_complete, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u [%d]", @@ -337,7 +341,8 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -404,7 +409,8 @@ TRACE_EVENT(block_bio_queue, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -432,7 +438,7 @@ DECLARE_EVENT_CLASS(block_get_rq, __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? bio_sectors(bio) : 0; - blk_fill_rwbs(__entry->rwbs, + blk_fill_rwbs(__entry->rwbs, bio ? bio_op(bio) : 0, bio ? bio->bi_rw : 0, __entry->nr_sector); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -567,7 +573,8 @@ TRACE_EVENT(block_split, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -610,7 +617,8 @@ TRACE_EVENT(block_bio_remap, __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", @@ -656,7 +664,8 @@ TRACE_EVENT(block_rq_remap, __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, + blk_rq_bytes(rq)); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9aef8654e90d..2d16fad519b2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -199,7 +199,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) + int op, int op_flags, u32 what, int error, int pdu_len, + void *pdu_data) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -214,13 +215,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; - what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, SYNC); - what |= MASK_TC_BIT(rw, RAHEAD); - what |= MASK_TC_BIT(rw, META); - what |= MASK_TC_BIT(rw, DISCARD); - what |= MASK_TC_BIT(rw, FLUSH); - what |= MASK_TC_BIT(rw, FUA); + what |= ddir_act[op_is_write(op) ? WRITE : READ]; + what |= MASK_TC_BIT(op_flags, SYNC); + what |= MASK_TC_BIT(op_flags, RAHEAD); + what |= MASK_TC_BIT(op_flags, META); + what |= MASK_TC_BIT(op_flags, FLUSH); + what |= MASK_TC_BIT(op_flags, FUA); + if (op == REQ_OP_DISCARD) + what |= BLK_TC_ACT(BLK_TC_DISCARD); pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -708,11 +710,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, + __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -770,7 +772,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, what, error, 0, NULL); + bio_op(bio), bio->bi_rw, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -818,7 +820,8 @@ static void blk_add_trace_getrq(void *ignore, struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, + NULL); } } @@ -833,7 +836,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL); } } @@ -843,7 +846,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -860,7 +863,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -874,8 +877,9 @@ static void blk_add_trace_split(void *ignore, __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - bio->bi_error, sizeof(rpdu), &rpdu); + bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw, + BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + &rpdu); } } @@ -907,7 +911,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error, sizeof(r), &r); } @@ -940,7 +944,7 @@ static void blk_add_trace_rq_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, sizeof(r), &r); } @@ -965,10 +969,10 @@ void blk_add_driver_data(struct request_queue *q, return; if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1769,21 +1773,27 @@ void blk_dump_cmd(char *buf, struct request *rq) } } -void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) { int i = 0; if (rw & REQ_FLUSH) rwbs[i++] = 'F'; - if (rw & WRITE) + switch (op) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; - else if (rw & REQ_DISCARD) + break; + case REQ_OP_DISCARD: rwbs[i++] = 'D'; - else if (bytes) + break; + case REQ_OP_READ: rwbs[i++] = 'R'; - else + break; + default: rwbs[i++] = 'N'; + } if (rw & REQ_FUA) rwbs[i++] = 'F'; -- cgit v1.3-8-gc7d7 From 3a5e02ced11e22ecd9da3d6710afe15bcfee1d10 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sun, 5 Jun 2016 14:32:23 -0500 Subject: block, drivers: add REQ_OP_FLUSH operation This adds a REQ_OP_FLUSH operation that is sent to request_fn based drivers by the block layer's flush code, instead of sending requests with the request->cmd_flags REQ_FLUSH bit set. Signed-off-by: Mike Christie Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- Documentation/block/writeback_cache_control.txt | 6 +++--- arch/um/drivers/ubd_kern.c | 2 +- block/blk-flush.c | 4 ++-- drivers/block/loop.c | 4 ++-- drivers/block/nbd.c | 2 +- drivers/block/osdblk.c | 2 +- drivers/block/ps3disk.c | 4 ++-- drivers/block/skd_main.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 8 ++++---- drivers/ide/ide-disk.c | 2 +- drivers/md/dm.c | 2 +- drivers/mmc/card/block.c | 6 +++--- drivers/mmc/card/queue.h | 3 ++- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/scsi/sd.c | 7 +++---- include/linux/blk_types.h | 3 ++- include/linux/blkdev.h | 3 +++ kernel/trace/blktrace.c | 5 +++++ 20 files changed, 40 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt index 59e0516cbf6b..da70bdacd503 100644 --- a/Documentation/block/writeback_cache_control.txt +++ b/Documentation/block/writeback_cache_control.txt @@ -73,9 +73,9 @@ doing: blk_queue_write_cache(sdkp->disk->queue, true, false); -and handle empty REQ_FLUSH requests in its prep_fn/request_fn. Note that +and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn. Note that REQ_FLUSH requests with a payload are automatically turned into a sequence -of an empty REQ_FLUSH request followed by the actual write by the block +of an empty REQ_OP_FLUSH request followed by the actual write by the block layer. For devices that also support the FUA bit the block layer needs to be told to pass through the REQ_FUA bit using: @@ -83,4 +83,4 @@ to be told to pass through the REQ_FUA bit using: and the driver must handle write requests that have the REQ_FUA bit set in prep_fn/request_fn. If the FUA bit is not natively supported the block -layer turns it into an empty REQ_FLUSH request after the actual write. +layer turns it into an empty REQ_OP_FLUSH request after the actual write. diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 17e96dc29596..ef6b4d960bad 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1286,7 +1286,7 @@ static void do_ubd_request(struct request_queue *q) req = dev->request; - if (req->cmd_flags & REQ_FLUSH) { + if (req_op(req) == REQ_OP_FLUSH) { io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC); if (io_req == NULL) { diff --git a/block/blk-flush.c b/block/blk-flush.c index 9fd1f63a6348..21f0d5b0d2ca 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -29,7 +29,7 @@ * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a - * flush is issued and the pending_idx is toggled. When the flush + * REQ_OP_FLUSH is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of FLUSH/FUA * requests. @@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH | REQ_FLUSH_SEQ); flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b9b737cafd5f..364d491d4bdd 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -542,7 +542,7 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; if (op_is_write(req_op(rq))) { - if (rq->cmd_flags & REQ_FLUSH) + if (req_op(rq) == REQ_OP_FLUSH) ret = lo_req_flush(lo, rq); else if (req_op(rq) == REQ_OP_DISCARD) ret = lo_discard(lo, rq, pos); @@ -1659,7 +1659,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (lo->lo_state != Lo_bound) return -EIO; - if (lo->use_dio && (!(cmd->rq->cmd_flags & REQ_FLUSH) || + if (lo->use_dio && (req_op(cmd->rq) != REQ_OP_FLUSH || req_op(cmd->rq) == REQ_OP_DISCARD)) cmd->use_aio = true; else diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6c2c28d124d0..d6f3c9336f29 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -284,7 +284,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) type = NBD_CMD_DISC; else if (req_op(req) == REQ_OP_DISCARD) type = NBD_CMD_TRIM; - else if (req->cmd_flags & REQ_FLUSH) + else if (req_op(req) == REQ_OP_FLUSH) type = NBD_CMD_FLUSH; else if (rq_data_dir(req) == WRITE) type = NBD_CMD_WRITE; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index c2854a2bfdb0..92900f5f0b47 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -321,7 +321,7 @@ static void osdblk_rq_fn(struct request_queue *q) * driver-specific, etc. */ - do_flush = rq->cmd_flags & REQ_FLUSH; + do_flush = (req_op(rq) == REQ_OP_FLUSH); do_write = (rq_data_dir(rq) == WRITE); if (!do_flush) { /* osd_flush does not use a bio */ diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 4b7e405830d7..acb44529c05e 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -196,7 +196,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__); while ((req = blk_fetch_request(q))) { - if (req->cmd_flags & REQ_FLUSH) { + if (req_op(req) == REQ_OP_FLUSH) { if (ps3disk_submit_flush_request(dev, req)) break; } else if (req->cmd_type == REQ_TYPE_FS) { @@ -256,7 +256,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) return IRQ_HANDLED; } - if (req->cmd_flags & REQ_FLUSH) { + if (req_op(req) == REQ_OP_FLUSH) { read = 0; op = "flush"; } else { diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 910e065918af..5c07a23e2ada 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -597,7 +597,7 @@ static void skd_request_fn(struct request_queue *q) data_dir = rq_data_dir(req); io_flags = req->cmd_flags; - if (io_flags & REQ_FLUSH) + if (req_op(req) == REQ_OP_FLUSH) flush++; if (io_flags & REQ_FUA) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 42758b52768c..18e4069dd24b 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -172,7 +172,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); vbr->req = req; - if (req->cmd_flags & REQ_FLUSH) { + if (req_op(req) == REQ_OP_FLUSH) { vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH); vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 6fd160197b7a..3aeb25bd5057 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -743,7 +743,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE */ - BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA); ring_req->operation = BLKIF_OP_INDIRECT; ring_req->u.indirect.indirect_op = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; @@ -755,7 +755,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri ring_req->u.rw.handle = info->handle; ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + if (req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA) { /* * Ideally we can do an unordered flush-to-disk. * In case the backend onlysupports barriers, use that. @@ -865,7 +865,7 @@ static inline bool blkif_request_flush_invalid(struct request *req, struct blkfront_info *info) { return ((req->cmd_type != REQ_TYPE_FS) || - ((req->cmd_flags & REQ_FLUSH) && + ((req_op(req) == REQ_OP_FLUSH) && !(info->feature_flush & REQ_FLUSH)) || ((req->cmd_flags & REQ_FUA) && !(info->feature_flush & REQ_FUA))); @@ -2055,7 +2055,7 @@ static int blkif_recover(struct blkfront_info *info) /* * Get the bios in the request so we can re-queue them. */ - if (copy[i].request->cmd_flags & REQ_FLUSH || + if (req_op(copy[i].request) == REQ_OP_FLUSH || req_op(copy[i].request) == REQ_OP_DISCARD || copy[i].request->cmd_flags & (REQ_FUA | REQ_SECURE)) { /* diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 05dbcce70b0e..e378ef70ed63 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -431,7 +431,7 @@ static int idedisk_prep_fn(struct request_queue *q, struct request *rq) ide_drive_t *drive = q->queuedata; struct ide_cmd *cmd; - if (!(rq->cmd_flags & REQ_FLUSH)) + if (req_op(rq) != REQ_OP_FLUSH) return BLKPREP_OK; if (rq->special) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f6b104c77b6d..fcc68c8edba0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2171,7 +2171,7 @@ static void dm_request_fn(struct request_queue *q) /* always use block 0 to find the target for flushes for now */ pos = 0; - if (!(rq->cmd_flags & REQ_FLUSH)) + if (req_op(rq) != REQ_OP_FLUSH) pos = blk_rq_pos(rq); if ((dm_request_peeked_before_merge_deadline(md) && diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 201a8719f6c4..bca20f88a8b2 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1722,7 +1722,8 @@ static u8 mmc_blk_prep_packed_list(struct mmc_queue *mq, struct request *req) !IS_ALIGNED(blk_rq_sectors(next), 8)) break; - if (req_op(next) == REQ_OP_DISCARD || next->cmd_flags & REQ_FLUSH) + if (req_op(next) == REQ_OP_DISCARD || + req_op(next) == REQ_OP_FLUSH) break; if (rq_data_dir(cur) != rq_data_dir(next)) @@ -2147,7 +2148,6 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) struct mmc_card *card = md->queue.card; struct mmc_host *host = card->host; unsigned long flags; - unsigned int cmd_flags = req ? req->cmd_flags : 0; if (req && !mq->mqrq_prev->req) /* claim host only for the first request */ @@ -2171,7 +2171,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) ret = mmc_blk_issue_secdiscard_rq(mq, req); else ret = mmc_blk_issue_discard_rq(mq, req); - } else if (cmd_flags & REQ_FLUSH) { + } else if (req && req_op(req) == REQ_OP_FLUSH) { /* complete ongoing async transfer before issuing flush */ if (card->host->areq) mmc_blk_issue_rw_rq(mq, NULL); diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h index 9fb26f20a44d..d62531124d54 100644 --- a/drivers/mmc/card/queue.h +++ b/drivers/mmc/card/queue.h @@ -3,7 +3,8 @@ static inline bool mmc_req_is_special(struct request *req) { - return req && (req->cmd_flags & REQ_FLUSH || req_op(req) == REQ_OP_DISCARD); + return req && + (req_op(req) == REQ_OP_FLUSH || req_op(req) == REQ_OP_DISCARD); } struct request; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 4eb9a5fb151c..78b3eb45faf6 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -87,7 +87,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, if (req->cmd_type != REQ_TYPE_FS) return -EIO; - if (req->cmd_flags & REQ_FLUSH) + if (req_op(req) == REQ_OP_FLUSH) return tr->flush(dev); if (blk_rq_pos(req) + blk_rq_cur_sectors(req) > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 089b8b8aad4f..abdfdcfb66f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -290,7 +290,7 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, if (req->cmd_type == REQ_TYPE_DRV_PRIV) memcpy(cmd, req->cmd, sizeof(*cmd)); - else if (req->cmd_flags & REQ_FLUSH) + else if (req_op(req) == REQ_OP_FLUSH) nvme_setup_flush(ns, cmd); else if (req_op(req) == REQ_OP_DISCARD) ret = nvme_setup_discard(ns, req, cmd); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index fad86ad89e64..5a9db0fe1ee0 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1143,12 +1143,11 @@ static int sd_init_command(struct scsi_cmnd *cmd) return sd_setup_discard_cmnd(cmd); case REQ_OP_WRITE_SAME: return sd_setup_write_same_cmnd(cmd); + case REQ_OP_FLUSH: + return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: - if (rq->cmd_flags & REQ_FLUSH) - return sd_setup_flush_cmnd(cmd); - else - return sd_setup_read_write_cmnd(cmd); + return sd_setup_read_write_cmnd(cmd); default: BUG(); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 23c1ab2a9475..32d87522f349 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -249,9 +249,10 @@ enum req_op { REQ_OP_WRITE, REQ_OP_DISCARD, /* request to discard sectors */ REQ_OP_WRITE_SAME, /* write same block many times */ + REQ_OP_FLUSH, /* request for cache flush */ }; -#define REQ_OP_BITS 2 +#define REQ_OP_BITS 3 typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 78ae3dbf2de1..0c9f8793c87e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -666,6 +666,9 @@ static inline bool rq_mergeable(struct request *rq) if (rq->cmd_type != REQ_TYPE_FS) return false; + if (req_op(rq) == REQ_OP_FLUSH) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2d16fad519b2..0c70fbb6ea8d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -223,6 +223,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(op_flags, FUA); if (op == REQ_OP_DISCARD) what |= BLK_TC_ACT(BLK_TC_DISCARD); + if (op == REQ_OP_FLUSH) + what |= BLK_TC_ACT(BLK_TC_FLUSH); pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -1788,6 +1790,9 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) case REQ_OP_DISCARD: rwbs[i++] = 'D'; break; + case REQ_OP_FLUSH: + rwbs[i++] = 'F'; + break; case REQ_OP_READ: rwbs[i++] = 'R'; break; -- cgit v1.3-8-gc7d7 From 28a8f0d317bf225ff15008f5dd66ae16242dd843 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sun, 5 Jun 2016 14:32:25 -0500 Subject: block, drivers, fs: rename REQ_FLUSH to REQ_PREFLUSH To avoid confusion between REQ_OP_FLUSH, which is handled by request_fn drivers, and upper layers requesting the block layer perform a flush sequence along with possibly a WRITE, this patch renames REQ_FLUSH to REQ_PREFLUSH. Signed-off-by: Mike Christie Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- Documentation/block/writeback_cache_control.txt | 22 +++++++++++----------- Documentation/device-mapper/log-writes.txt | 10 +++++----- block/blk-core.c | 12 ++++++------ block/blk-flush.c | 16 ++++++++-------- block/blk-mq.c | 4 ++-- drivers/block/drbd/drbd_actlog.c | 4 ++-- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_protocol.h | 2 +- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_req.c | 2 +- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/request.c | 8 ++++---- drivers/md/dm-cache-target.c | 12 ++++++------ drivers/md/dm-crypt.c | 7 ++++--- drivers/md/dm-era-target.c | 4 ++-- drivers/md/dm-io.c | 2 +- drivers/md/dm-log-writes.c | 2 +- drivers/md/dm-raid1.c | 5 +++-- drivers/md/dm-region-hash.c | 4 ++-- drivers/md/dm-snap.c | 6 +++--- drivers/md/dm-stripe.c | 2 +- drivers/md/dm-thin.c | 8 ++++---- drivers/md/dm.c | 12 ++++++------ drivers/md/linear.c | 2 +- drivers/md/md.c | 2 +- drivers/md/md.h | 2 +- drivers/md/multipath.c | 2 +- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 3 ++- drivers/md/raid10.c | 2 +- drivers/md/raid5-cache.c | 2 +- drivers/md/raid5.c | 2 +- fs/btrfs/check-integrity.c | 8 ++++---- fs/jbd2/journal.c | 2 +- fs/xfs/xfs_buf.c | 2 +- include/linux/blk_types.h | 8 ++++---- include/linux/fs.h | 4 ++-- include/trace/events/f2fs.h | 2 +- kernel/trace/blktrace.c | 5 +++-- 39 files changed, 102 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt index da70bdacd503..8a6bdada5f6b 100644 --- a/Documentation/block/writeback_cache_control.txt +++ b/Documentation/block/writeback_cache_control.txt @@ -20,11 +20,11 @@ a forced cache flush, and the Force Unit Access (FUA) flag for requests. Explicit cache flushes ---------------------- -The REQ_FLUSH flag can be OR ed into the r/w flags of a bio submitted from +The REQ_PREFLUSH flag can be OR ed into the r/w flags of a bio submitted from the filesystem and will make sure the volatile cache of the storage device has been flushed before the actual I/O operation is started. This explicitly guarantees that previously completed write requests are on non-volatile -storage before the flagged bio starts. In addition the REQ_FLUSH flag can be +storage before the flagged bio starts. In addition the REQ_PREFLUSH flag can be set on an otherwise empty bio structure, which causes only an explicit cache flush without any dependent I/O. It is recommend to use the blkdev_issue_flush() helper for a pure cache flush. @@ -41,21 +41,21 @@ signaled after the data has been committed to non-volatile storage. Implementation details for filesystems -------------------------------------- -Filesystems can simply set the REQ_FLUSH and REQ_FUA bits and do not have to +Filesystems can simply set the REQ_PREFLUSH and REQ_FUA bits and do not have to worry if the underlying devices need any explicit cache flushing and how -the Forced Unit Access is implemented. The REQ_FLUSH and REQ_FUA flags +the Forced Unit Access is implemented. The REQ_PREFLUSH and REQ_FUA flags may both be set on a single bio. Implementation details for make_request_fn based block drivers -------------------------------------------------------------- -These drivers will always see the REQ_FLUSH and REQ_FUA bits as they sit +These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit directly below the submit_bio interface. For remapping drivers the REQ_FUA bits need to be propagated to underlying devices, and a global flush needs -to be implemented for bios with the REQ_FLUSH bit set. For real device -drivers that do not have a volatile cache the REQ_FLUSH and REQ_FUA bits -on non-empty bios can simply be ignored, and REQ_FLUSH requests without +to be implemented for bios with the REQ_PREFLUSH bit set. For real device +drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits +on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without data can be completed successfully without doing any work. Drivers for devices with volatile caches need to implement the support for these flags themselves without any help from the block layer. @@ -65,8 +65,8 @@ Implementation details for request_fn based block drivers -------------------------------------------------------------- For devices that do not support volatile write caches there is no driver -support required, the block layer completes empty REQ_FLUSH requests before -entering the driver and strips off the REQ_FLUSH and REQ_FUA bits from +support required, the block layer completes empty REQ_PREFLUSH requests before +entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from requests that have a payload. For devices with volatile write caches the driver needs to tell the block layer that it supports flushing caches by doing: @@ -74,7 +74,7 @@ doing: blk_queue_write_cache(sdkp->disk->queue, true, false); and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn. Note that -REQ_FLUSH requests with a payload are automatically turned into a sequence +REQ_PREFLUSH requests with a payload are automatically turned into a sequence of an empty REQ_OP_FLUSH request followed by the actual write by the block layer. For devices that also support the FUA bit the block layer needs to be told to pass through the REQ_FUA bit using: diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt index c10f30c9b534..f4ebcbaf50f3 100644 --- a/Documentation/device-mapper/log-writes.txt +++ b/Documentation/device-mapper/log-writes.txt @@ -14,14 +14,14 @@ Log Ordering We log things in order of completion once we are sure the write is no longer in cache. This means that normal WRITE requests are not actually logged until the -next REQ_FLUSH request. This is to make it easier for userspace to replay the -log in a way that correlates to what is on disk and not what is in cache, to -make it easier to detect improper waiting/flushing. +next REQ_PREFLUSH request. This is to make it easier for userspace to replay +the log in a way that correlates to what is on disk and not what is in cache, +to make it easier to detect improper waiting/flushing. This works by attaching all WRITE requests to a list once the write completes. -Once we see a REQ_FLUSH request we splice this list onto the request and once +Once we see a REQ_PREFLUSH request we splice this list onto the request and once the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only -completed WRITEs, at the time the REQ_FLUSH is issued, are added in order to +completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to simulate the worst case scenario with regard to power failures. Consider the following example (W means write, C means complete): diff --git a/block/blk-core.c b/block/blk-core.c index c7d66c23a708..32a283eb7274 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1029,7 +1029,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio) * Flush requests do not use the elevator so skip initialization. * This allows a request to share the flush and elevator data. */ - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) + if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) return false; return true; @@ -1736,7 +1736,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { + if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; @@ -1968,9 +1968,9 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. */ - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && + if ((bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); + bio->bi_rw &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { err = 0; goto end_io; @@ -2217,7 +2217,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA)) + if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); @@ -3311,7 +3311,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) /* * rq is already accounted, so use raw insert */ - if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) + if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); diff --git a/block/blk-flush.c b/block/blk-flush.c index 21f0d5b0d2ca..d308def812db 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -10,8 +10,8 @@ * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request * properties and hardware capability. * - * If a request doesn't have data, only REQ_FLUSH makes sense, which - * indicates a simple flush request. If there is data, REQ_FLUSH indicates + * If a request doesn't have data, only REQ_PREFLUSH makes sense, which + * indicates a simple flush request. If there is data, REQ_PREFLUSH indicates * that the device cache should be flushed before the data is executed, and * REQ_FUA means that the data must be on non-volatile media on request * completion. @@ -20,11 +20,11 @@ * difference. The requests are either completed immediately if there's no * data or executed as normal requests otherwise. * - * If the device has writeback cache and supports FUA, REQ_FLUSH is + * If the device has writeback cache and supports FUA, REQ_PREFLUSH is * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. * - * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is - * translated to PREFLUSH and REQ_FUA to POSTFLUSH. + * If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH + * is translated to PREFLUSH and REQ_FUA to POSTFLUSH. * * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at @@ -103,7 +103,7 @@ static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) policy |= REQ_FSEQ_DATA; if (fflags & (1UL << QUEUE_FLAG_WC)) { - if (rq->cmd_flags & REQ_FLUSH) + if (rq->cmd_flags & REQ_PREFLUSH) policy |= REQ_FSEQ_PREFLUSH; if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && (rq->cmd_flags & REQ_FUA)) @@ -391,9 +391,9 @@ void blk_insert_flush(struct request *rq) /* * @policy now records what operations need to be done. Adjust - * REQ_FLUSH and FUA for the driver. + * REQ_PREFLUSH and FUA for the driver. */ - rq->cmd_flags &= ~REQ_FLUSH; + rq->cmd_flags &= ~REQ_PREFLUSH; if (!(fflags & (1UL << QUEUE_FLAG_FUA))) rq->cmd_flags &= ~REQ_FUA; diff --git a/block/blk-mq.c b/block/blk-mq.c index 29bcd9c07a34..13f460368759 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1247,7 +1247,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio_op(bio), bio->bi_rw); - const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + const int is_flush_fua = bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; unsigned int request_count = 0; @@ -1344,7 +1344,7 @@ done: static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio_op(bio), bio->bi_rw); - const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + const int is_flush_fua = bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); struct blk_plug *plug; unsigned int request_count = 0; struct blk_map_ctx data; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index f236a31cc095..d524973f94b3 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -148,7 +148,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, device->md_io.error = -ENODEV; if ((op == REQ_OP_WRITE) && !test_bit(MD_NO_FUA, &device->flags)) - op_flags |= REQ_FUA | REQ_FLUSH; + op_flags |= REQ_FUA | REQ_PREFLUSH; op_flags |= REQ_SYNC | REQ_NOIDLE; bio = bio_alloc_drbd(GFP_NOIO); @@ -847,7 +847,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, unsigned long count = 0; sector_t esector, nr_sectors; - /* This would be an empty REQ_FLUSH, be silent. */ + /* This would be an empty REQ_PREFLUSH, be silent. */ if ((mode == SET_OUT_OF_SYNC) && size == 0) return 0; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index d55febcaa414..2b37744db0fa 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1609,7 +1609,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, if (connection->agreed_pro_version >= 95) return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | (bio->bi_rw & REQ_FUA ? DP_FUA : 0) | - (bio->bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | + (bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) | (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0); else return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index ef9245363dcc..129f8c76c9b1 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -112,7 +112,7 @@ struct p_header100 { #define DP_MAY_SET_IN_SYNC 4 #define DP_UNPLUG 8 /* not used anymore */ #define DP_FUA 16 /* equals REQ_FUA */ -#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_FLUSH 32 /* equals REQ_PREFLUSH */ #define DP_DISCARD 64 /* equals REQ_DISCARD */ #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ #define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6c5997894475..1ee002352ea2 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2158,7 +2158,7 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf) { return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | (dpf & DP_FUA ? REQ_FUA : 0) | - (dpf & DP_FLUSH ? REQ_FLUSH : 0); + (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); } static unsigned long wire_flags_to_bio_op(u32 dpf) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 2255dcfebd2b..eef6e9575b4e 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1132,7 +1132,7 @@ static int drbd_process_write_request(struct drbd_request *req) * replicating, in which case there is no point. */ if (unlikely(req->i.size == 0)) { /* The only size==0 bios we expect are empty flushes. */ - D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH); + D_ASSERT(device, req->master_bio->bi_rw & REQ_PREFLUSH); if (remote) _req_mod(req, QUEUE_AS_DRBD_BARRIER); return remote; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index a3c3b309ff4a..6925023e12d4 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -631,7 +631,7 @@ static void journal_write_unlocked(struct closure *cl) bio->bi_end_io = journal_write_endio; bio->bi_private = w; bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA); + REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); bch_bio_map(bio, w->data); trace_bcache_journal_write(bio); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 016b0aa7199c..69f16f43f8ab 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -205,10 +205,10 @@ static void bch_data_insert_start(struct closure *cl) return bch_data_invalidate(cl); /* - * Journal writes are marked REQ_FLUSH; if the original write was a + * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. */ - bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); + bio->bi_rw &= ~(REQ_PREFLUSH|REQ_FUA); do { unsigned i; @@ -668,7 +668,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.write_prio = 0; s->iop.error = 0; s->iop.flags = 0; - s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = (bio->bi_rw & (REQ_PREFLUSH|REQ_FUA)) != 0; s->iop.wq = bcache_wq; return s; @@ -920,7 +920,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) bch_writeback_add(dc); s->iop.bio = bio; - if (bio->bi_rw & REQ_FLUSH) { + if (bio->bi_rw & REQ_PREFLUSH) { /* Also need to send a flush to the backing device */ struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, dc->disk.bio_split); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 540e80eb317d..718744db62df 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -788,7 +788,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) spin_lock_irqsave(&cache->lock, flags); if (cache->need_tick_bio && - !(bio->bi_rw & (REQ_FUA | REQ_FLUSH)) && + !(bio->bi_rw & (REQ_FUA | REQ_PREFLUSH)) && bio_op(bio) != REQ_OP_DISCARD) { pb->tick = true; cache->need_tick_bio = false; @@ -830,7 +830,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) static int bio_triggers_commit(struct cache *cache, struct bio *bio) { - return bio->bi_rw & (REQ_FLUSH | REQ_FUA); + return bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); } /* @@ -1069,7 +1069,7 @@ static void dec_io_migrations(struct cache *cache) static bool discard_or_flush(struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || - bio->bi_rw & (REQ_FLUSH | REQ_FUA); + bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) @@ -1614,8 +1614,8 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) remap_to_cache(cache, bio, 0); /* - * REQ_FLUSH is not directed at any particular block so we don't - * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH + * REQ_PREFLUSH is not directed at any particular block so we don't + * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH * by dm-core. */ issue(cache, bio); @@ -1980,7 +1980,7 @@ static void process_deferred_bios(struct cache *cache) bio = bio_list_pop(&bios); - if (bio->bi_rw & REQ_FLUSH) + if (bio->bi_rw & REQ_PREFLUSH) process_flush_bio(cache, bio); else if (bio_op(bio) == REQ_OP_DISCARD) process_discard_bio(cache, &structs, bio); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 057d19b5e196..96dd5d7e454a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1911,11 +1911,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) struct crypt_config *cc = ti->private; /* - * If bio is REQ_FLUSH or REQ_OP_DISCARD, just bypass crypt queues. - * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight + * If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues. + * - for REQ_PREFLUSH device-mapper core ensures that no IO is in-flight * - for REQ_OP_DISCARD caller must use flush if IO ordering matters */ - if (unlikely(bio->bi_rw & REQ_FLUSH || bio_op(bio) == REQ_OP_DISCARD)) { + if (unlikely(bio->bi_rw & REQ_PREFLUSH || + bio_op(bio) == REQ_OP_DISCARD)) { bio->bi_bdev = cc->dev->bdev; if (bio_sectors(bio)) bio->bi_iter.bi_sector = cc->start + diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 665bf3285618..2faf49d8f4d7 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1540,9 +1540,9 @@ static int era_map(struct dm_target *ti, struct bio *bio) remap_to_origin(era, bio); /* - * REQ_FLUSH bios carry no data, so we're not interested in them. + * REQ_PREFLUSH bios carry no data, so we're not interested in them. */ - if (!(bio->bi_rw & REQ_FLUSH) && + if (!(bio->bi_rw & REQ_PREFLUSH) && (bio_data_dir(bio) == WRITE) && !metadata_current_marked(era->md, block)) { defer_bio(era, bio); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 22e0597d631e..0e225fd4a8d1 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -380,7 +380,7 @@ static void dispatch_io(int op, int op_flags, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count || (op_flags & REQ_FLUSH)) + if (where[i].count || (op_flags & REQ_PREFLUSH)) do_region(op, op_flags, i, where + i, dp, io); } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 0edb8ea51e46..b5dbf7a0515e 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -555,7 +555,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) struct bio_vec bv; size_t alloc_size; int i = 0; - bool flush_bio = (bio->bi_rw & REQ_FLUSH); + bool flush_bio = (bio->bi_rw & REQ_PREFLUSH); bool fua_bio = (bio->bi_rw & REQ_FUA); bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 907df2ba8fd4..9f5f460c0e92 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -704,7 +704,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if ((bio->bi_rw & REQ_FLUSH) || + if ((bio->bi_rw & REQ_PREFLUSH) || (bio_op(bio) == REQ_OP_DISCARD)) { bio_list_add(&sync, bio); continue; @@ -1253,7 +1253,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) * We need to dec pending if this was a write. */ if (rw == WRITE) { - if (!(bio->bi_rw & REQ_FLUSH) && bio_op(bio) != REQ_OP_DISCARD) + if (!(bio->bi_rw & REQ_PREFLUSH) && + bio_op(bio) != REQ_OP_DISCARD) dm_rh_dec(ms->rh, bio_record->write_region); return error; } diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 3550ca7c6577..b11813431f31 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -398,7 +398,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) region_t region = dm_rh_bio_to_region(rh, bio); int recovering = 0; - if (bio->bi_rw & REQ_FLUSH) { + if (bio->bi_rw & REQ_PREFLUSH) { rh->flush_failure = 1; return; } @@ -526,7 +526,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio->bi_rw & REQ_FLUSH || bio_op(bio) == REQ_OP_DISCARD) + if (bio->bi_rw & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 70bb0e8b62ce..69ab1ff5f5c9 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1680,7 +1680,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); - if (bio->bi_rw & REQ_FLUSH) { + if (bio->bi_rw & REQ_PREFLUSH) { bio->bi_bdev = s->cow->bdev; return DM_MAPIO_REMAPPED; } @@ -1799,7 +1799,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); - if (bio->bi_rw & REQ_FLUSH) { + if (bio->bi_rw & REQ_PREFLUSH) { if (!dm_bio_get_target_bio_nr(bio)) bio->bi_bdev = s->origin->bdev; else @@ -2285,7 +2285,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) bio->bi_bdev = o->dev->bdev; - if (unlikely(bio->bi_rw & REQ_FLUSH)) + if (unlikely(bio->bi_rw & REQ_PREFLUSH)) return DM_MAPIO_REMAPPED; if (bio_rw(bio) != WRITE) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index b738178ca068..48f1c01d7b9f 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -286,7 +286,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) uint32_t stripe; unsigned target_bio_nr; - if (bio->bi_rw & REQ_FLUSH) { + if (bio->bi_rw & REQ_PREFLUSH) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 1b684cbb9ba2..5f9e3d799d66 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -697,7 +697,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio) static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) { - return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && + return (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) && dm_thin_changed_this_transaction(tc->td); } @@ -868,7 +868,7 @@ static void __inc_remap_and_issue_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA) || + if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { @@ -1641,7 +1641,7 @@ static void __remap_and_issue_shared_cell(void *context, while ((bio = bio_list_pop(&cell->bios))) { if ((bio_data_dir(bio) == WRITE) || - (bio->bi_rw & (REQ_FLUSH | REQ_FUA) || + (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD)) bio_list_add(&info->defer_bios, bio); else { @@ -2556,7 +2556,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA) || + if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD) { thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index fcc68c8edba0..aba7ed9abb3a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1003,12 +1003,12 @@ static void dec_pending(struct dm_io *io, int error) if (io_error == DM_ENDIO_REQUEUE) return; - if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { + if ((bio->bi_rw & REQ_PREFLUSH) && bio->bi_iter.bi_size) { /* * Preflush done for flush with data, reissue - * without REQ_FLUSH. + * without REQ_PREFLUSH. */ - bio->bi_rw &= ~REQ_FLUSH; + bio->bi_rw &= ~REQ_PREFLUSH; queue_io(md, bio); } else { /* done with normal IO or empty flush */ @@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_FLUSH. + * allowed for all bio types except REQ_PREFLUSH. * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1507,7 +1507,7 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; - BUG_ON(bio->bi_rw & REQ_FLUSH); + BUG_ON(bio->bi_rw & REQ_PREFLUSH); BUG_ON(bi_size > *tio->len_ptr); BUG_ON(n_sectors > bi_size); *tio->len_ptr -= bi_size - n_sectors; @@ -1795,7 +1795,7 @@ static void __split_and_process_bio(struct mapped_device *md, start_io_acct(ci.io); - if (bio->bi_rw & REQ_FLUSH) { + if (bio->bi_rw & REQ_PREFLUSH) { ci.bio = &ci.md->flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 1ad3f485672c..70ff888d25d0 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -221,7 +221,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) struct bio *split; sector_t start_sector, end_sector, data_offset; - if (unlikely(bio->bi_rw & REQ_FLUSH)) { + if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } diff --git a/drivers/md/md.c b/drivers/md/md.c index bd4844fe0e98..1f123f5a29da 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -414,7 +414,7 @@ static void md_submit_flush_data(struct work_struct *ws) /* an empty barrier - all done */ bio_endio(bio); else { - bio->bi_rw &= ~REQ_FLUSH; + bio->bi_rw &= ~REQ_PREFLUSH; mddev->pers->make_request(mddev, bio); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 2e0918fc376d..b4f335245bd6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -424,7 +424,7 @@ struct mddev { /* Generic flush handling. * The last to finish preflush schedules a worker to submit - * the rest of the request (without the REQ_FLUSH flag). + * the rest of the request (without the REQ_PREFLUSH flag). */ struct bio *flush_bio; atomic_t flush_pending; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index dd483bb2e111..72ea98e89e57 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -111,7 +111,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_rw & REQ_FLUSH)) { + if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 051a10ca4e09..c3d439083212 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -458,7 +458,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) struct md_rdev *tmp_dev; struct bio *split; - if (unlikely(bio->bi_rw & REQ_FLUSH)) { + if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a3427230f7cf..10e53cd6a995 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1056,7 +1056,8 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) const int op = bio_op(bio); const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); + const unsigned long do_flush_fua = (bio->bi_rw & + (REQ_PREFLUSH | REQ_FUA)); const unsigned long do_sec = (bio->bi_rw & REQ_SECURE); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 615045a11bac..245640b50153 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1447,7 +1447,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) struct bio *split; - if (unlikely(bio->bi_rw & REQ_FLUSH)) { + if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 92651381b094..5504ce2bac06 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -536,7 +536,7 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) bio_endio(bio); return 0; } - bio->bi_rw &= ~REQ_FLUSH; + bio->bi_rw &= ~REQ_PREFLUSH; return -EAGAIN; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b9122e2c6aa1..7aacf5b55e15 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5150,7 +5150,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) DEFINE_WAIT(w); bool do_prepare; - if (unlikely(bi->bi_rw & REQ_FLUSH)) { + if (unlikely(bi->bi_rw & REQ_PREFLUSH)) { int ret = r5l_handle_flush_request(conf->log, bi); if (ret == 0) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index da944ffddbaf..ca70bc78b27d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2207,7 +2207,7 @@ static void btrfsic_bio_end_io(struct bio *bp) block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) @@ -2243,7 +2243,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) block->dev_bytenr, block->mirror_num); block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) @@ -2884,7 +2884,7 @@ int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) btrfsic_process_written_block(dev_state, dev_bytenr, &bh->b_data, 1, NULL, NULL, bh, op_flags); - } else if (NULL != dev_state && (op_flags & REQ_FLUSH)) { + } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO @@ -2982,7 +2982,7 @@ static void __btrfsic_submit_bio(struct bio *bio) kunmap(bio->bi_io_vec[i].bv_page); } kfree(mapped_datav); - } else if (NULL != dev_state && (bio->bi_rw & REQ_FLUSH)) { + } else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 34bc99637d5a..dc68f562f681 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1354,7 +1354,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) - write_flags &= ~(REQ_FUA | REQ_FLUSH); + write_flags &= ~(REQ_FUA | REQ_PREFLUSH); lock_buffer(bh); if (buffer_write_io_error(bh)) { /* diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d8acd3716dbd..686d8f160f5c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1237,7 +1237,7 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_FUA) op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) - op_flags |= REQ_FLUSH; + op_flags |= REQ_PREFLUSH; /* * Run the write verifier callback function if it exists. If diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 32d87522f349..562ab8301217 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -168,7 +168,7 @@ enum rq_flag_bits { __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ - __REQ_FLUSH, /* request for cache flush */ + __REQ_PREFLUSH, /* request for cache flush */ /* bio only flags */ __REQ_RAHEAD, /* read ahead, can fail anytime */ @@ -212,12 +212,12 @@ enum rq_flag_bits { (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_FLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE) + REQ_PREFLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE) #define REQ_CLONE_MASK REQ_COMMON_MASK /* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_FLUSH_SEQ) + (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_PREFLUSH | REQ_FUA | REQ_FLUSH_SEQ) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_THROTTLED (1ULL << __REQ_THROTTLED) @@ -235,7 +235,7 @@ enum rq_flag_bits { #define REQ_PREEMPT (1ULL << __REQ_PREEMPT) #define REQ_ALLOCED (1ULL << __REQ_ALLOCED) #define REQ_COPY_USER (1ULL << __REQ_COPY_USER) -#define REQ_FLUSH (1ULL << __REQ_FLUSH) +#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) diff --git a/include/linux/fs.h b/include/linux/fs.h index ccd166477487..183024525d40 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -204,9 +204,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define READ_SYNC REQ_SYNC #define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE) #define WRITE_ODIRECT REQ_SYNC -#define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) +#define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH) #define WRITE_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FUA) -#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) +#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA) /* * Attribute flags. These should be or-ed together to figure out what diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 30efa44a473c..878963a1f058 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -33,7 +33,7 @@ TRACE_DEFINE_ENUM(SSR); TRACE_DEFINE_ENUM(__REQ_RAHEAD); TRACE_DEFINE_ENUM(__REQ_SYNC); TRACE_DEFINE_ENUM(__REQ_NOIDLE); -TRACE_DEFINE_ENUM(__REQ_FLUSH); +TRACE_DEFINE_ENUM(__REQ_PREFLUSH); TRACE_DEFINE_ENUM(__REQ_FUA); TRACE_DEFINE_ENUM(__REQ_PRIO); TRACE_DEFINE_ENUM(__REQ_META); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0c70fbb6ea8d..03b0dd98ff0e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -189,6 +189,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; #define BLK_TC_RAHEAD BLK_TC_AHEAD +#define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ @@ -219,7 +220,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(op_flags, SYNC); what |= MASK_TC_BIT(op_flags, RAHEAD); what |= MASK_TC_BIT(op_flags, META); - what |= MASK_TC_BIT(op_flags, FLUSH); + what |= MASK_TC_BIT(op_flags, PREFLUSH); what |= MASK_TC_BIT(op_flags, FUA); if (op == REQ_OP_DISCARD) what |= BLK_TC_ACT(BLK_TC_DISCARD); @@ -1779,7 +1780,7 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) { int i = 0; - if (rw & REQ_FLUSH) + if (rw & REQ_PREFLUSH) rwbs[i++] = 'F'; switch (op) { -- cgit v1.3-8-gc7d7 From a4f144ebbdf6f7807c477bce8e136047ed27321f Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 1 Jun 2016 12:33:05 -0700 Subject: perf/core: Fix crash due to account/unaccount_sb_event() inconsistency unaccount_pmu_sb_event() did not check for attributes in event->attr before calling detach_sb_event(), while account_pmu_event() did. This caused NULL pointer reference in cgroup events that did not have any of the attributes checked by account_pmu_event(). To trigger the bug just wait for a cgroup event to terminate, e.g.: $ mkdir /dev/cgroup/devices/test $ perf stat -e cycles -a -G test sleep 0 ... see crash ... Signed-off-by: David Carrillo-Cisneros Reviewed-by: Stephane Eranian Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zheng Link: http://lkml.kernel.org/r/1464809585-66072-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5d48306879d5..ae081a141a4a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3682,15 +3682,28 @@ static void detach_sb_event(struct perf_event *event) raw_spin_unlock(&pel->lock); } -static void unaccount_pmu_sb_event(struct perf_event *event) +static bool is_sb_event(struct perf_event *event) { + struct perf_event_attr *attr = &event->attr; + if (event->parent) - return; + return false; if (event->attach_state & PERF_ATTACH_TASK) - return; + return false; - detach_sb_event(event); + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + return true; + return false; +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + detach_sb_event(event); } static void unaccount_event_cpu(struct perf_event *event, int cpu) @@ -8666,18 +8679,7 @@ static void attach_sb_event(struct perf_event *event) */ static void account_pmu_sb_event(struct perf_event *event) { - struct perf_event_attr *attr = &event->attr; - - if (event->parent) - return; - - if (event->attach_state & PERF_ATTACH_TASK) - return; - - if (attr->mmap || attr->mmap_data || attr->mmap2 || - attr->comm || attr->comm_exec || - attr->task || - attr->context_switch) + if (is_sb_event(event)) attach_sb_event(event); } -- cgit v1.3-8-gc7d7 From dfaaf3fa01d65cf6e2072965bb0b7aaa7285344f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2016 18:31:33 +0200 Subject: locking/lockdep: Use __jhash_mix() for iterate_chain_key() Use __jhash_mix() to mix the class_idx into the class_key. This function provides better mixing than the previously used, home grown mix function. Leave hashing to the professionals :-) Suggested-by: George Spelvin Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81f1a7107c0e..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE]; * It's a 64-bit hash, because it's important for the keys to be * unique. */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) +static inline u64 iterate_chain_key(u64 key, u32 idx) +{ + u32 k0 = key, k1 = key >> 32; + + __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */ + + return k0 | (u64)k1 << 32; +} void lockdep_off(void) { -- cgit v1.3-8-gc7d7 From a461d58792d4a46b8b10ae50973ec9b2763b694e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 27 May 2016 15:47:18 +0200 Subject: locking/rtmutex: Only warn once on a trylock from bad context One warning should be enough to get one motivated to fix this. It is possible that this happens more than once and that starts flooding the output. Later the prints will be suppressed so we only get half of it. Depending on the console system used it might not be helpful. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464356838-1755-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { - if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); -- cgit v1.3-8-gc7d7 From 03c041c5bf6ed584dff36b7cd509e0146a124277 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 3 Jun 2016 17:58:41 -0500 Subject: sched/debug: Always show 'nr_migrations' The nr_migrations field is updated independently of CONFIG_SCHEDSTATS, so it can be displayed regardless. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5b1b04057ae2b14d73c2d03f56582c1d38cfe066.1464994423.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c393a336..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; -#ifdef CONFIG_SCHEDSTATS P(se.nr_migrations); +#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; -- cgit v1.3-8-gc7d7 From 8d53fa19041ae65c484d81d75179b4a577e6d8e4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2016 09:12:30 +0200 Subject: locking/qspinlock: Clarify xchg_tail() ordering While going over the code I noticed that xchg_tail() is a RELEASE but had no obvious pairing commented. It pairs with a somewhat unique address dependency through decode_tail(). So the store-release of xchg_tail() is paired by the address dependency of the load of xchg_tail followed by the dereference from the pointer computed from that load. The @old -> @prev transformation itself is pure, and therefore does not depend on external state, so that is immaterial wrt. ordering. Signed-off-by: Peter Zijlstra (Intel) Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5fc8c311b8fe..ee7deb08d43d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); * therefore increment the cpu number by one. */ -static inline u32 encode_tail(int cpu, int idx) +static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; @@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx) return tail; } -static inline struct mcs_spinlock *decode_tail(u32 tail) +static inline __pure struct mcs_spinlock *decode_tail(u32 tail) { int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; @@ -455,6 +455,8 @@ queue: * pending stuff. * * p,*,* -> n,*,* + * + * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -465,6 +467,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* + * The above xchg_tail() is also a load of @lock which generates, + * through decode_tail(), a pointer. + * + * The address dependency matches the RELEASE of xchg_tail() + * such that the access to @prev must happen after. + */ + smp_read_barrier_depends(); + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); -- cgit v1.3-8-gc7d7 From 055ce0fd1b86c204430cbc0887165599d6e15090 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2016 10:36:53 +0200 Subject: locking/qspinlock: Add comments I figured we need to document the spin_is_locked() and spin_unlock_wait() constraints somwehere. Ideally 'someone' would rewrite Documentation/atomic_ops.txt and we could find a place in there. But currently that document is stale to the point of hardly being useful. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ee7deb08d43d..2f9153b183c9 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -267,6 +267,63 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif +/* + * Various notes on spin_is_locked() and spin_unlock_wait(), which are + * 'interesting' functions: + * + * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE + * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, + * PPC). Also qspinlock has a similar issue per construction, the setting of + * the locked byte can be unordered acquiring the lock proper. + * + * This gets to be 'interesting' in the following cases, where the /should/s + * end up false because of this issue. + * + * + * CASE 1: + * + * So the spin_is_locked() correctness issue comes from something like: + * + * CPU0 CPU1 + * + * global_lock(); local_lock(i) + * spin_lock(&G) spin_lock(&L[i]) + * for (i) if (!spin_is_locked(&G)) { + * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); + * return; + * } + * // deal with fail + * + * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such + * that there is exclusion between the two critical sections. + * + * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from + * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) + * /should/ be constrained by the ACQUIRE from spin_lock(&G). + * + * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. + * + * + * CASE 2: + * + * For spin_unlock_wait() there is a second correctness issue, namely: + * + * CPU0 CPU1 + * + * flag = set; + * smp_mb(); spin_lock(&l) + * spin_unlock_wait(&l); if (!flag) + * // add to lockless list + * spin_unlock(&l); + * // iterate lockless list + * + * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 + * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE + * semantics etc..) + * + * Where flag /should/ be ordered against the locked store of l. + */ + /* * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before * issuing an _unordered_ store to set _Q_LOCKED_VAL. -- cgit v1.3-8-gc7d7 From 8ee62b1870be8e630158701632a533d0378e15b8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 3 Jun 2016 22:26:02 -0700 Subject: locking/rwsem: Convert sem->count to 'atomic_long_t' Convert the rwsem count variable to an atomic_long_t since we use it as an atomic variable. This also allows us to remove the rwsem_atomic_{add,update}() "abstraction" which would now be an unnecesary level of indirection. In follow up patches, we also remove the rwsem_atomic_{add,update}() definitions across the various architectures. Suggested-by: Peter Zijlstra Signed-off-by: Jason Low [ Build warning fixes on various architectures. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Fenghua Yu Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Paul E. McKenney Cc: Peter Hurley Cc: Terry Rudd Cc: Thomas Gleixner Cc: Tim Chen Cc: Tony Luck Cc: Waiman Long Link: http://lkml.kernel.org/r/1465017963-4839-2-git-send-email-jason.low2@hpe.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/rwsem.h | 26 +++++++++++++------------- arch/ia64/include/asm/rwsem.h | 24 ++++++++++++------------ include/asm-generic/rwsem.h | 6 +++--- include/linux/rwsem.h | 8 +++++--- kernel/locking/rwsem-xadd.c | 32 +++++++++++++++++--------------- 5 files changed, 50 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 0131a7058778..b40021aabb9f 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -25,8 +25,8 @@ static inline void __down_read(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP - oldcount = sem->count; - sem->count += RWSEM_ACTIVE_READ_BIAS; + oldcount = sem->count.counter; + sem->count.counter += RWSEM_ACTIVE_READ_BIAS; #else long temp; __asm__ __volatile__( @@ -52,13 +52,13 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) { long old, new, res; - res = sem->count; + res = atomic_long_read(&sem->count); do { new = res + RWSEM_ACTIVE_READ_BIAS; if (new <= 0) break; old = res; - res = cmpxchg(&sem->count, old, new); + res = atomic_long_cmpxchg(&sem->count, old, new); } while (res != old); return res >= 0 ? 1 : 0; } @@ -67,8 +67,8 @@ static inline long ___down_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP - oldcount = sem->count; - sem->count += RWSEM_ACTIVE_WRITE_BIAS; + oldcount = sem->count.counter; + sem->count.counter += RWSEM_ACTIVE_WRITE_BIAS; #else long temp; __asm__ __volatile__( @@ -106,7 +106,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, + long ret = atomic_long_cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; @@ -117,8 +117,8 @@ static inline void __up_read(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP - oldcount = sem->count; - sem->count -= RWSEM_ACTIVE_READ_BIAS; + oldcount = sem->count.counter; + sem->count.counter -= RWSEM_ACTIVE_READ_BIAS; #else long temp; __asm__ __volatile__( @@ -142,8 +142,8 @@ static inline void __up_write(struct rw_semaphore *sem) { long count; #ifndef CONFIG_SMP - sem->count -= RWSEM_ACTIVE_WRITE_BIAS; - count = sem->count; + sem->count.counter -= RWSEM_ACTIVE_WRITE_BIAS; + count = sem->count.counter; #else long temp; __asm__ __volatile__( @@ -171,8 +171,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP - oldcount = sem->count; - sem->count -= RWSEM_WAITING_BIAS; + oldcount = sem->count.counter; + sem->count.counter -= RWSEM_WAITING_BIAS; #else long temp; __asm__ __volatile__( diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 8b23e070b844..c5d544f188ed 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -40,7 +40,7 @@ static inline void __down_read (struct rw_semaphore *sem) { - long result = ia64_fetchadd8_acq((unsigned long *)&sem->count, 1); + long result = ia64_fetchadd8_acq((unsigned long *)&sem->count.counter, 1); if (result < 0) rwsem_down_read_failed(sem); @@ -55,9 +55,9 @@ ___down_write (struct rw_semaphore *sem) long old, new; do { - old = sem->count; + old = atomic_long_read(&sem->count); new = old + RWSEM_ACTIVE_WRITE_BIAS; - } while (cmpxchg_acq(&sem->count, old, new) != old); + } while (atomic_long_cmpxchg_acquire(&sem->count, old, new) != old); return old; } @@ -85,7 +85,7 @@ __down_write_killable (struct rw_semaphore *sem) static inline void __up_read (struct rw_semaphore *sem) { - long result = ia64_fetchadd8_rel((unsigned long *)&sem->count, -1); + long result = ia64_fetchadd8_rel((unsigned long *)&sem->count.counter, -1); if (result < 0 && (--result & RWSEM_ACTIVE_MASK) == 0) rwsem_wake(sem); @@ -100,9 +100,9 @@ __up_write (struct rw_semaphore *sem) long old, new; do { - old = sem->count; + old = atomic_long_read(&sem->count); new = old - RWSEM_ACTIVE_WRITE_BIAS; - } while (cmpxchg_rel(&sem->count, old, new) != old); + } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); if (new < 0 && (new & RWSEM_ACTIVE_MASK) == 0) rwsem_wake(sem); @@ -115,8 +115,8 @@ static inline int __down_read_trylock (struct rw_semaphore *sem) { long tmp; - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg_acq(&sem->count, tmp, tmp+1)) { + while ((tmp = atomic_long_read(&sem->count)) >= 0) { + if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, tmp+1)) { return 1; } } @@ -129,8 +129,8 @@ __down_read_trylock (struct rw_semaphore *sem) static inline int __down_write_trylock (struct rw_semaphore *sem) { - long tmp = cmpxchg_acq(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + long tmp = atomic_long_cmpxchg_acquire(&sem->count, + RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); return tmp == RWSEM_UNLOCKED_VALUE; } @@ -143,9 +143,9 @@ __downgrade_write (struct rw_semaphore *sem) long old, new; do { - old = sem->count; + old = atomic_long_read(&sem->count); new = old - RWSEM_WAITING_BIAS; - } while (cmpxchg_rel(&sem->count, old, new) != old); + } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); if (old < 0) rwsem_downgrade_wake(sem); diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index 3fc94a046bf5..a3a93eca766c 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -41,8 +41,8 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) { long tmp; - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg_acquire(&sem->count, tmp, + while ((tmp = atomic_long_read(&sem->count)) >= 0) { + if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, tmp + RWSEM_ACTIVE_READ_BIAS)) { return 1; } @@ -79,7 +79,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) { long tmp; - tmp = cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); return tmp == RWSEM_UNLOCKED_VALUE; } diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index d37fbb34d06f..dd1d14250340 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -23,10 +23,11 @@ struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK #include /* use a generic implementation */ +#define __RWSEM_INIT_COUNT(name) .count = RWSEM_UNLOCKED_VALUE #else /* All arch specific implementations share the same struct */ struct rw_semaphore { - long count; + atomic_long_t count; struct list_head wait_list; raw_spinlock_t wait_lock; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -54,9 +55,10 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) { - return sem->count != 0; + return atomic_long_read(&sem->count) != 0; } +#define __RWSEM_INIT_COUNT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) #endif /* Common initializer macros and functions */ @@ -74,7 +76,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) #endif #define __RWSEM_INITIALIZER(name) \ - { .count = RWSEM_UNLOCKED_VALUE, \ + { __RWSEM_INIT_COUNT(name), \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b957da7fcb19..63b40a5c62ec 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->count = RWSEM_UNLOCKED_VALUE; + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -153,10 +153,11 @@ __rwsem_mark_wake(struct rw_semaphore *sem, if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & + if (atomic_long_sub_return(adjustment, &sem->count) & RWSEM_ACTIVE_MASK) goto out; /* Last active locker left. Retry waking readers. */ @@ -186,7 +187,7 @@ __rwsem_mark_wake(struct rw_semaphore *sem, adjustment -= RWSEM_WAITING_BIAS; if (adjustment) - rwsem_atomic_add(adjustment, sem); + atomic_long_add(adjustment, &sem->count); next = sem->wait_list.next; loop = woken; @@ -233,7 +234,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + count = atomic_long_add_return(adjustment, &sem->count); /* If there are no active locks, wake the front queued process(es). * @@ -282,7 +283,8 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) RWSEM_ACTIVE_WRITE_BIAS : RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; - if (cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) { + if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) + == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } @@ -296,13 +298,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = READ_ONCE(sem->count); + long old, count = atomic_long_read(&sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg_acquire(&sem->count, count, + old = atomic_long_cmpxchg_acquire(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); @@ -324,7 +326,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); if (!owner) { - long count = READ_ONCE(sem->count); + long count = atomic_long_read(&sem->count); /* * If sem->owner is not set, yet we have just recently entered the * slowpath with the lock being active, then there is a possibility @@ -375,7 +377,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) * held by readers. Check the counter to verify the * state. */ - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); return (count == 0 || count == RWSEM_WAITING_BIAS); } @@ -460,7 +462,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ - count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); /* do optimistic spinning and steal lock if possible */ if (rwsem_optimistic_spin(sem)) @@ -483,7 +485,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); /* * If there were already threads queued before us and there are @@ -505,7 +507,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) } } else - count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); /* wait until we successfully acquire the lock */ set_current_state(state); @@ -521,7 +523,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) schedule(); set_current_state(state); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } @@ -536,7 +538,7 @@ out_nolock: raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); if (list_empty(&sem->wait_list)) - rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); else __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.3-8-gc7d7 From 19c5d690e41697fcdd19379ab9d10d8d37818414 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:19 -0400 Subject: locking/rwsem: Add reader-owned state to the owner field Currently, it is not possible to determine for sure if a reader owns a rwsem by looking at the content of the rwsem data structure. This patch adds a new state RWSEM_READER_OWNED to the owner field to indicate that readers currently own the lock. This enables us to address the following 2 issues in the rwsem optimistic spinning code: 1) rwsem_can_spin_on_owner() will disallow optimistic spinning if the owner field is NULL which can mean either the readers own the lock or the owning writer hasn't set the owner field yet. In the latter case, we miss the chance to do optimistic spinning. 2) While a writer is waiting in the OSQ and a reader takes the lock, the writer will continue to spin when out of the OSQ in the main rwsem_optimistic_spin() loop as the owner field is NULL wasting CPU cycles if some of readers are sleeping. Adding the new state will allow optimistic spinning to go forward as long as the owner field is not RWSEM_READER_OWNED and the owner is running, if set, but stop immediately when that state has been reached. On a 4-socket Haswell machine running on a 4.6-rc1 based kernel, the fio test with multithreaded randrw and randwrite tests on the same file on a XFS partition on top of a NVDIMM were run, the aggregated bandwidths before and after the patch were as follows: Test BW before patch BW after patch % change ---- --------------- -------------- -------- randrw 988 MB/s 1192 MB/s +21% randwrite 1513 MB/s 1623 MB/s +7.3% The perf profile of the rwsem_down_write_failed() function in randrw before and after the patch were: 19.95% 5.88% fio [kernel.vmlinux] [k] rwsem_down_write_failed 14.20% 1.52% fio [kernel.vmlinux] [k] rwsem_down_write_failed The actual CPU cycles spend in rwsem_down_write_failed() dropped from 5.88% to 1.52% after the patch. The xfstests was also run and no regression was observed. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jason Low Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Dave Chinner Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 41 ++++++++++++++++++++++------------------- kernel/locking/rwsem.c | 8 ++++++-- kernel/locking/rwsem.h | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 63b40a5c62ec..6b0d0605910e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -163,6 +163,12 @@ __rwsem_mark_wake(struct rw_semaphore *sem, /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } + /* + * It is not really necessary to set it to reader-owned here, + * but it gives the spinners an early indication that the + * readers now have the lock. + */ + rwsem_set_reader_owned(sem); } /* Grant an infinite number of read locks to the readers at the front @@ -325,16 +331,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!owner) { - long count = atomic_long_read(&sem->count); + if (!rwsem_owner_is_writer(owner)) { /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath with the lock being active, then there is a possibility - * reader(s) may have the lock. To be safe, bail spinning in these - * situations. + * Don't spin if the rwsem is readers owned. */ - if (count & RWSEM_ACTIVE_MASK) - ret = false; + ret = !rwsem_owner_is_reader(owner); goto done; } @@ -347,8 +348,6 @@ done: static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) { - long count; - rcu_read_lock(); while (sem->owner == owner) { /* @@ -369,16 +368,11 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) } rcu_read_unlock(); - if (READ_ONCE(sem->owner)) - return true; /* new owner, continue spinning */ - /* - * When the owner is not set, the lock could be free or - * held by readers. Check the counter to verify the - * state. + * If there is a new owner or the owner is not set, we continue + * spinning. */ - count = atomic_long_read(&sem->count); - return (count == 0 || count == RWSEM_WAITING_BIAS); + return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) @@ -397,7 +391,16 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) while (true) { owner = READ_ONCE(sem->owner); - if (owner && !rwsem_spin_on_owner(sem, owner)) + /* + * Don't spin if + * 1) the owner is a reader as we we can't determine if the + * reader is actively running or not. + * 2) The rwsem_spin_on_owner() returns false which means + * the owner isn't running. + */ + if (rwsem_owner_is_reader(owner) || + (rwsem_owner_is_writer(owner) && + !rwsem_spin_on_owner(sem, owner))) break; /* wait_lock will be acquired if write_lock is obtained */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2e853ad93a3a..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_reader_owned(sem); + } return ret; } @@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ - rwsem_clear_owner(sem); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..8f43ba234787 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,3 +1,20 @@ +/* + * The owner field of the rw_semaphore structure will be set to + * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * the owner field when it unlocks. A reader, on the other hand, will + * not touch the owner field when it unlocks. + * + * In essence, the owner field now has the following 3 states: + * 1) 0 + * - lock is free or the owner hasn't set the field yet + * 2) RWSEM_READER_OWNED + * - lock is currently or previously owned by readers (lock is free + * or not set by owner yet) + * 3) Other non-zero value + * - a writer owns the lock + */ +#define RWSEM_READER_OWNED ((struct task_struct *)1UL) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -9,6 +26,26 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem) sem->owner = NULL; } +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + /* + * We check the owner value first to make sure that we will only + * do a write to the rwsem cacheline when it is really necessary + * to minimize cacheline contention. + */ + if (sem->owner != RWSEM_READER_OWNED) + sem->owner = RWSEM_READER_OWNED; +} + +static inline bool rwsem_owner_is_writer(struct task_struct *owner) +{ + return owner && owner != RWSEM_READER_OWNED; +} + +static inline bool rwsem_owner_is_reader(struct task_struct *owner) +{ + return owner == RWSEM_READER_OWNED; +} #else static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -17,4 +54,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem) static inline void rwsem_clear_owner(struct rw_semaphore *sem) { } + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ +} #endif -- cgit v1.3-8-gc7d7 From fb6a44f33be542fd81575ff93a4e8118d6a58592 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:20 -0400 Subject: locking/rwsem: Protect all writes to owner by WRITE_ONCE() Without using WRITE_ONCE(), the compiler can potentially break a write into multiple smaller ones (store tearing). So a read from the same data by another task concurrently may return a partial result. This can result in a kernel crash if the data is a memory address that is being dereferenced. This patch changes all write to rwsem->owner to use WRITE_ONCE() to make sure that store tearing will not happen. READ_ONCE() may not be needed for rwsem->owner as long as the value is only used for comparison and not dereferencing. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 8f43ba234787..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -16,14 +16,21 @@ #define RWSEM_READER_OWNED ((struct task_struct *)1UL) #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - sem->owner = current; + WRITE_ONCE(sem->owner, current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - sem->owner = NULL; + WRITE_ONCE(sem->owner, NULL); } static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) @@ -34,7 +41,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) * to minimize cacheline contention. */ if (sem->owner != RWSEM_READER_OWNED) - sem->owner = RWSEM_READER_OWNED; + WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } static inline bool rwsem_owner_is_writer(struct task_struct *owner) -- cgit v1.3-8-gc7d7 From bf7b4c472db44413251bcef79ca1f6bf1ec81475 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:22 -0400 Subject: locking/rwsem: Improve reader wakeup code In __rwsem_do_wake(), the reader wakeup code will assume a writer has stolen the lock if the active reader/writer count is not 0. However, this is not as reliable an indicator as the original "< RWSEM_WAITING_BIAS" check. If another reader is present, the code will still break out and exit even if the writer is gone. This patch changes it to check the same "< RWSEM_WAITING_BIAS" condition to reduce the chance of false positive. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Peter Hurley Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-5-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 6b0d0605910e..4f1daf5a472d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -156,9 +156,14 @@ __rwsem_mark_wake(struct rw_semaphore *sem, oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (atomic_long_sub_return(adjustment, &sem->count) & - RWSEM_ACTIVE_MASK) + /* + * If the count is still less than RWSEM_WAITING_BIAS + * after removing the adjustment, it is assumed that + * a writer has stolen the lock. We have to undo our + * reader grant. + */ + if (atomic_long_add_return(-adjustment, &sem->count) < + RWSEM_WAITING_BIAS) goto out; /* Last active locker left. Retry waking readers. */ goto try_reader_grant; -- cgit v1.3-8-gc7d7 From ddd0fa73c2b71c35de4fe7ae60a5f1a6cddc2cf0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:23 -0400 Subject: locking/rwsem: Streamline the rwsem_optimistic_spin() code This patch moves the owner loading and checking code entirely inside of rwsem_spin_on_owner() to simplify the logic of rwsem_optimistic_spin() loop. Suggested-by: Peter Hurley Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Peter Hurley Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-6-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 4f1daf5a472d..2031281bb940 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -350,9 +350,16 @@ done: return ret; } -static noinline -bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +/* + * Return true only if we can still spin on the owner field of the rwsem. + */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { + struct task_struct *owner = READ_ONCE(sem->owner); + + if (!rwsem_owner_is_writer(owner)) + goto out; + rcu_read_lock(); while (sem->owner == owner) { /* @@ -372,7 +379,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) cpu_relax_lowlatency(); } rcu_read_unlock(); - +out: /* * If there is a new owner or the owner is not set, we continue * spinning. @@ -382,7 +389,6 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { - struct task_struct *owner; bool taken = false; preempt_disable(); @@ -394,21 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) if (!osq_lock(&sem->osq)) goto done; - while (true) { - owner = READ_ONCE(sem->owner); + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock as we can't determine if they are + * actively running or not. + */ + while (rwsem_spin_on_owner(sem)) { /* - * Don't spin if - * 1) the owner is a reader as we we can't determine if the - * reader is actively running or not. - * 2) The rwsem_spin_on_owner() returns false which means - * the owner isn't running. + * Try to acquire the lock */ - if (rwsem_owner_is_reader(owner) || - (rwsem_owner_is_writer(owner) && - !rwsem_spin_on_owner(sem, owner))) - break; - - /* wait_lock will be acquired if write_lock is obtained */ if (rwsem_try_write_lock_unqueued(sem)) { taken = true; break; @@ -420,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * we're an RT task that will live-lock because we won't let * the owner complete. */ - if (!owner && (need_resched() || rt_task(current))) + if (!sem->owner && (need_resched() || rt_task(current))) break; /* -- cgit v1.3-8-gc7d7 From 6428671bae97caa7040e24e79e969fd87908f4f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Jun 2016 20:58:15 +0200 Subject: locking/mutex: Optimize mutex_trylock() fast-path A while back Viro posted a number of 'interesting' mutex_is_locked() users on IRC, one of those was RCU. RCU seems to use mutex_is_locked() to avoid doing mutex_trylock(), the regular load before modify pattern. While the use isn't wrong per se, its curious in that its needed at all, mutex_trylock() should be good enough on its own to avoid the pointless cacheline bounces. So fix those and remove the mutex_is_locked() (ab)use from RCU. Reported-by: Al Viro Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul McKenney Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Link: http://lkml.kernel.org/r/20160601185815.GW3190@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/mutex.h | 2 +- arch/powerpc/include/asm/mutex.h | 2 +- arch/x86/include/asm/mutex_32.h | 2 +- arch/x86/include/asm/mutex_64.h | 6 +++--- include/asm-generic/mutex-dec.h | 2 +- include/asm-generic/mutex-xchg.h | 6 +++++- kernel/rcu/tree.c | 1 - 7 files changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/mutex.h b/arch/ia64/include/asm/mutex.h index f41e66d65e31..28cb819e0ff9 100644 --- a/arch/ia64/include/asm/mutex.h +++ b/arch/ia64/include/asm/mutex.h @@ -82,7 +82,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (cmpxchg_acq(count, 1, 0) == 1) + if (atomic_read(count) == 1 && cmpxchg_acq(count, 1, 0) == 1) return 1; return 0; } diff --git a/arch/powerpc/include/asm/mutex.h b/arch/powerpc/include/asm/mutex.h index 127ab23e1f6c..078155fa1189 100644 --- a/arch/powerpc/include/asm/mutex.h +++ b/arch/powerpc/include/asm/mutex.h @@ -124,7 +124,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && __mutex_cmpxchg_lock(count, 1, 0) == 1)) return 1; return 0; } diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h index 85e6cda45a02..e9355a84fc67 100644 --- a/arch/x86/include/asm/mutex_32.h +++ b/arch/x86/include/asm/mutex_32.h @@ -101,7 +101,7 @@ static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { /* cmpxchg because it never induces a false contention state. */ - if (likely(atomic_cmpxchg(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) return 1; return 0; diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index 07537a44216e..d9850758464e 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -118,10 +118,10 @@ do { \ static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (likely(atomic_cmpxchg(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) return 1; - else - return 0; + + return 0; } #endif /* _ASM_X86_MUTEX_64_H */ diff --git a/include/asm-generic/mutex-dec.h b/include/asm-generic/mutex-dec.h index fd694cfd678a..c54829d3de37 100644 --- a/include/asm-generic/mutex-dec.h +++ b/include/asm-generic/mutex-dec.h @@ -80,7 +80,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - if (likely(atomic_cmpxchg_acquire(count, 1, 0) == 1)) + if (likely(atomic_read(count) == 1 && atomic_cmpxchg_acquire(count, 1, 0) == 1)) return 1; return 0; } diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h index a6b4a7bd6ac9..3269ec4e195f 100644 --- a/include/asm-generic/mutex-xchg.h +++ b/include/asm-generic/mutex-xchg.h @@ -91,8 +91,12 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - int prev = atomic_xchg_acquire(count, 0); + int prev; + if (atomic_read(count) != 1) + return 0; + + prev = atomic_xchg_acquire(count, 0); if (unlikely(prev < 0)) { /* * The lock was marked contended so we must restore that diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..b7326893221f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3681,7 +3681,6 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && (rnp == rnp_root || ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - !mutex_is_locked(&rsp->exp_mutex) && mutex_trylock(&rsp->exp_mutex)) goto fastpath; -- cgit v1.3-8-gc7d7 From 288dab8a35a0bde426a09870943c8d3ee3a50dab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jun 2016 16:00:36 +0200 Subject: block: add a separate operation type for secure erase Instead of overloading the discard support with the REQ_SECURE flag. Use the opportunity to rename the queue flag as well, and remove the dead checks for this flag in the RAID 1 and RAID 10 drivers that don't claim support for secure erase. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 27 +++++++++++++++++---------- block/blk-lib.c | 25 ++++++++++++++----------- block/blk-merge.c | 6 ++---- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/block/xen-blkfront.c | 14 +++++++++----- drivers/md/raid1.c | 3 +-- drivers/md/raid10.c | 5 ++--- drivers/mmc/card/block.c | 10 ++++++---- drivers/mmc/card/queue.c | 2 +- include/linux/blk_types.h | 5 ++--- include/linux/blkdev.h | 23 ++++------------------- kernel/trace/blktrace.c | 6 ++++-- 12 files changed, 63 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 32a283eb7274..db31a2981223 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1977,16 +1977,21 @@ generic_make_request_checks(struct bio *bio) } } - if ((bio_op(bio) == REQ_OP_DISCARD) && - (!blk_queue_discard(q) || - ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { - err = -EOPNOTSUPP; - goto end_io; - } - - if (bio_op(bio) == REQ_OP_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { - err = -EOPNOTSUPP; - goto end_io; + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + if (!blk_queue_discard(q)) + goto not_supported; + break; + case REQ_OP_SECURE_ERASE: + if (!blk_queue_secure_erase(q)) + goto not_supported; + break; + case REQ_OP_WRITE_SAME: + if (!bdev_write_same(bio->bi_bdev)) + goto not_supported; + break; + default: + break; } /* @@ -2003,6 +2008,8 @@ generic_make_request_checks(struct bio *bio) trace_block_bio_queue(q, bio); return true; +not_supported: + err = -EOPNOTSUPP; end_io: bio->bi_error = err; bio_endio(bio); diff --git a/block/blk-lib.c b/block/blk-lib.c index ff2a7f04af4d..78626c2fde33 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -23,20 +23,27 @@ static struct bio *next_bio(struct bio *bio, unsigned int nr_pages, } int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int op_flags, + sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int granularity; + enum req_op op; int alignment; if (!q) return -ENXIO; - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - if ((op_flags & REQ_SECURE) && !blk_queue_secdiscard(q)) - return -EOPNOTSUPP; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (!blk_queue_secure_erase(q)) + return -EOPNOTSUPP; + op = REQ_OP_SECURE_ERASE; + } else { + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + op = REQ_OP_DISCARD; + } /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); @@ -66,7 +73,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio = next_bio(bio, 1, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; - bio_set_op_attrs(bio, REQ_OP_DISCARD, op_flags); + bio_set_op_attrs(bio, op, 0); bio->bi_iter.bi_size = req_sects << 9; nr_sects -= req_sects; @@ -100,16 +107,12 @@ EXPORT_SYMBOL(__blkdev_issue_discard); int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) { - int op_flags = 0; struct bio *bio = NULL; struct blk_plug plug; int ret; - if (flags & BLKDEV_DISCARD_SECURE) - op_flags |= REQ_SECURE; - blk_start_plug(&plug); - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, op_flags, + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index c265348b75d1..9772308a8391 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -649,8 +649,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (!rq_mergeable(req) || !rq_mergeable(next)) return 0; - if (!blk_check_merge_flags(req->cmd_flags, req_op(req), next->cmd_flags, - req_op(next))) + if (req_op(req) != req_op(next)) return 0; /* @@ -752,8 +751,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; - if (!blk_check_merge_flags(rq->cmd_flags, req_op(rq), bio->bi_rw, - bio_op(bio))) + if (req_op(rq) != bio_op(bio)) return false; /* different data direction or already started, don't merge */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 3355f1cdd4e5..2994cfa44c8a 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -480,7 +480,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) vbd->flush_support = true; - if (q && blk_queue_secdiscard(q)) + if (q && blk_queue_secure_erase(q)) vbd->discard_secure = true; pr_debug("Successful creation of handle=%04x (dom=%u)\n", diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 343ef7abe5fd..10711292da2c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -545,7 +545,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf ring_req->u.discard.nr_sectors = blk_rq_sectors(req); ring_req->u.discard.id = id; ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); - if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard) ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; else ring_req->u.discard.flag = 0; @@ -841,7 +841,7 @@ static int blkif_queue_request(struct request *req, struct blkfront_ring_info *r return 1; if (unlikely(req_op(req) == REQ_OP_DISCARD || - req->cmd_flags & REQ_SECURE)) + req_op(req) == REQ_OP_SECURE_ERASE)) return blkif_queue_discard_req(req, rinfo); else return blkif_queue_rw_req(req, rinfo); @@ -955,7 +955,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, rq->limits.discard_granularity = info->discard_granularity; rq->limits.discard_alignment = info->discard_alignment; if (info->feature_secdiscard) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); + queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq); } /* Hard sector size and max sectors impersonate the equiv. hardware. */ @@ -1595,7 +1595,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); - queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); + queue_flag_clear(QUEUE_FLAG_SECERASE, rq); } blk_mq_complete_request(req, error); break; @@ -2052,10 +2052,14 @@ static int blkif_recover(struct blkfront_info *info) */ if (req_op(copy[i].request) == REQ_OP_FLUSH || req_op(copy[i].request) == REQ_OP_DISCARD || - copy[i].request->cmd_flags & (REQ_FUA | REQ_SECURE)) { + req_op(copy[i].request) == REQ_OP_SECURE_ERASE || + copy[i].request->cmd_flags & REQ_FUA) { /* * Flush operations don't contain bios, so * we need to requeue the whole request + * + * XXX: but this doesn't make any sense for a + * write with the FUA flag set.. */ list_add(©[i].request->queuelist, &requests); continue; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 10e53cd6a995..41d9c31da3b3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1058,7 +1058,6 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)); - const unsigned long do_sec = (bio->bi_rw & REQ_SECURE); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1376,7 +1375,7 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - bio_set_op_attrs(mbio, op, do_flush_fua | do_sync | do_sec); + bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 245640b50153..26ae74fd0d01 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1062,7 +1062,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); - const unsigned long do_sec = (bio->bi_rw & REQ_SECURE); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1362,7 +1361,7 @@ retry_write: rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec); + bio_set_op_attrs(mbio, op, do_sync | do_fua); mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1404,7 +1403,7 @@ retry_write: r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec); + bio_set_op_attrs(mbio, op, do_sync | do_fua); mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index bca20f88a8b2..383184743f9a 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -2167,10 +2167,12 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) /* complete ongoing async transfer before issuing discard */ if (card->host->areq) mmc_blk_issue_rw_rq(mq, NULL); - if (req->cmd_flags & REQ_SECURE) - ret = mmc_blk_issue_secdiscard_rq(mq, req); - else - ret = mmc_blk_issue_discard_rq(mq, req); + ret = mmc_blk_issue_discard_rq(mq, req); + } else if (req && req_op(req) == REQ_OP_SECURE_ERASE) { + /* complete ongoing async transfer before issuing secure erase*/ + if (card->host->areq) + mmc_blk_issue_rw_rq(mq, NULL); + ret = mmc_blk_issue_secdiscard_rq(mq, req); } else if (req && req_op(req) == REQ_OP_FLUSH) { /* complete ongoing async transfer before issuing flush */ if (card->host->areq) diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index c2d5f6f35145..bf14642a576a 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -171,7 +171,7 @@ static void mmc_queue_setup_discard(struct request_queue *q, if (card->pref_erase > max_discard) q->limits.discard_granularity = 0; if (mmc_can_secure_erase_trim(card)) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q); } /** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 562ab8301217..efba1f2ace2e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -163,7 +163,6 @@ enum rq_flag_bits { __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ - __REQ_SECURE, /* secure discard (used with REQ_OP_DISCARD) */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ @@ -212,7 +211,7 @@ enum rq_flag_bits { (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE) + REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE) #define REQ_CLONE_MASK REQ_COMMON_MASK /* This mask is used for both bio and request merge checking */ @@ -239,7 +238,6 @@ enum rq_flag_bits { #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) -#define REQ_SECURE (1ULL << __REQ_SECURE) #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) @@ -248,6 +246,7 @@ enum req_op { REQ_OP_READ, REQ_OP_WRITE, REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ REQ_OP_WRITE_SAME, /* write same block many times */ REQ_OP_FLUSH, /* request for cache flush */ }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0c9f8793c87e..53fee6123893 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -497,7 +497,7 @@ struct request_queue { #define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */ #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ -#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ +#define QUEUE_FLAG_SECERASE 17 /* supports secure erase */ #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ @@ -593,8 +593,8 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) -#define blk_queue_secdiscard(q) (blk_queue_discard(q) && \ - test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags)) +#define blk_queue_secure_erase(q) \ + (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ @@ -675,21 +675,6 @@ static inline bool rq_mergeable(struct request *rq) return true; } -static inline bool blk_check_merge_flags(unsigned int flags1, unsigned int op1, - unsigned int flags2, unsigned int op2) -{ - if ((op1 == REQ_OP_DISCARD) != (op2 == REQ_OP_DISCARD)) - return false; - - if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE)) - return false; - - if ((op1 == REQ_OP_WRITE_SAME) != (op2 == REQ_OP_WRITE_SAME)) - return false; - - return true; -} - static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) { if (bio_data(a) == bio_data(b)) @@ -1158,7 +1143,7 @@ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int op_flags, + sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 03b0dd98ff0e..af49caf973eb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1791,6 +1791,10 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) case REQ_OP_DISCARD: rwbs[i++] = 'D'; break; + case REQ_OP_SECURE_ERASE: + rwbs[i++] = 'D'; + rwbs[i++] = 'E'; + break; case REQ_OP_FLUSH: rwbs[i++] = 'F'; break; @@ -1809,8 +1813,6 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) rwbs[i++] = 'S'; if (rw & REQ_META) rwbs[i++] = 'M'; - if (rw & REQ_SECURE) - rwbs[i++] = 'E'; rwbs[i] = '\0'; } -- cgit v1.3-8-gc7d7 From b5227d03b7191a9a44bf75a4c228a6a9ddbe781b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 31 May 2016 16:23:02 -0500 Subject: timers: Clarify usleep_range() function comment Update the usleep_range() function comment to make it clear that it can only be used in non-atomic context. Previously we claimed usleep_range() was a drop-in replacement for udelay() where wakeup is flexible. But that's only true in non-atomic contexts, where it's possible to sleep instead of delay. Signed-off-by: Bjorn Helgaas Cc: John Stultz Link: http://lkml.kernel.org/r/20160531212302.28502.44995.stgit@bhelgaas-glaptop2.roam.corp.google.com Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..67dd6103003a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1702,9 +1702,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) } /** - * usleep_range - Drop in replacement for udelay where wakeup is flexible + * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range() instead of udelay(). The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep. */ void __sched usleep_range(unsigned long min, unsigned long max) { -- cgit v1.3-8-gc7d7 From 86721ab63b61ef1dd7305308e4049f644703decf Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Tue, 1 Mar 2016 22:58:49 +0530 Subject: hrtimer: Remove redundant #ifdef block Only need CONFIG_NO_HZ_COMMON as this block is already in a CONFIG_SMP block. Signed-off-by: Pratyush Patel Link: http://lkml.kernel.org/r/20160301172849.GA18152@cyborg Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e99df0ff1d42..d13c9aebf7a3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) -- cgit v1.3-8-gc7d7 From fe3464ca8710012a247bb4586dde21b080f88514 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Sat, 19 Mar 2016 21:59:19 +0800 Subject: genirq: Remove redundant NULL check of irq_desc for_each_irq_desc() macro has already skipped NULL irq_desc, don't bother to check it again. Signed-off-by: Jianyu Zhan Cc: mingo@kernel.org Cc: yhlu.kernel@gmail.com Link: http://lkml.kernel.org/r/1458395959-7046-1-git-send-email-nasa4836@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..50a8f28be247 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -421,12 +421,8 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for_each_irq_desc(irq, desc) { - if (!desc) - continue; - + for_each_irq_desc(irq, desc) register_irq_proc(irq, desc); - } } #ifdef CONFIG_GENERIC_IRQ_SHOW -- cgit v1.3-8-gc7d7 From ff5b706f5189fe8d2a6fd576b491b769ec1d29d3 Mon Sep 17 00:00:00 2001 From: Weongyo Jeong Date: Thu, 31 Mar 2016 12:15:03 -0700 Subject: genirq: Remove unnecessary memset() calls sprintf() and snprintf() implementation of kernel guarantees that its result is terminated with null byte if size is larger than 0. So we don't need to call memset() at all. Signed-off-by: Weongyo Jeong Link: http://lkml.kernel.org/r/1459451703-5744-1-git-send-email-weongyo.linux@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 50a8f28be247..f30425dce9dd 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) !name_unique(irq, action)) return; - memset(name, 0, MAX_NAMELEN); snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ @@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) if (desc->dir) goto out_unlock; - memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ @@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) #endif remove_proc_entry("spurious", desc->dir); - memset(name, 0, MAX_NAMELEN); sprintf(name, "%u", irq); remove_proc_entry(name, root_irq_dir); } -- cgit v1.3-8-gc7d7 From b62b2cf5759b0c2206ddff92226f1eb8ac8f9f13 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 7 Jun 2016 16:12:26 +0100 Subject: irqdomain: Fix handling of type settings for existing mappings When mapping an IRQ, it is possible that a mapping for the IRQ already exists. If mapping does exist then there are the following issues with regard to the handling of the IRQ type settings ... 1. If the domain is part of a hierarchy, then: a. We do not check that the type settings for the existing mapping match those of the new mapping. b. We do not check to see if the type settings have been programmed yet (and they might not have been) and so we may never set the type. 2. If the domain is NOT part of a hierarchy, we will overwrite the current type settings programmed if they are different from the previous mapping. Please note that irq_create_mapping() calls irq_find_mapping() to check if a mapping already exists. Although, it may be unlikely that the type settings for a shared interrupt would not match, nonetheless we should check for this. Therefore, to fix this check if a mapping exists (regardless of whether the domain is part of a hierarchy or not) and if it does then: 1. Return the IRQ number if the type settings match or are not specified. 2. Program the type settings and return the IRQ number if the type settings have not been programmed yet. 3. Otherwise if the type setting do not match, then print a warning and don't return the IRQ number. Furthermore, add a warning if the type return by irq_domain_translate() has bits outside the sense mask set and then clear these bits. If these bits are not cleared then this will cause the comparision of the type settings for an existing mapping to fail with that of the new mapping even if the sense bit themselves match. The reason being is that the existing type settings are read by calling irq_get_trigger_type() which will clear any bits outside the sense mask. This will allow us to detect irqchips that are not correctly clearing these bits and fix them. Signed-off-by: Jon Hunter Reviewed-by: Marc Zyngier Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..f3ff1eb8dd09 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -588,15 +588,42 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) if (irq_domain_translate(domain, fwspec, &hwirq, &type)) return 0; - if (irq_domain_is_hierarchy(domain)) { + /* + * WARN if the irqchip returns a type with bits + * outside the sense mask set and clear these bits. + */ + if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) + type &= IRQ_TYPE_SENSE_MASK; + + /* + * If we've already configured this interrupt, + * don't do it again, or hell will break loose. + */ + virq = irq_find_mapping(domain, hwirq); + if (virq) { /* - * If we've already configured this interrupt, - * don't do it again, or hell will break loose. + * If the trigger type is not specified or matches the + * current trigger type then we are done so return the + * interrupt number. */ - virq = irq_find_mapping(domain, hwirq); - if (virq) + if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) return virq; + /* + * If the trigger type has not been set yet, then set + * it now and return the interrupt number. + */ + if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { + irq_set_irq_type(virq, type); + return virq; + } + + pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", + hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); + return 0; + } + + if (irq_domain_is_hierarchy(domain)) { virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); if (virq <= 0) return 0; -- cgit v1.3-8-gc7d7 From 4b357daed698c95d6b5eacc1c3c4afa206071ba2 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 7 Jun 2016 16:12:27 +0100 Subject: genirq: Look-up trigger type if not specified by caller For some devices the IRQ trigger type for a device is read from firmware, such as device-tree. The IRQ trigger type is typically read when the mapping for IRQ is created, which is before the IRQ is requested. Hence, the IRQ trigger type is programmed when mapping the IRQ and not when requesting the IRQ. Although this works for most cases, in order to support IRQ chips which require runtime power management, which may not be accessible prior to requesting the IRQ, it is desirable to look-up the IRQ trigger type when it is requested. Therefore, if the IRQ trigger type is not specified when __setup_irq() is called, look-up the saved IRQ trigger type. This will allow us to defer the programming of the trigger type from when the IRQ is mapped to when it is actually requested. Signed-off-by: Jon Hunter Reviewed-by: Marc Zyngier Signed-off-by: Marc Zyngier --- kernel/irq/manage.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef0bc02c3a70..eaedeb74b49d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1116,6 +1116,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->irq = irq; + /* + * If the trigger type is not specified by the caller, + * then use the default for this interrupt. + */ + if (!(new->flags & IRQF_TRIGGER_MASK)) + new->flags |= irqd_get_trigger_type(&desc->irq_data); + /* * Check whether the interrupt nests into another interrupt * thread. -- cgit v1.3-8-gc7d7 From f35ad083783e8ed6ac030f5feb209f864875b413 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Jun 2016 10:39:44 +0100 Subject: genirq: Look-up percpu trigger type if not specified by caller As we now do for non-percpu interrupt, perform a lookup of the interrupt trigger if the user doesn't supply one. The difference here is that we can only do it at enable time (trigger configuration can be per-cpu as well). Signed-off-by: Marc Zyngier --- kernel/irq/manage.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eaedeb74b49d..f78b0846afb1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1737,7 +1737,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) if (!desc) return; + /* + * If the trigger type is not specified by the caller, then + * use the default for this interrupt. + */ type &= IRQ_TYPE_SENSE_MASK; + if (type == IRQ_TYPE_NONE) + type = irqd_get_trigger_type(&desc->irq_data); + if (type != IRQ_TYPE_NONE) { int ret; -- cgit v1.3-8-gc7d7 From 1e2a7d78499ec8859d2b469051b7b80bad3b08aa Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 7 Jun 2016 16:12:28 +0100 Subject: irqdomain: Don't set type when mapping an IRQ Some IRQ chips, such as GPIO controllers or secondary level interrupt controllers, may require require additional runtime power management control to ensure they are accessible. For such IRQ chips, it makes sense to enable the IRQ chip when interrupts are requested and disabled them again once all interrupts have been freed. When mapping an IRQ, the IRQ type settings are read and then programmed. The mapping of the IRQ happens before the IRQ is requested and so the programming of the type settings occurs before the IRQ is requested. This is a problem for IRQ chips that require additional power management control because they may not be accessible yet. Therefore, when mapping the IRQ, don't program the type settings, just save them and then program these saved settings when the IRQ is requested (so long as if they are not overridden via the call to request the IRQ). Add a stub function for irq_domain_free_irqs() to avoid any compilation errors when CONFIG_IRQ_DOMAIN_HIERARCHY is not selected. Signed-off-by: Jon Hunter Reviewed-by: Marc Zyngier Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 3 +++ kernel/irq/irqdomain.c | 23 ++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f1f36e04d885..317503763314 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -452,6 +452,9 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, return -1; } +static inline void irq_domain_free_irqs(unsigned int virq, + unsigned int nr_irqs) { } + static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return false; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f3ff1eb8dd09..caa6a63d26f0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { struct irq_domain *domain; + struct irq_data *irq_data; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; @@ -614,7 +615,11 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) * it now and return the interrupt number. */ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { - irq_set_irq_type(virq, type); + irq_data = irq_get_irq_data(virq); + if (!irq_data) + return 0; + + irqd_set_trigger_type(irq_data, type); return virq; } @@ -634,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) return virq; } - /* Set type if specified and different than the current one */ - if (type != IRQ_TYPE_NONE && - type != irq_get_trigger_type(virq)) - irq_set_irq_type(virq, type); + irq_data = irq_get_irq_data(virq); + if (!irq_data) { + if (irq_domain_is_hierarchy(domain)) + irq_domain_free_irqs(virq, 1); + else + irq_dispose_mapping(virq); + return 0; + } + + /* Store trigger type */ + irqd_set_trigger_type(irq_data, type); + return virq; } EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); -- cgit v1.3-8-gc7d7 From be45beb2df6909d42a6b3b0052601b3eef878fc0 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 7 Jun 2016 16:12:29 +0100 Subject: genirq: Add runtime power management support for IRQ chips Some IRQ chips may be located in a power domain outside of the CPU subsystem and hence will require device specific runtime power management. In order to support such IRQ chips, add a pointer for a device structure to the irq_chip structure, and if this pointer is populated by the IRQ chip driver and CONFIG_PM is selected in the kernel configuration, then the pm_runtime_get/put APIs for this chip will be called when an IRQ is requested/freed, respectively. Reviewed-by: Kevin Hilman Signed-off-by: Jon Hunter Signed-off-by: Marc Zyngier --- include/linux/irq.h | 4 ++++ kernel/irq/chip.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/irq/internals.h | 1 + kernel/irq/manage.c | 31 ++++++++++++++++++++++++++++++- 4 files changed, 75 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 4d758a7c604a..6c92a847394d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -315,6 +315,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) /** * struct irq_chip - hardware interrupt chip descriptor * + * @parent_device: pointer to parent device for irqchip * @name: name for /proc/interrupts * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) @@ -354,6 +355,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @flags: chip specific flags */ struct irq_chip { + struct device *parent_device; const char *name; unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); @@ -488,6 +490,8 @@ extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); +extern int irq_chip_pm_get(struct irq_data *data); +extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2f9f2b0e79f2..ad8131473774 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1093,3 +1093,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } + +/** + * irq_chip_pm_get - Enable power for an IRQ chip + * @data: Pointer to interrupt specific data + * + * Enable the power to the IRQ chip referenced by the interrupt data + * structure. + */ +int irq_chip_pm_get(struct irq_data *data) +{ + int retval; + + if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { + retval = pm_runtime_get_sync(data->chip->parent_device); + if (retval < 0) { + pm_runtime_put_noidle(data->chip->parent_device); + return retval; + } + } + + return 0; +} + +/** + * irq_chip_pm_put - Disable power for an IRQ chip + * @data: Pointer to interrupt specific data + * + * Disable the power to the IRQ chip referenced by the interrupt data + * structure, belongs. Note that power will only be disabled, once this + * function has been called for all IRQs that have called irq_chip_pm_get(). + */ +int irq_chip_pm_put(struct irq_data *data) +{ + int retval = 0; + + if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) + retval = pm_runtime_put(data->chip->parent_device); + + return (retval < 0) ? retval : 0; +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..d5edcdc9382a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -7,6 +7,7 @@ */ #include #include +#include #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f78b0846afb1..00cfc852cca8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1416,10 +1416,18 @@ int setup_irq(unsigned int irq, struct irqaction *act) if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); + if (retval) + irq_chip_pm_put(&desc->irq_data); + return retval; } EXPORT_SYMBOL_GPL(setup_irq); @@ -1513,6 +1521,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; @@ -1655,11 +1664,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); if (retval) { + irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } @@ -1836,6 +1850,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ unregister_handler_proc(irq, action); + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; @@ -1898,10 +1913,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); + if (retval) + irq_chip_pm_put(&desc->irq_data); + return retval; } @@ -1945,12 +1968,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->percpu_dev_id = dev_id; + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); - if (retval) + if (retval) { + irq_chip_pm_put(&desc->irq_data); kfree(action); + } return retval; } -- cgit v1.3-8-gc7d7 From 678309117768e25751594a48a2d873b0552a3130 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 2 Jun 2016 13:20:32 +0100 Subject: PM / Hibernate: Don't let kasan instrument snapshot.c Kasan causes the compiler to instrument C code and is used at runtime to detect accesses to memory that has been freed, or not yet allocated. The code in snapshot.c saves and restores memory when hibernating. This will access whole pages in the slab cache that have both free and allocated areas, resulting in a large number of false positives from Kasan. Disable instrumentation of this file. Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Rafael J. Wysocki --- kernel/power/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index cb880a14cc39..eb4f717705ba 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +KASAN_SANITIZE_snapshot.o := n + obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o -- cgit v1.3-8-gc7d7 From 3d89e5478bf550a50c99e93adf659369798263b0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jun 2016 18:32:45 +0800 Subject: sched/cputime: Fix prev steal time accouting during CPU hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit: e9532e69b8d1 ("sched/cputime: Fix steal time accounting vs. CPU hotplug") ... set rq->prev_* to 0 after a CPU hotplug comes back, in order to fix the case where (after CPU hotplug) steal time is smaller than rq->prev_steal_time. However, this should never happen. Steal time was only smaller because of the KVM-specific bug fixed by the previous patch. Worse, the previous patch triggers a bug on CPU hot-unplug/plug operation: because rq->prev_steal_time is cleared, all of the CPU's past steal time will be accounted again on hot-plug. Since the root cause has been fixed, we can just revert commit e9532e69b8d1. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Fixes: 'commit e9532e69b8d1 ("sched/cputime: Fix steal time accounting vs. CPU hotplug")' Link: http://lkml.kernel.org/r/1465813966-3116-3-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/sched.h | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13d0896aff87..c1b537bc4b90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7227,7 +7227,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; - account_reset_rq(rq); update_max_interval(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..de607e4febd9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1809,16 +1809,3 @@ static inline void cpufreq_trigger_update(u64 time) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif - -static inline void account_reset_rq(struct rq *rq) -{ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - rq->prev_irq_time = 0; -#endif -#ifdef CONFIG_PARAVIRT - rq->prev_steal_time = 0; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - rq->prev_steal_time_rq = 0; -#endif -} -- cgit v1.3-8-gc7d7 From 807e5b80687c06715d62df51a5473b231e3e8b15 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jun 2016 18:32:46 +0800 Subject: sched/cputime: Add steal time support to full dynticks CPU time accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds guest steal-time support to full dynticks CPU time accounting. After the following commit: ff9a9b4c4334 ("sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity") ... time sampling became jiffy based, even if we do the sampling from the context tracking code, so steal_account_process_tick() can be reused to account how many 'ticks' are stolen-time, after the last accumulation. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1465813966-3116-4-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..3d60e5d76fdb 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline bool steal_account_process_tick(void) +static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { @@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void) * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_jiffies = nsecs_to_jiffies(steal); + steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies); this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); account_steal_time(jiffies_to_cputime(steal_jiffies)); return steal_jiffies; } #endif - return false; + return 0; } /* @@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; - if (steal_account_process_tick()) + if (steal_account_process_tick(ULONG_MAX)) return; cputime *= ticks; @@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick()) + if (steal_account_process_tick(ULONG_MAX)) return; if (user_tick) @@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta = now - tsk->vtime_snap; + unsigned long delta_jiffies, steal_jiffies; + delta_jiffies = now - tsk->vtime_snap; + steal_jiffies = steal_account_process_tick(delta_jiffies); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta); + return jiffies_to_cputime(delta_jiffies - steal_jiffies); } static void __vtime_account_system(struct task_struct *tsk) -- cgit v1.3-8-gc7d7 From 2c95afc1e83d93fac3be6923465e1753c2c53b0a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jun 2016 06:14:38 -0700 Subject: perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86 The NMI watchdog uses either the fixed cycles or a generic cycles counter. This causes a lot of conflicts with users of the PMU who want to run a full group including the cycles fixed counter, for example the --topdown support recently added to perf stat. The code needs to fall back to not use groups, which can cause measurement inaccuracy due to multiplexing errors. This patch switches the NMI watchdog to use reference cycles on Intel systems. This is actually more accurate than cycles, because cycles can tick faster than the measured CPU Frequency due to Turbo mode. The ref cycles always tick at their frequency, or slower when the system is idling. That means the NMI watchdog can never expire too early, unlike with cycles. The reference cycles tick roughly at the frequency of the TSC, so the same period computation can be used. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1465478079-19993-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 8 ++++++++ include/linux/nmi.h | 1 + kernel/watchdog.c | 7 +++++++ 3 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 7788ce643bf4..016f4263fad4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,8 +18,16 @@ #include #include #include +#include #ifdef CONFIG_HARDLOCKUP_DETECTOR +int hw_nmi_get_event(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return PERF_COUNT_HW_REF_CPU_CYCLES; + return PERF_COUNT_HW_CPU_CYCLES; +} + u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 4630eeae18e0..79858af27209 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -66,6 +66,7 @@ static inline bool trigger_allbutself_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh); +int hw_nmi_get_event(void); extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..8dd30fcd91be 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -315,6 +315,12 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* Can be overriden by architecture */ +__weak int hw_nmi_get_event(void) +{ + return PERF_COUNT_HW_CPU_CYCLES; +} + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -604,6 +610,7 @@ static int watchdog_nmi_enable(unsigned int cpu) wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + wd_attr->config = hw_nmi_get_event(); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -- cgit v1.3-8-gc7d7 From 1f03e8d2919270bd6ef64f39a45ce8df8a9f012a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Apr 2016 10:57:12 +0200 Subject: locking/barriers: Replace smp_cond_acquire() with smp_cond_load_acquire() This new form allows using hardware assisted waiting. Some hardware (ARM64 and x86) allow monitoring an address for changes, so by providing a pointer we can use this to replace the cpu_relax() with hardware optimized methods in the future. Requested-by: Will Deacon Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 25 +++++++++++++++++++------ kernel/locking/qspinlock.c | 12 ++++++------ kernel/sched/core.c | 8 ++++---- kernel/sched/sched.h | 2 +- kernel/smp.c | 2 +- 5 files changed, 31 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 06f27fd9d760..2bcaedc0f032 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -305,21 +305,34 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s }) /** - * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering + * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering + * @ptr: pointer to the variable to wait on * @cond: boolean expression to wait for * * Equivalent to using smp_load_acquire() on the condition variable but employs * the control dependency of the wait to reduce the barrier on many platforms. * + * Due to C lacking lambda expressions we load the value of *ptr into a + * pre-named variable @VAL to be used in @cond. + * * The control dependency provides a LOAD->STORE order, the additional RMB * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order, * aka. ACQUIRE. */ -#define smp_cond_acquire(cond) do { \ - while (!(cond)) \ - cpu_relax(); \ - smp_rmb(); /* ctrl + rmb := acquire */ \ -} while (0) +#ifndef smp_cond_load_acquire +#define smp_cond_load_acquire(ptr, cond_expr) ({ \ + typeof(ptr) __PTR = (ptr); \ + typeof(*ptr) VAL; \ + for (;;) { \ + VAL = READ_ONCE(*__PTR); \ + if (cond_expr) \ + break; \ + cpu_relax(); \ + } \ + smp_rmb(); /* ctrl + rmb := acquire */ \ + VAL; \ +}) +#endif #endif /* __KERNEL__ */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2f9153b183c9..1b8dda90ebfa 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -475,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); + smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -562,7 +562,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_acquire() call. As the next PV queue head hasn't been + * smp_cond_load_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -573,7 +573,7 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* @@ -593,9 +593,9 @@ locked: break; } /* - * The smp_cond_acquire() call above has provided the necessary - * acquire semantics required for locking. At most two - * iterations of this loop may be ran. + * The smp_cond_load_acquire() call above has provided the + * necessary acquire semantics required for locking. At most + * two iterations of this loop may be ran. */ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 017d5394f5dc..5cd6931cb2cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1935,7 +1935,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * chain to provide order. Instead we do: * * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_acquire(!X->on_cpu) + * 2) smp_cond_load_acquire(!X->on_cpu) * * Example: * @@ -1946,7 +1946,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * sched-out X * smp_store_release(X->on_cpu, 0); * - * smp_cond_acquire(!X->on_cpu); + * smp_cond_load_acquire(&X->on_cpu, !VAL); * X->state = WAKING * set_task_cpu(X,2) * @@ -1972,7 +1972,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * This means that any means of doing remote wakeups must order the CPU doing * the wakeup against the CPU the task is going to end up running on. This, * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). * */ @@ -2045,7 +2045,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. */ - smp_cond_acquire(!p->on_cpu); + smp_cond_load_acquire(&p->on_cpu, !VAL); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..425bf5ddaa5a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1113,7 +1113,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the smp_cond_acquire() in try_to_wake_up(). + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif diff --git a/kernel/smp.c b/kernel/smp.c index 74165443c240..36552beed397 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -107,7 +107,7 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(struct call_single_data *csd) { - smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); + smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } static __always_inline void csd_lock(struct call_single_data *csd) -- cgit v1.3-8-gc7d7 From 33ac279677dcc2441cb93d8cb9cf7a74df62814d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2016 13:17:12 +0200 Subject: locking/barriers: Introduce smp_acquire__after_ctrl_dep() Introduce smp_acquire__after_ctrl_dep(), this construct is not uncommon, but the lack of this barrier is. Use it to better express smp_rmb() uses in WRITE_ONCE(), the IPC semaphore code and the qspinlock code. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 17 ++++++++++++----- ipc/sem.c | 14 ++------------ kernel/locking/qspinlock.c | 2 +- 3 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 2bcaedc0f032..59a7004fc7dd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -304,6 +304,17 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __u.__val; \ }) +/** + * smp_acquire__after_ctrl_dep() - Provide ACQUIRE ordering after a control dependency + * + * A control dependency provides a LOAD->STORE order, the additional RMB + * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order, + * aka. (load)-ACQUIRE. + * + * Architectures that do not do load speculation can have this be barrier(). + */ +#define smp_acquire__after_ctrl_dep() smp_rmb() + /** * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering * @ptr: pointer to the variable to wait on @@ -314,10 +325,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * * Due to C lacking lambda expressions we load the value of *ptr into a * pre-named variable @VAL to be used in @cond. - * - * The control dependency provides a LOAD->STORE order, the additional RMB - * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order, - * aka. ACQUIRE. */ #ifndef smp_cond_load_acquire #define smp_cond_load_acquire(ptr, cond_expr) ({ \ @@ -329,7 +336,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s break; \ cpu_relax(); \ } \ - smp_rmb(); /* ctrl + rmb := acquire */ \ + smp_acquire__after_ctrl_dep(); \ VAL; \ }) #endif diff --git a/ipc/sem.c b/ipc/sem.c index b3757ea0694b..84dff3df11a4 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -259,16 +259,6 @@ static void sem_rcu_free(struct rcu_head *head) ipc_rcu_free(head); } -/* - * spin_unlock_wait() and !spin_is_locked() are not memory barriers, they - * are only control barriers. - * The code must pair with spin_unlock(&sem->lock) or - * spin_unlock(&sem_perm.lock), thus just the control barrier is insufficient. - * - * smp_rmb() is sufficient, as writes cannot pass the control barrier. - */ -#define ipc_smp_acquire__after_spin_is_unlocked() smp_rmb() - /* * Wait until all currently ongoing simple ops have completed. * Caller must own sem_perm.lock. @@ -292,7 +282,7 @@ static void sem_wait_array(struct sem_array *sma) sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); } - ipc_smp_acquire__after_spin_is_unlocked(); + smp_acquire__after_ctrl_dep(); } /* @@ -350,7 +340,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * complex_count++; * spin_unlock(sem_perm.lock); */ - ipc_smp_acquire__after_spin_is_unlocked(); + smp_acquire__after_ctrl_dep(); /* * Now repeat the test of complex_count: diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 1b8dda90ebfa..730655533440 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -379,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock) cpu_relax(); done: - smp_rmb(); /* CTRL + RMB -> ACQUIRE */ + smp_acquire__after_ctrl_dep(); } EXPORT_SYMBOL(queued_spin_unlock_wait); #endif -- cgit v1.3-8-gc7d7 From be3e7844980352756de4261b276ee2ba5be7a26b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2016 14:45:21 +0200 Subject: locking/spinlock: Update spin_unlock_wait() users With the modified semantics of spin_unlock_wait() a number of explicit barriers can be removed. Also update the comment for the do_exit() usecase, as that was somewhat stale/obscure. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- ipc/sem.c | 1 - kernel/exit.c | 8 ++++++-- kernel/task_work.c | 1 - 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/ipc/sem.c b/ipc/sem.c index 84dff3df11a4..ae72b3cddc8d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -282,7 +282,6 @@ static void sem_wait_array(struct sem_array *sma) sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); } - smp_acquire__after_ctrl_dep(); } /* diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..0b40791b9e70 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -700,10 +700,14 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * Ensure that all new tsk->pi_lock acquisitions must observe + * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). */ smp_mb(); + /* + * Ensure that we must observe the pi_state in exit_mm() -> + * mm_release() -> exit_pi_state_list(). + */ raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) { diff --git a/kernel/task_work.c b/kernel/task_work.c index 53fa971d000d..6ab4842b00e8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -108,7 +108,6 @@ void task_work_run(void) * fail, but it can play with *work and other entries. */ raw_spin_unlock_wait(&task->pi_lock); - smp_mb(); do { next = work->next; -- cgit v1.3-8-gc7d7 From 2f275de5d1ed7269913ef9b4c64a13952c0a38e8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 27 May 2016 12:57:02 -0700 Subject: seccomp: Add a seccomp_data parameter secure_computing() Currently, if arch code wants to supply seccomp_data directly to seccomp (which is generally much faster than having seccomp do it using the syscall_get_xyz() API), it has to use the two-phase seccomp hooks. Add it to the easy hooks, too. Cc: linux-arch@vger.kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Kees Cook --- arch/arm/kernel/ptrace.c | 2 +- arch/arm64/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace.c | 2 +- arch/parisc/kernel/ptrace.c | 2 +- arch/powerpc/kernel/ptrace.c | 2 +- arch/s390/kernel/ptrace.c | 2 +- arch/tile/kernel/ptrace.c | 2 +- arch/um/kernel/skas/syscall.c | 2 +- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- include/linux/seccomp.h | 8 ++++---- kernel/seccomp.c | 4 ++-- 11 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 4d9375814b53..1027d3b54541 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -934,7 +934,7 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno) /* Do the secure computing check first; failures should be fast. */ #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER - if (secure_computing() == -1) + if (secure_computing(NULL) == -1) return -1; #else /* XXX: remove this once OABI gets fixed */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 3f6cd5c5234f..6e2cf046615d 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1247,7 +1247,7 @@ static void tracehook_report_syscall(struct pt_regs *regs, asmlinkage int syscall_trace_enter(struct pt_regs *regs) { /* Do the secure computing check first; failures should be fast. */ - if (secure_computing() == -1) + if (secure_computing(NULL) == -1) return -1; if (test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 0dcf69194473..c50af846ecf9 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -893,7 +893,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) current_thread_info()->syscall = syscall; - if (secure_computing() == -1) + if (secure_computing(NULL) == -1) return -1; if (test_thread_flag(TIF_SYSCALL_TRACE) && diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index b5458b37fc5b..8edc47c0b98e 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -312,7 +312,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, long do_syscall_trace_enter(struct pt_regs *regs) { /* Do the secure computing check first. */ - if (secure_computing() == -1) + if (secure_computing(NULL) == -1) return -1; if (test_thread_flag(TIF_SYSCALL_TRACE) && diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 30a03c03fe73..ed799e994773 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1783,7 +1783,7 @@ static int do_seccomp(struct pt_regs *regs) * have already loaded -ENOSYS into r3, or seccomp has put * something else in r3 (via SECCOMP_RET_ERRNO/TRACE). */ - if (__secure_computing()) + if (__secure_computing(NULL)) return -1; /* diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 49b1c13bf6c9..c238e9958c2a 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -824,7 +824,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) long ret = 0; /* Do the secure computing check first. */ - if (secure_computing()) { + if (secure_computing(NULL)) { /* seccomp failures shouldn't expose any additional code. */ ret = -1; goto out; diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c index 54e7b723db99..8c6d2f2fefa3 100644 --- a/arch/tile/kernel/ptrace.c +++ b/arch/tile/kernel/ptrace.c @@ -255,7 +255,7 @@ int do_syscall_trace_enter(struct pt_regs *regs) { u32 work = ACCESS_ONCE(current_thread_info()->flags); - if (secure_computing() == -1) + if (secure_computing(NULL) == -1) return -1; if (work & _TIF_SYSCALL_TRACE) { diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 48b0dcbd87be..9c5570f0f397 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -21,7 +21,7 @@ void handle_syscall(struct uml_pt_regs *r) PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); /* Do the secure computing check first; failures should be fast. */ - if (secure_computing() == -1) + if (secure_computing(NULL) == -1) return; if (syscall_trace_enter(regs)) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 174c2549939d..85acde5fa442 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -207,7 +207,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; - tmp = secure_computing(); + tmp = secure_computing(NULL); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 2296e6b2f690..9eaa7b34d6da 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -28,11 +28,11 @@ struct seccomp { }; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -extern int __secure_computing(void); -static inline int secure_computing(void) +extern int __secure_computing(const struct seccomp_data *sd); +static inline int secure_computing(const struct seccomp_data *sd) { if (unlikely(test_thread_flag(TIF_SECCOMP))) - return __secure_computing(); + return __secure_computing(sd); return 0; } @@ -61,7 +61,7 @@ struct seccomp { }; struct seccomp_filter { }; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -static inline int secure_computing(void) { return 0; } +static inline int secure_computing(struct seccomp_data *sd) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7002796f14a4..06816290a212 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -554,9 +554,9 @@ void secure_computing_strict(int this_syscall) BUG(); } #else -int __secure_computing(void) +int __secure_computing(const struct seccomp_data *sd) { - u32 phase1_result = seccomp_phase1(NULL); + u32 phase1_result = seccomp_phase1(sd); if (likely(phase1_result == SECCOMP_PHASE1_OK)) return 0; -- cgit v1.3-8-gc7d7 From 8112c4f140fa03f9ee68aad2cc79afa7df5418d3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 1 Jun 2016 16:02:17 -0700 Subject: seccomp: remove 2-phase API Since nothing is using the 2-phase API, and it adds more complexity than benefit, remove it. Signed-off-by: Kees Cook Cc: Andy Lutomirski --- include/linux/seccomp.h | 6 --- kernel/seccomp.c | 129 +++++++++++++++--------------------------------- 2 files changed, 41 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 9eaa7b34d6da..ecc296c137cd 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -35,12 +35,6 @@ static inline int secure_computing(const struct seccomp_data *sd) return __secure_computing(sd); return 0; } - -#define SECCOMP_PHASE1_OK 0 -#define SECCOMP_PHASE1_SKIP 1 - -extern u32 seccomp_phase1(struct seccomp_data *sd); -int seccomp_phase2(u32 phase1_result); #else extern void secure_computing_strict(int this_syscall); #endif diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 06816290a212..14a37d71b612 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(struct seccomp_data *sd) +static u32 seccomp_run_filters(const struct seccomp_data *sd) { struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; @@ -554,20 +554,9 @@ void secure_computing_strict(int this_syscall) BUG(); } #else -int __secure_computing(const struct seccomp_data *sd) -{ - u32 phase1_result = seccomp_phase1(sd); - - if (likely(phase1_result == SECCOMP_PHASE1_OK)) - return 0; - else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) - return -1; - else - return seccomp_phase2(phase1_result); -} #ifdef CONFIG_SECCOMP_FILTER -static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd) { u32 filter_ret, action; int data; @@ -599,10 +588,33 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) goto skip; case SECCOMP_RET_TRACE: - return filter_ret; /* Save the rest for phase 2. */ + /* ENOSYS these calls if there is no tracer attached. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, + task_pt_regs(current), + -ENOSYS, 0); + goto skip; + } + + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification. + * Terminating the task now avoids executing a system + * call that may not be intended. + */ + if (fatal_signal_pending(current)) + do_exit(SIGSYS); + /* Check if the tracer forced the syscall to be skipped. */ + this_syscall = syscall_get_nr(current, task_pt_regs(current)); + if (this_syscall < 0) + goto skip; + + return 0; case SECCOMP_RET_ALLOW: - return SECCOMP_PHASE1_OK; + return 0; case SECCOMP_RET_KILL: default: @@ -614,96 +626,37 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) skip: audit_seccomp(this_syscall, 0, action); - return SECCOMP_PHASE1_SKIP; + return -1; +} +#else +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd) +{ + BUG(); } #endif -/** - * seccomp_phase1() - run fast path seccomp checks on the current syscall - * @arg sd: The seccomp_data or NULL - * - * This only reads pt_regs via the syscall_xyz helpers. The only change - * it will make to pt_regs is via syscall_set_return_value, and it will - * only do that if it returns SECCOMP_PHASE1_SKIP. - * - * If sd is provided, it will not read pt_regs at all. - * - * It may also call do_exit or force a signal; these actions must be - * safe. - * - * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should - * be processed normally. - * - * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be - * invoked. In this case, seccomp_phase1 will have set the return value - * using syscall_set_return_value. - * - * If it returns anything else, then the return value should be passed - * to seccomp_phase2 from a context in which ptrace hooks are safe. - */ -u32 seccomp_phase1(struct seccomp_data *sd) +int __secure_computing(const struct seccomp_data *sd) { int mode = current->seccomp.mode; - int this_syscall = sd ? sd->nr : - syscall_get_nr(current, task_pt_regs(current)); + int this_syscall; if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) - return SECCOMP_PHASE1_OK; + return 0; + + this_syscall = sd ? sd->nr : + syscall_get_nr(current, task_pt_regs(current)); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ - return SECCOMP_PHASE1_OK; -#ifdef CONFIG_SECCOMP_FILTER + return 0; case SECCOMP_MODE_FILTER: - return __seccomp_phase1_filter(this_syscall, sd); -#endif + return __seccomp_filter(this_syscall, sd); default: BUG(); } } - -/** - * seccomp_phase2() - finish slow path seccomp work for the current syscall - * @phase1_result: The return value from seccomp_phase1() - * - * This must be called from a context in which ptrace hooks can be used. - * - * Returns 0 if the syscall should be processed or -1 to skip the syscall. - */ -int seccomp_phase2(u32 phase1_result) -{ - struct pt_regs *regs = task_pt_regs(current); - u32 action = phase1_result & SECCOMP_RET_ACTION; - int data = phase1_result & SECCOMP_RET_DATA; - - BUG_ON(action != SECCOMP_RET_TRACE); - - audit_seccomp(syscall_get_nr(current, regs), 0, action); - - /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); - return -1; - } - - /* Allow the BPF to provide the event message */ - ptrace_event(PTRACE_EVENT_SECCOMP, data); - /* - * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. - */ - if (fatal_signal_pending(current)) - do_exit(SIGSYS); - if (syscall_get_nr(current, regs) < 0) - return -1; /* Explicit request to skip. */ - - return 0; -} #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) -- cgit v1.3-8-gc7d7 From ce6526e8afa4b6ad0ab134a4cc50c9c863319637 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 1 Jun 2016 19:29:15 -0700 Subject: seccomp: recheck the syscall after RET_TRACE When RET_TRACE triggers, a tracer may change a syscall into something that should be filtered by seccomp. This re-runs seccomp after a trace event to make sure things continue to pass. Signed-off-by: Kees Cook Cc: Andy Lutomirski --- kernel/seccomp.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 14a37d71b612..54d15eb2b701 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -556,7 +556,8 @@ void secure_computing_strict(int this_syscall) #else #ifdef CONFIG_SECCOMP_FILTER -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd) +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, + const bool recheck_after_trace) { u32 filter_ret, action; int data; @@ -588,6 +589,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd) goto skip; case SECCOMP_RET_TRACE: + /* We've been put in this state by the ptracer already. */ + if (recheck_after_trace) + return 0; + /* ENOSYS these calls if there is no tracer attached. */ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { syscall_set_return_value(current, @@ -611,6 +616,15 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd) if (this_syscall < 0) goto skip; + /* + * Recheck the syscall, since it may have changed. This + * intentionally uses a NULL struct seccomp_data to force + * a reload of all registers. This does not goto skip since + * a skip would have already been reported. + */ + if (__seccomp_filter(this_syscall, NULL, true)) + return -1; + return 0; case SECCOMP_RET_ALLOW: @@ -629,7 +643,8 @@ skip: return -1; } #else -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd) +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, + const bool recheck_after_trace) { BUG(); } @@ -652,7 +667,7 @@ int __secure_computing(const struct seccomp_data *sd) __secure_computing_strict(this_syscall); /* may call do_exit */ return 0; case SECCOMP_MODE_FILTER: - return __seccomp_filter(this_syscall, sd); + return __seccomp_filter(this_syscall, sd, false); default: BUG(); } -- cgit v1.3-8-gc7d7 From 0d95092ccba1a30b74fa52ff94ec5415e63744a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Apr 2016 05:00:03 -0700 Subject: rcu: Fix outdated rcu_scheduler_active comment The comment header for rcu_scheduler_active states that it is used to optimize synchronize_sched() at early boot. This is incorrect. The synchronize_sched() function instead checks the number of online CPUs. This commit therefore replaces the comment's synchronize_sched() with synchronize_rcu(), which really does use rcu_scheduler_active for this purpose. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..0fa692f1094e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -130,7 +130,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_rcu() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. -- cgit v1.3-8-gc7d7 From 590d1757b9d177e7fe3707963d0209d6eefbc746 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Apr 2016 08:23:24 -0700 Subject: rcu: Fix outdated hotplug-exclusion comment in rcu_gp_init() In the past, RCU grace-period initialization excluded CPU-hotplug operations, but this is no longer the case. This commit therefore removed an outdated comment in rcu_gp_init() claiming that these are excluded. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0fa692f1094e..6043c14165d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1989,8 +1989,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * of the tree within the rsp->node[] array. Note that other CPUs * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. + * leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. -- cgit v1.3-8-gc7d7 From d3acab65f274800dd0901f0816f8bca9f2a8c8ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Mar 2016 09:49:04 +0100 Subject: rcu: Remove some superfluous lines I think you'll find this condition is superfluous, as the whole function is under #ifdef of that same. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6043c14165d1..4aefeafb9a95 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4363,9 +4363,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ -- cgit v1.3-8-gc7d7 From 3549c2bc2c4ea8ecfeb9d21cb81cb00c6002b011 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Apr 2016 16:35:29 -0700 Subject: rcu: Move expedited code from tree.c to tree_exp.h People have been having some difficulty finding their way around the RCU code. This commit therefore pulls some of the expedited grace-period code from tree.c to a new tree_exp.h file. This commit is strictly code movement, with the exception of a forward declaration that was added for the sync_sched_exp_online_cleanup() function. A subsequent commit will move the remaining expedited grace-period code from tree_plugin.h to tree_exp.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 545 +----------------------------------------------- kernel/rcu/tree_exp.h | 564 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 566 insertions(+), 543 deletions(-) create mode 100644 kernel/rcu/tree_exp.h (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4aefeafb9a95..c844b6142a86 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -159,6 +159,7 @@ static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake); +static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -3447,549 +3448,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } -/* Wrapper functions for expedited grace periods. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) -{ - rcu_seq_start(&rsp->expedited_sequence); -} -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) -{ - rcu_seq_end(&rsp->expedited_sequence); - smp_mb(); /* Ensure that consecutive grace periods serialize. */ -} -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) -{ - unsigned long s; - - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = rcu_seq_snap(&rsp->expedited_sequence); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - return s; -} -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -{ - return rcu_seq_done(&rsp->expedited_sequence, s); -} - -/* - * Reset the ->expmaskinit values in the rcu_node tree to reflect any - * recent CPU-online activity. Note that these masks are not cleared - * when CPUs go offline, so they reflect the union of all CPUs that have - * ever been online. This means that this function normally takes its - * no-work-to-do fastpath. - */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) -{ - bool done; - unsigned long flags; - unsigned long mask; - unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); - struct rcu_node *rnp; - struct rcu_node *rnp_up; - - /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) - return; - rsp->ncpus_snap = ncpus; - - /* - * Each pass through the following loop propagates newly onlined - * CPUs for the current rcu_node structure up the rcu_node tree. - */ - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - continue; /* No new CPUs, nothing to do. */ - } - - /* Update this node's mask, track old value for propagation. */ - oldmask = rnp->expmaskinit; - rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* If was already nonzero, nothing to propagate. */ - if (oldmask) - continue; - - /* Propagate the new CPU up the tree. */ - mask = rnp->grpmask; - rnp_up = rnp->parent; - done = false; - while (rnp_up) { - raw_spin_lock_irqsave_rcu_node(rnp_up, flags); - if (rnp_up->expmaskinit) - done = true; - rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); - if (done) - break; - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - } - } -} - -/* - * Reset the ->expmask values in the rcu_node tree in preparation for - * a new expedited grace period. - */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_node *rnp; - - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * Return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the rcu_state's exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return rnp->exp_tasks == NULL && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. - */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - if (!rnp->expmask) - rcu_initiate_boost(rnp, flags); - else - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ - WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; - } -} - -/* - * Report expedited quiescent state for specified node. This is a - * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. - */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); -} - -/* - * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. - */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long mask, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ -} - -/* - * Report expedited quiescent state for specified rcu_data (CPU). - */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) -{ - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); -} - -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) -{ - if (rcu_exp_gp_seq_done(rsp, s)) { - trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); - return true; - } - return false; -} - -/* - * Funnel-lock acquisition for expedited grace periods. Returns true - * if some other task completed an expedited grace period that this task - * can piggy-back on, and with no mutex held. Otherwise, returns false - * with the mutex held, indicating that the caller must actually do the - * expedited grace period. - */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - /* Low-contention fastpath. */ - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && - (rnp == rnp_root || - ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - !mutex_is_locked(&rsp->exp_mutex) && - mutex_trylock(&rsp->exp_mutex)) - goto fastpath; - - /* - * Each pass through the following loop works its way up - * the rcu_node tree, returning if others have done the work or - * otherwise falls through to acquire rsp->exp_mutex. The mapping - * from CPU to rcu_node structure can be inexact, as it is just - * promoting locality and is not strictly needed for correctness. - */ - for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) - return true; - - /* Work not done, either wait here or go up. */ - spin_lock(&rnp->exp_lock); - if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { - - /* Someone else doing GP, so wait for them. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); - return true; - } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, - rnp->grphi, TPS("nxtlvl")); - } - mutex_lock(&rsp->exp_mutex); -fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { - mutex_unlock(&rsp->exp_mutex); - return true; - } - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - return false; -} - -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - return; - } - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; -retry_ipi: - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) -{ - int cpu; - unsigned long jiffies_stall; - unsigned long jiffies_start; - unsigned long mask; - int ndetected; - struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - int ret; - - jiffies_stall = rcu_jiffies_till_stall_check(); - jiffies_start = jiffies; - - for (;;) { - ret = swait_event_timeout( - rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) - return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); - ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - struct rcu_data *rdp; - - if (!(rnp->expmask & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); - pr_cont(" %d-%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); - } - mask <<= 1; - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (ndetected) { - pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, - ".T"[!!rnp->exp_tasks]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rsp, rnp) { - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(rnp->expmask & mask)) - continue; - dump_cpu_task(cpu); - } - } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; - } -} - -/* - * Wait for the current expedited grace period to complete, and then - * wake up everyone who piggybacked on the just-completed expedited - * grace period. Also update all the ->exp_seq_rq counters as needed - * in order to avoid counter-wrap problems. - */ -static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_node *rnp; - - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ - mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); - - rcu_for_each_node_breadth_first(rsp, rnp) { - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { - spin_lock(&rnp->exp_lock); - /* Recheck, avoid hang in case someone just arrived. */ - if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; - spin_unlock(&rnp->exp_lock); - } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); - } - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_wake_mutex); -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ - unsigned long s; - struct rcu_state *rsp = &rcu_sched_state; - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -4747,4 +4205,5 @@ void __init rcu_init(void) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } +#include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 000000000000..db0909cf7fe1 --- /dev/null +++ b/kernel/rcu/tree_exp.h @@ -0,0 +1,564 @@ +/* + * RCU expedited grace periods + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2016 + * + * Authors: Paul E. McKenney + */ + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the rcu_state's exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + swake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; + } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the rcu_state's exp_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; + } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. + */ +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; + + /* + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. + */ + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; + } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); + } + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *data) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = swait_event_timeout( + rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root), + jiffies_stall); + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + swait_event(rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", + rsp->name); + ndetected = 0; + rcu_for_each_leaf_node(rsp, rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + struct rcu_data *rdp; + + if (!(rnp->expmask & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + mask <<= 1; + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ + unsigned long s; + struct rcu_state *rsp = &rcu_sched_state; + + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v1.3-8-gc7d7 From 40e0a6cfd53e37d9b8863cdbc0adb1f72e9311e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Apr 2016 16:44:07 -0700 Subject: rcu: Move expedited code from tree_plugin.h to tree_exp.h People have been having some difficulty finding their way around the RCU code. This commit therefore pulls some of the expedited grace-period code from tree_plugin.h to a new tree_exp.h file. This commit is strictly code movement. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 88 --------------------------------------------- 2 files changed, 94 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index db0909cf7fe1..00a02a231ada 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -562,3 +562,97 @@ void synchronize_sched_expedited(void) rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; + + /* + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. + */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; + return; + } + + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); +} + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + */ +void synchronize_rcu_expedited(void) +{ + struct rcu_state *rsp = rcu_state_p; + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); + + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptible RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..695071dd1e9c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -681,84 +681,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Remote handler for smp_call_function_single(). If there is an - * RCU read-side critical section in effect, request that the - * next rcu_read_unlock() record the quiescent state up the - * ->expmask fields in the rcu_node tree. Otherwise, immediately - * report the quiescent state. - */ -static void sync_rcu_exp_handler(void *info) -{ - struct rcu_data *rdp; - struct rcu_state *rsp = info; - struct task_struct *t = current; - - /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. - */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; - return; - } - - /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. Either way, we can immediately report - * the quiescent state. - */ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); -} - -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - */ -void synchronize_rcu_expedited(void) -{ - struct rcu_state *rsp = rcu_state_p; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. * @@ -882,16 +804,6 @@ static void rcu_preempt_check_callbacks(void) { } -/* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. - */ -void synchronize_rcu_expedited(void) -{ - synchronize_sched_expedited(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). -- cgit v1.3-8-gc7d7 From f8cbdee99b161cb08c3fb55200941028c5fe25c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2016 13:03:18 -0700 Subject: torture: Simplify code, eliminate RCU_PERF_TEST_RUNNABLE This commit applies the infamous IS_ENABLED() macro to eliminate a #ifdef. It also eliminates the RCU_PERF_TEST_RUNNABLE Kconfig option in favor of the already-existing rcuperf.perf_runnable kernel boot parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 7 +------ lib/Kconfig.debug | 16 ---------------- 2 files changed, 1 insertion(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8393ed..afd174e901c3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -96,12 +96,7 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) -#define RCUPERF_RUNNABLE_INIT 1 -#else -#define RCUPERF_RUNNABLE_INIT 0 -#endif -static int perf_runnable = RCUPERF_RUNNABLE_INIT; +static int perf_runnable = IS_ENABLED(MODULE); module_param(perf_runnable, int, 0444); MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b9cfdbfae9aa..cf6ddcd8f70c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1307,22 +1307,6 @@ config RCU_PERF_TEST Say M if you want the RCU performance tests to build as a module. Say N if you are unsure. -config RCU_PERF_TEST_RUNNABLE - bool "performance tests for RCU runnable by default" - depends on RCU_PERF_TEST = y - default n - help - This option provides a way to build the RCU performance tests - directly into the kernel without them starting up at boot time. - You can use /sys/module to manually override this setting. - This /proc file is available only when the RCU performance - tests have been built into the kernel. - - Say Y here if you want the RCU performance tests to start during - boot (you probably don't). - Say N here if you want the RCU performance tests to start only - after being manually enabled via /sys/module. - config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL -- cgit v1.3-8-gc7d7 From 4e9a073f60367157fd64b65490654c39d4c44321 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2016 14:42:34 -0700 Subject: torture: Remove CONFIG_RCU_TORTURE_TEST_RUNNABLE, simplify code This commit removes CONFIG_RCU_TORTURE_TEST_RUNNABLE in favor of the already-existing rcutorture.torture_runnable kernel boot parameter. It also converts an #ifdef into IS_ENABLED(), saving a few lines of code. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 +------ kernel/rcu/tree_plugin.h | 2 -- lib/Kconfig.debug | 17 ----------------- 3 files changed, 1 insertion(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..01cb57ff106f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +static int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..5b2d723e6568 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) - pr_info("\tRCU torture testing starts during boot.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cf6ddcd8f70c..805b7048a1bd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1324,23 +1324,6 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_TORTURE_TEST_RUNNABLE - bool "torture tests for RCU runnable by default" - depends on RCU_TORTURE_TEST = y - default n - help - This option provides a way to build the RCU torture tests - directly into the kernel without them starting up at boot - time. You can use /proc/sys/kernel/rcutorture_runnable - to manually override this setting. This /proc file is - available only when the RCU torture tests have been built - into the kernel. - - Say Y here if you want the RCU torture tests to start during - boot (you probably don't). - Say N here if you want the RCU torture tests to start only - after being manually enabled via /proc. - config RCU_TORTURE_TEST_SLOW_PREINIT bool "Slow down RCU grace-period pre-initialization to expose races" depends on RCU_TORTURE_TEST -- cgit v1.3-8-gc7d7 From d95f5ba90fa6043a366958897fdef705af968b70 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Apr 2016 17:18:41 -0700 Subject: torture: Break online and offline functions out of torture_onoff() This commit breaks torture_online() and torture_offline() out of torture_onoff() in preparation for allowing waketorture finer-grained control of its CPU-hotplug workload. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 4 ++ kernel/torture.c | 168 ++++++++++++++++++++++++++++++------------------ 2 files changed, 108 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 7759fc3c622d..6685a73736a2 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -50,6 +50,10 @@ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) /* Definitions for online/offline exerciser. */ +bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_offl, int *min_onl, int *max_onl); +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl); int torture_onoff_init(long ooholdoff, long oointerval); void torture_onoff_stats(void); bool torture_onoff_failures(void); diff --git a/kernel/torture.c b/kernel/torture.c index fa0bdeee17ac..fb39a06bbef5 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -81,6 +81,104 @@ static unsigned long sum_online; static int min_online = -1; static int max_online; +/* + * Attempt to take a CPU offline. Return false if the CPU is already + * offline or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, + unsigned long *sum_offl, int *min_offl, int *max_offl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_offl_attempts)++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + (*n_offl_successes)++; + delta = jiffies - starttime; + sum_offl += delta; + if (*min_offl < 0) { + *min_offl = delta; + *max_offl = delta; + } + if (*min_offl > delta) + *min_offl = delta; + if (*max_offl < delta) + *max_offl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_offline); + +/* + * Attempt to bring a CPU online. Return false if the CPU is already + * online or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_onl_attempts)++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + (*n_onl_successes)++; + delta = jiffies - starttime; + *sum_onl += delta; + if (*min_onl < 0) { + *min_onl = delta; + *max_onl = delta; + } + if (*min_onl > delta) + *min_onl = delta; + if (*max_onl < delta) + *max_onl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_online); + /* * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. @@ -89,11 +187,8 @@ static int torture_onoff(void *arg) { int cpu; - unsigned long delta; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) @@ -106,67 +201,12 @@ torture_onoff(void *arg) } while (!torture_must_stop()) { cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } + if (!torture_offline(cpu, + &n_offline_attempts, &n_offline_successes, + &sum_offline, &min_offline, &max_offline)) + torture_online(cpu, + &n_online_attempts, &n_online_successes, + &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } torture_kthread_stopping("torture_onoff"); -- cgit v1.3-8-gc7d7 From 750db0f5f7d0ff6b86158015f02c275702639b20 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Mon, 2 May 2016 10:30:00 +0800 Subject: torture: Stop onoff task if there is only one cpu If the whole system has only one cpu, that cpu won't be able to be offlined, so there is no need onoff task is stil running. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/torture.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index fb39a06bbef5..75961b3decfe 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -194,6 +194,12 @@ torture_onoff(void *arg) for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + + if (maxcpu == 0) { + VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); + goto stop; + } + if (onoff_holdoff > 0) { VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); schedule_timeout_interruptible(onoff_holdoff); @@ -209,6 +215,8 @@ torture_onoff(void *arg) &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } + +stop: torture_kthread_stopping("torture_onoff"); return 0; } -- cgit v1.3-8-gc7d7 From af06d4f74a7d2132c805339bfd5ab771b5706f42 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 25 May 2016 09:25:33 +0800 Subject: rcuperf: Don't treat gp_exp mis-setting as a WARN 0day found a boot warning triggered in rcu_perf_writer() on !SMP kernel: WARN_ON(rcu_gp_is_normal() && gp_exp); , the root cause of which is trying to measure expedited grace periods(by setting gp_exp to true by default) when all the grace periods are normal(TINY RCU only has normal grace periods). However, such a mis-setting would only result in failing to measure the performance for a specific kind of grace periods, therefore using a WARN_ON to check this is a little overkilling. We could handle this inside rcuperf module via some error messages to tell users about the mis-settings. Therefore this patch removes the WARN_ON in rcu_perf_writer() and handles those checkings in rcu_perf_init() with plain if() code. Moreover, this patch changes the default value of gp_exp to 1) align with rcutorture tests and 2) make the default setting work for all RCU implementations by default. Suggested-by: Paul E. McKenney Signed-off-by: Boqun Feng Fixes: http://lkml.kernel.org/r/57411b10.mFvG0+AgcrMXGtcj%fengguang.wu@intel.com Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index afd174e901c3..7b2dbdffd791 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney "); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) -torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); @@ -358,8 +358,6 @@ rcu_perf_writer(void *arg) u64 *wdpp = writer_durations[me]; VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); - WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sp.sched_priority = 1; @@ -626,6 +624,16 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } + if (rcu_gp_is_normal() && gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), -- cgit v1.3-8-gc7d7 From 05dbbfe753792dcebb6b85d84fa5926f09723cfe Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 13 Jun 2016 15:20:39 +0000 Subject: rcutorture: Fix error return code in rcu_perf_init() Fix to return a negative error code -ENOMEM from kcalloc() error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7b2dbdffd791..d38ab08a3fe7 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -638,8 +638,10 @@ rcu_perf_init(void) writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), GFP_KERNEL); - if (!writer_durations[i]) + if (!writer_durations[i]) { + firsterr = -ENOMEM; goto unwind; + } firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, writer_tasks[i]); if (firsterr) -- cgit v1.3-8-gc7d7 From ca5f2b4c4fb7bb7397317ee2ead83485aa295a3e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 14 Jun 2016 16:23:22 +0200 Subject: PM / sleep: Make pm_prepare_console() return void Nothing is using its return value so change it to return void. No functionality change. Signed-off-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 5 ++--- kernel/power/console.c | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 8b6ec7ef0854..7693e39b14fe 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -18,12 +18,11 @@ static inline void pm_set_vt_switch(int do_switch) #endif #ifdef CONFIG_VT_CONSOLE_SLEEP -extern int pm_prepare_console(void); +extern void pm_prepare_console(void); extern void pm_restore_console(void); #else -static inline int pm_prepare_console(void) +static inline void pm_prepare_console(void) { - return 0; } static inline void pm_restore_console(void) diff --git a/kernel/power/console.c b/kernel/power/console.c index aba9c545a0e3..0e781798b0b3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -126,17 +126,17 @@ out: return ret; } -int pm_prepare_console(void) +void pm_prepare_console(void) { if (!pm_vt_switch()) - return 0; + return; orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); if (orig_fgconsole < 0) - return 1; + return; orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); - return 0; + return; } void pm_restore_console(void) -- cgit v1.3-8-gc7d7 From 4929c913bda505dbe44bb42c00da06011fee6c9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2016 11:58:56 -0700 Subject: rcu: Make call_rcu_tasks() tolerate first call with irqs disabled Currently, if the very first call to call_rcu_tasks() has irqs disabled, it will create the rcu_tasks_kthread with irqs disabled, which will result in a splat in the memory allocator, which kthread_run() invokes with the expectation that irqs are enabled. This commit fixes this problem by deferring kthread creation if called with irqs disabled. The first call to call_rcu_tasks() that has irqs enabled will create the kthread. This bug was detected by rcutorture changes that were motivated by Iftekhar Ahmed's mutation-testing efforts. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 1 + kernel/rcu/update.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a8af79738a0e..3bc5de08c0b7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -45,6 +45,7 @@ #include #include #include +#include #include diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e888cd5a594..f0d8322bc3ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); +static struct task_struct *rcu_tasks_kthread_ptr; /* * Post an RCU-tasks callback. First call must be from process context @@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; + bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) *rcu_tasks_cbs_tail = rhp; rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - if (needwake) { + /* We can't create the thread unless interrupts are enabled. */ + if ((needwake && havetask) || + (!havetask && !irqs_disabled_flags(flags))) { rcu_spawn_tasks_kthread(); wake_up(&rcu_tasks_cbs_wq); } @@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void rcu_spawn_tasks_kthread(void) { static DEFINE_MUTEX(rcu_tasks_kthread_mutex); - static struct task_struct *rcu_tasks_kthread_ptr; struct task_struct *t; if (READ_ONCE(rcu_tasks_kthread_ptr)) { -- cgit v1.3-8-gc7d7 From aab057382cb9b16249552684c1ebd270f070ec02 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2016 12:20:51 -0700 Subject: rcu: Fix a typo in a comment In the area in hot pursuit of a bug, so might as well clean it up. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..60bd533902f9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1476,7 +1476,7 @@ static int rcu_torture_barrier_cbs(void *arg) break; /* * The above smp_load_acquire() ensures barrier_phase load - * is ordered before the folloiwng ->call(). + * is ordered before the following ->call(). */ local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); -- cgit v1.3-8-gc7d7 From 088e9d253d3a4ab7e058dd84bb532c32dadf1882 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 2 Jun 2016 13:51:41 -0300 Subject: rcu: sysctl: Panic on RCU Stall It is not always easy to determine the cause of an RCU stall just by analysing the RCU stall messages, mainly when the problem is caused by the indirect starvation of rcu threads. For example, when preempt_rcu is not awakened due to the starvation of a timer softirq. We have been hard coding panic() in the RCU stall functions for some time while testing the kernel-rt. But this is not possible in some scenarios, like when supporting customers. This patch implements the sysctl kernel.panic_on_rcu_stall. If set to 1, the system will panic() when an RCU stall takes place, enabling the capture of a vmcore. The vmcore provides a way to analyze all kernel/tasks states, helping out to point to the culprit and the solution for the stall. The kernel.panic_on_rcu_stall sysctl is disabled by default. Changes from v1: - Fixed a typo in the git log - The if(sysctl_panic_on_rcu_stall) panic() is in a static function - Fixed the CONFIG_TINY_RCU compilation issue - The var sysctl_panic_on_rcu_stall is now __read_mostly Cc: Jonathan Corbet Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Acked-by: Christian Borntraeger Reviewed-by: Josh Triplett Reviewed-by: Arnaldo Carvalho de Melo Tested-by: "Luis Claudio R. Goncalves" Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Paul E. McKenney --- Documentation/sysctl/kernel.txt | 12 ++++++++++++ include/linux/kernel.h | 1 + kernel/rcu/tree.c | 12 ++++++++++++ kernel/sysctl.c | 11 +++++++++++ 4 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index a3683ce2a2f3..33204604de6c 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -58,6 +58,7 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- panic_on_rcu_stall - perf_cpu_time_max_percent - perf_event_paranoid - perf_event_max_stack @@ -618,6 +619,17 @@ a kernel rebuild when attempting to kdump at the location of a WARN(). ============================================================== +panic_on_rcu_stall: + +When set to 1, calls panic() after RCU stall detection messages. This +is useful to define the root cause of RCU stalls using a vmcore. + +0: do not panic() when RCU stall takes place, default behavior. + +1: panic() after printing RCU stall messages. + +============================================================== + perf_cpu_time_max_percent: Hints to the kernel how much CPU time it should be allowed to diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94aa10ffe156..c42082112ec8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -451,6 +451,7 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c844b6142a86..e5ca15a461b9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -125,6 +125,8 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable transitions from zero to one just @@ -1312,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } +static inline void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1391,6 +1399,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); + panic_on_rcu_stall(); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1431,6 +1441,8 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + panic_on_rcu_stall(); + /* * Attempt to revive the RCU machinery by forcing a context switch. * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc38398b..35f0dcb1cb4f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1204,6 +1204,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &one, }, +#endif +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.3-8-gc7d7 From bc75e99983df1efd977a5cd468893d55d52b8d70 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 3 Jun 2016 15:20:04 +0100 Subject: rcu: Correctly handle sparse possible cpus In many cases in the RCU tree code, we iterate over the set of cpus for a leaf node described by rcu_node::grplo and rcu_node::grphi, checking per-cpu data for each cpu in this range. However, if the set of possible cpus is sparse, some cpus described in this range are not possible, and thus no per-cpu region will have been allocated (or initialised) for them by the generic percpu code. Erroneous accesses to a per-cpu area for these !possible cpus may fault or may hit other data depending on the addressed generated when the erroneous per cpu offset is applied. In practice, both cases have been observed on arm64 hardware (the former being silent, but detectable with additional patches). To avoid issues resulting from this, we must iterate over the set of *possible* cpus for a given leaf node. This patch add a new helper, for_each_leaf_node_possible_cpu, to enable this. As iteration is often intertwined with rcu_node local bitmask manipulation, a new leaf_node_cpu_bit helper is added to make this simpler and more consistent. The RCU tree code is made to use both of these where appropriate. Without this patch, running reboot at a shell can result in an oops like: [ 3369.075979] Unable to handle kernel paging request at virtual address ffffff8008b21b4c [ 3369.083881] pgd = ffffffc3ecdda000 [ 3369.087270] [ffffff8008b21b4c] *pgd=00000083eca48003, *pud=00000083eca48003, *pmd=0000000000000000 [ 3369.096222] Internal error: Oops: 96000007 [#1] PREEMPT SMP [ 3369.101781] Modules linked in: [ 3369.104825] CPU: 2 PID: 1817 Comm: NetworkManager Tainted: G W 4.6.0+ #3 [ 3369.121239] task: ffffffc0fa13e000 ti: ffffffc3eb940000 task.ti: ffffffc3eb940000 [ 3369.128708] PC is at sync_rcu_exp_select_cpus+0x188/0x510 [ 3369.134094] LR is at sync_rcu_exp_select_cpus+0x104/0x510 [ 3369.139479] pc : [] lr : [] pstate: 200001c5 [ 3369.146860] sp : ffffffc3eb9435a0 [ 3369.150162] x29: ffffffc3eb9435a0 x28: ffffff8008be4f88 [ 3369.155465] x27: ffffff8008b66c80 x26: ffffffc3eceb2600 [ 3369.160767] x25: 0000000000000001 x24: ffffff8008be4f88 [ 3369.166070] x23: ffffff8008b51c3c x22: ffffff8008b66c80 [ 3369.171371] x21: 0000000000000001 x20: ffffff8008b21b40 [ 3369.176673] x19: ffffff8008b66c80 x18: 0000000000000000 [ 3369.181975] x17: 0000007fa951a010 x16: ffffff80086a30f0 [ 3369.187278] x15: 0000007fa9505590 x14: 0000000000000000 [ 3369.192580] x13: ffffff8008b51000 x12: ffffffc3eb940000 [ 3369.197882] x11: 0000000000000006 x10: ffffff8008b51b78 [ 3369.203184] x9 : 0000000000000001 x8 : ffffff8008be4000 [ 3369.208486] x7 : ffffff8008b21b40 x6 : 0000000000001003 [ 3369.213788] x5 : 0000000000000000 x4 : ffffff8008b27280 [ 3369.219090] x3 : ffffff8008b21b4c x2 : 0000000000000001 [ 3369.224406] x1 : 0000000000000001 x0 : 0000000000000140 ... [ 3369.972257] [] sync_rcu_exp_select_cpus+0x188/0x510 [ 3369.978685] [] synchronize_rcu_expedited+0x64/0xa8 [ 3369.985026] [] synchronize_net+0x24/0x30 [ 3369.990499] [] dev_deactivate_many+0x28c/0x298 [ 3369.996493] [] __dev_close_many+0x60/0xd0 [ 3370.002052] [] __dev_close+0x28/0x40 [ 3370.007178] [] __dev_change_flags+0x8c/0x158 [ 3370.012999] [] dev_change_flags+0x20/0x60 [ 3370.018558] [] do_setlink+0x288/0x918 [ 3370.023771] [] rtnl_newlink+0x398/0x6a8 [ 3370.029158] [] rtnetlink_rcv_msg+0xe4/0x220 [ 3370.034891] [] netlink_rcv_skb+0xc4/0xf8 [ 3370.040364] [] rtnetlink_rcv+0x2c/0x40 [ 3370.045663] [] netlink_unicast+0x160/0x238 [ 3370.051309] [] netlink_sendmsg+0x2f0/0x358 [ 3370.056956] [] sock_sendmsg+0x18/0x30 [ 3370.062168] [] ___sys_sendmsg+0x26c/0x280 [ 3370.067728] [] __sys_sendmsg+0x44/0x88 [ 3370.073027] [] SyS_sendmsg+0x10/0x20 [ 3370.078153] [] el0_svc_naked+0x24/0x28 Signed-off-by: Mark Rutland Reported-by: Dennis Chen Cc: Catalin Marinas Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: Steve Capper Cc: Steven Rostedt Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 +++++++++------------ kernel/rcu/tree.h | 15 +++++++++++++++ kernel/rcu/tree_exp.h | 16 +++++++--------- kernel/rcu/tree_plugin.h | 5 +++-- 4 files changed, 34 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e5ca15a461b9..f433959e9322 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1287,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) - dump_cpu_task(rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1360,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, - rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(rsp, cpu); ndetected++; } } @@ -2884,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj) { - unsigned long bit; int cpu; unsigned long flags; unsigned long mask; @@ -2919,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, continue; } } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; @@ -3750,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e3959f5e6ddf..f714f873bf9d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -253,6 +253,13 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; +/* + * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and + * are indexed relative to this interval rather than the global CPU ID space. + * This generates the bit for a CPU in node-local masks. + */ +#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. @@ -280,6 +287,14 @@ struct rcu_node { for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +/* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + /* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 00a02a231ada..d400434af6b2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -344,7 +344,6 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, { int cpu; unsigned long flags; - unsigned long mask; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; @@ -356,7 +355,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -376,8 +375,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: @@ -440,10 +439,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp; + mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) continue; ndetected++; @@ -453,7 +452,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } - mask <<= 1; } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, @@ -473,8 +471,8 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont("\n"); } rcu_for_each_leaf_node(rsp, rnp) { - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) continue; dump_cpu_task(cpu); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 695071dd1e9c..534c590e8852 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1166,8 +1166,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); if (cpumask_weight(cm) == 0) cpumask_setall(cm); -- cgit v1.3-8-gc7d7 From 61d1b6a42fec61c5065f54cc62cef02b483c69fb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:12 +0200 Subject: bpf, maps: add release callback Add a release callback for maps that is invoked when the last reference to its struct file is gone and the struct file about to be released by vfs. The handler will be used by fd array maps. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 ++- kernel/bpf/syscall.c | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1bcae82c6cb1..29b5a1ae22cb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -19,7 +19,8 @@ struct bpf_map; struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ struct bpf_map *(*map_alloc)(union bpf_attr *attr); - void (*map_free)(struct bpf_map *); + void (*map_release)(struct bpf_map *map, struct file *map_file); + void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); /* funcs callable from userspace and from eBPF programs */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 46ecce4b79ed..fc3adcd064b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map) static int bpf_map_release(struct inode *inode, struct file *filp) { - bpf_map_put_with_uref(filp->private_data); + struct bpf_map *map = filp->private_data; + + if (map->ops->map_release) + map->ops->map_release(map, filp); + + bpf_map_put_with_uref(map); return 0; } -- cgit v1.3-8-gc7d7 From d056a788765e67773124f520159185bc89f5d1ad Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:13 +0200 Subject: bpf, maps: extend map_fd_get_ptr arguments This patch extends map_fd_get_ptr() callback that is used by fd array maps, so that struct file pointer from the related map can be passed in. It's safe to remove map_update_elem() callback for the two maps since this is only allowed from syscall side, but not from eBPF programs for these two map types. Like in per-cpu map case, bpf_fd_array_map_update_elem() needs to be called directly here due to the extra argument. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 12 +++++++++--- kernel/bpf/arraymap.c | 16 +++++++++------- kernel/bpf/syscall.c | 6 ++++++ 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 29b5a1ae22cb..d7b43e73fe87 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -29,8 +29,9 @@ struct bpf_map_ops { int (*map_delete_elem)(struct bpf_map *map, void *key); /* funcs called by prog_array and perf_event_array map */ - void *(*map_fd_get_ptr) (struct bpf_map *map, int fd); - void (*map_fd_put_ptr) (void *ptr); + void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, + int fd); + void (*map_fd_put_ptr)(void *ptr); }; struct bpf_map { @@ -169,7 +170,7 @@ struct bpf_array { u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -void bpf_fd_array_map_clear(struct bpf_map *map); + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); @@ -207,8 +208,13 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); + int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); +int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags); +void bpf_fd_array_map_clear(struct bpf_map *map); + /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only. No barriers here, since it _will_ race with concurrent diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 76d5a794e426..bfedcbdb4d84 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) } /* only called from syscall */ -static int fd_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; @@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - new_ptr = map->ops->map_fd_get_ptr(map, ufd); + new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); @@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) } } -static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *prog_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) return prog; @@ -382,6 +384,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) bpf_prog_put(prog); return ERR_PTR(-EINVAL); } + return prog; } @@ -407,7 +410,6 @@ static const struct bpf_map_ops prog_array_ops = { .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, @@ -431,7 +433,8 @@ static void perf_event_array_map_free(struct bpf_map *map) fd_array_map_free(map); } -static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) { struct perf_event *event; const struct perf_event_attr *attr; @@ -474,7 +477,6 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_free = perf_event_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fc3adcd064b1..c23a4e9311b3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -392,6 +392,12 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || + map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + rcu_read_lock(); + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); -- cgit v1.3-8-gc7d7 From 3b1efb196eee45b2f0c4994e0c43edb5e367f620 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:14 +0200 Subject: bpf, maps: flush own entries on perf map release The behavior of perf event arrays are quite different from all others as they are tightly coupled to perf event fds, f.e. shown recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array to use struct file") to make refcounting on perf event more robust. A remaining issue that the current code still has is that since additions to the perf event array take a reference on the struct file via perf_event_get() and are only released via fput() (that cleans up the perf event eventually via perf_event_release_kernel()) when the element is either manually removed from the map from user space or automatically when the last reference on the perf event map is dropped. However, this leads us to dangling struct file's when the map gets pinned after the application owning the perf event descriptor exits, and since the struct file reference will in such case only be manually dropped or via pinned file removal, it leads to the perf event living longer than necessary, consuming needlessly resources for that time. Relations between perf event fds and bpf perf event map fds can be rather complex. F.e. maps can act as demuxers among different perf event fds that can possibly be owned by different threads and based on the index selection from the program, events get dispatched to one of the per-cpu fd endpoints. One perf event fd (or, rather a per-cpu set of them) can also live in multiple perf event maps at the same time, listening for events. Also, another requirement is that perf event fds can get closed from application side after they have been attached to the perf event map, so that on exit perf event map will take care of dropping their references eventually. Likewise, when such maps are pinned, the intended behavior is that a user application does bpf_obj_get(), puts its fds in there and on exit when fd is released, they are dropped from the map again, so the map acts rather as connector endpoint. This also makes perf event maps inherently different from program arrays as described in more detail in commit c9da161c6517 ("bpf: fix clearing on persistent program array maps"). To tackle this, map entries are marked by the map struct file that added the element to the map. And when the last reference to that map struct file is released from user space, then the tracked entries are purged from the map. This is okay, because new map struct files instances resp. frontends to the anon inode are provided via bpf_map_new_fd() that is called when we invoke bpf_obj_get_user() for retrieving a pinned map, but also when an initial instance is created via map_create(). The rest is resolved by the vfs layer automatically for us by keeping reference count on the map's struct file. Any concurrent updates on the map slot are fine as well, it just means that perf_event_fd_array_release() needs to delete less of its own entires. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 9 +++++ kernel/bpf/arraymap.c | 102 ++++++++++++++++++++++++++++++++++------------- kernel/trace/bpf_trace.c | 18 ++++----- 3 files changed, 91 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d7b43e73fe87..9adfef694a25 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -13,6 +13,7 @@ #include #include +struct perf_event; struct bpf_map; /* map is generic key/value storage optionally accesible by eBPF programs */ @@ -166,8 +167,16 @@ struct bpf_array { void __percpu *pptrs[0] __aligned(8); }; }; + #define MAX_TAIL_CALL_CNT 32 +struct bpf_event_entry { + struct perf_event *event; + struct file *perf_file; + struct file *map_file; + struct rcu_head rcu; +}; + u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bfedcbdb4d84..5af30732697b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -427,59 +427,105 @@ static int __init register_prog_array_map(void) } late_initcall(register_prog_array_map); -static void perf_event_array_map_free(struct bpf_map *map) +static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, + struct file *map_file) { - bpf_fd_array_map_clear(map); - fd_array_map_free(map); + struct bpf_event_entry *ee; + + ee = kzalloc(sizeof(*ee), GFP_KERNEL); + if (ee) { + ee->event = perf_file->private_data; + ee->perf_file = perf_file; + ee->map_file = map_file; + } + + return ee; +} + +static void __bpf_event_entry_free(struct rcu_head *rcu) +{ + struct bpf_event_entry *ee; + + ee = container_of(rcu, struct bpf_event_entry, rcu); + fput(ee->perf_file); + kfree(ee); +} + +static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) +{ + call_rcu(&ee->rcu, __bpf_event_entry_free); } static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - struct perf_event *event; const struct perf_event_attr *attr; - struct file *file; + struct bpf_event_entry *ee; + struct perf_event *event; + struct file *perf_file; - file = perf_event_get(fd); - if (IS_ERR(file)) - return file; + perf_file = perf_event_get(fd); + if (IS_ERR(perf_file)) + return perf_file; - event = file->private_data; + event = perf_file->private_data; + ee = ERR_PTR(-EINVAL); attr = perf_event_attrs(event); - if (IS_ERR(attr)) - goto err; - - if (attr->inherit) - goto err; - - if (attr->type == PERF_TYPE_RAW) - return file; - - if (attr->type == PERF_TYPE_HARDWARE) - return file; + if (IS_ERR(attr) || attr->inherit) + goto err_out; + + switch (attr->type) { + case PERF_TYPE_SOFTWARE: + if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) + goto err_out; + /* fall-through */ + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); + /* fall-through */ + default: + break; + } - if (attr->type == PERF_TYPE_SOFTWARE && - attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return file; -err: - fput(file); - return ERR_PTR(-EINVAL); +err_out: + fput(perf_file); + return ee; } static void perf_event_fd_array_put_ptr(void *ptr) { - fput((struct file *)ptr); + bpf_event_entry_free_rcu(ptr); +} + +static void perf_event_fd_array_release(struct bpf_map *map, + struct file *map_file) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; + int i; + + rcu_read_lock(); + for (i = 0; i < array->map.max_entries; i++) { + ee = READ_ONCE(array->ptrs[i]); + if (ee && ee->map_file == map_file) + fd_array_map_delete_elem(map, &i); + } + rcu_read_unlock(); } static const struct bpf_map_ops perf_event_array_ops = { .map_alloc = fd_array_map_alloc, - .map_free = perf_event_array_map_free, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, + .map_release = perf_event_fd_array_release, }; static struct bpf_map_type_list perf_event_array_type __read_mostly = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 720b7bb01d43..037ea6ea3cb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -192,18 +192,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (unlikely(!ee)) return -ENOENT; - event = file->private_data; - + event = ee->event; /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -233,8 +232,8 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -247,12 +246,11 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (unlikely(!ee)) return -ENOENT; - event = file->private_data; - + event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; -- cgit v1.3-8-gc7d7 From e37837fb62f95a81bdcefa86ceea043df84937d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Apr 2016 01:01:27 +0200 Subject: locking/atomic: Remove the deprecated atomic_{set,clear}_mask() functions These functions have been deprecated for a while and there is only the one user left, convert and kill. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/atomic.h | 10 ---------- kernel/locking/qspinlock_paravirt.h | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 0b3802d33125..12d910d61b83 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -480,16 +480,6 @@ static inline int atomic_fetch_andnot_release(int i, atomic_t *v) } #endif -static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v) -{ - atomic_andnot(mask, v); -} - -static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v) -{ - atomic_or(mask, v); -} - /** * atomic_inc_not_zero_hint - increment if not null * @v: pointer of type atomic_t diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..37649e69056c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) { - atomic_set_mask(_Q_PENDING_VAL, &lock->val); + atomic_or(_Q_PENDING_VAL, &lock->val); } static __always_inline void clear_pending(struct qspinlock *lock) { - atomic_clear_mask(_Q_PENDING_VAL, &lock->val); + atomic_andnot(_Q_PENDING_VAL, &lock->val); } static __always_inline int trylock_clear_pending(struct qspinlock *lock) -- cgit v1.3-8-gc7d7 From f9852b74bec0117b888da39d070c323ea1cb7f4c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Apr 2016 01:27:03 +0200 Subject: locking/atomic, arch/qrwlock: Employ atomic_fetch_add_acquire() The only reason for the current code is to make GCC emit only the "LOCK XADD" instruction on x86 (and not do a pointless extra ADD on the result), do so nicer. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); rspin_until_writer_unlock(lock, cnts); /* -- cgit v1.3-8-gc7d7 From 86a3b5f34fc1fb307abef4fde76bebd3edce0324 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 May 2016 12:42:21 +0200 Subject: locking/atomic, arch/rwsem: Employ atomic_long_fetch_add() Now that we have fetch_add() we can stop using add_return() - val. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2031281bb940..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -153,7 +153,7 @@ __rwsem_mark_wake(struct rw_semaphore *sem, if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* -- cgit v1.3-8-gc7d7 From 66b12abc846d31e75fa5f2f31db1396ddfa8ee4a Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 16 Jun 2016 17:08:19 -0400 Subject: audit: fix some horrible switch statement style crimes Signed-off-by: Paul Moore --- kernel/auditfilter.c | 8 ++++++-- kernel/auditsc.c | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8a8aa3fbc8d8..ff59a5eed691 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1343,8 +1343,12 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type, return result; } switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + case AUDIT_NEVER: + *state = AUDIT_DISABLED; + break; + case AUDIT_ALWAYS: + *state = AUDIT_RECORD_CONTEXT; + break; } return 1; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7d0e3cf8abe1..ec4c552876a7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -695,8 +695,12 @@ static int audit_filter_rules(struct task_struct *tsk, ctx->prio = rule->prio; } switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + case AUDIT_NEVER: + *state = AUDIT_DISABLED; + break; + case AUDIT_ALWAYS: + *state = AUDIT_RECORD_CONTEXT; + break; } return 1; } -- cgit v1.3-8-gc7d7 From 8c8a5502183c724854afd2f143a72f7eb71b6fea Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 17 Jun 2016 12:23:59 -0400 Subject: cgroup: fix idr leak for the first cgroup root The valid cgroup hierarchy ID range includes 0, so we can't filter for positive numbers when freeing it, or it'll leak the first ID. No big deal, just disruptive when reading the code. The ID is freed during error handling and when the reference count hits zero, so the double-free test is not necessary; remove it. Signed-off-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86cb5c6e8932..36fc0ff506c3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1158,18 +1158,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } static void cgroup_free_root(struct cgroup_root *root) { if (root) { - /* hierarchy ID should already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - idr_destroy(&root->cgroup_idr); kfree(root); } -- cgit v1.3-8-gc7d7 From d6ccc55e66ccdbc8ad0eeda14419f8eaccbc246b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 17 Jun 2016 12:24:27 -0400 Subject: cgroup: remove unnecessary 0 check from css_from_id() css_idr allocation starts at 1, so index 0 will never point to an item. css_from_id() currently filters that before asking idr_find(), but idr_find() would also just return NULL, so this is not needed. Signed-off-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 36fc0ff506c3..78f6d18ff0af 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6162,7 +6162,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return id > 0 ? idr_find(&ss->css_idr, id) : NULL; + return idr_find(&ss->css_idr, id); } /** -- cgit v1.3-8-gc7d7 From 59a37f8baeb2c9d97f316584c90892d18bf846d4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 16:58:26 +0200 Subject: blktrace: avoid using timespec The blktrace code stores the current time in a 32-bit word in its user interface. This is a bad idea because 32-bit seconds overflow at some point. We probably have until 2106 before this one overflows, as it seems to use an 'unsigned' variable, but we should confirm that user space treats it the same way. Aside from this, we want to stop using 'struct timespec' here, so I'm adding a comment about the overflow and change the code to use timespec64 instead to make the loss of range more obvious. Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 03b0dd98ff0e..bedb84d168d1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk) static void trace_note_time(struct blk_trace *bt) { - struct timespec now; + struct timespec64 now; unsigned long flags; u32 words[2]; - getnstimeofday(&now); - words[0] = now.tv_sec; + /* need to check user space to see if this breaks in y2038 or y2106 */ + ktime_get_real_ts64(&now); + words[0] = (u32)now.tv_sec; words[1] = now.tv_nsec; local_irq_save(flags); -- cgit v1.3-8-gc7d7 From edd14cfebc4404698544d407ecf8eda6e19aa19e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 17 Jun 2016 16:00:20 -0600 Subject: genirq: Add untracked irq handler This adds a software irq handler for controllers that multiplex interrupts from multiple devices, but don't know which device generated the interrupt. For these devices, the irq handler that demuxes must check every action for every software irq using the same h/w irq in order to find out which device generated the interrupt. This will inevitably trigger spurious interrupt detection if we are noting the irq. The new irq handler does not track the handling for spurious interrupt detection. An irq that uses this also won't get stats tracked since it didn't generate the interrupt, nor added to randomness since they are not random. Signed-off-by: Keith Busch Cc: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Cc: Jon Derrick Link: http://lkml.kernel.org/r/1466200821-29159-1-git-send-email-keith.busch@intel.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + kernel/irq/chip.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/irq/handle.c | 18 ++++++++++++++---- kernel/irq/internals.h | 1 + 4 files changed, 59 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 6c92a847394d..562cef010aa8 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -484,6 +484,7 @@ extern void handle_fasteoi_irq(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); +extern void handle_untracked_irq(struct irq_desc *desc); extern void handle_percpu_irq(struct irq_desc *desc); extern void handle_percpu_devid_irq(struct irq_desc *desc); extern void handle_bad_irq(struct irq_desc *desc); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ad8131473774..b4c1bc7c9ca2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -426,6 +426,49 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_simple_irq); +/** + * handle_untracked_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq + * + * Untracked interrupts are sent from a demultiplexing interrupt + * handler when the demultiplexer does not know which device it its + * multiplexed irq domain generated the interrupt. IRQ's handled + * through here are not subjected to stats tracking, randomness, or + * spurious interrupt detection. + * + * Note: Like handle_simple_irq, the caller is expected to handle + * the ack, clear, mask and unmask issues if necessary. + */ +void handle_untracked_irq(struct irq_desc *desc) +{ + unsigned int flags = 0; + + raw_spin_lock(&desc->lock); + + if (!irq_may_run(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + goto out_unlock; + } + + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + raw_spin_unlock(&desc->lock); + + __handle_irq_event_percpu(desc, &flags); + + raw_spin_lock(&desc->lock); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + +out_unlock: + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_untracked_irq); + /* * Called unconditionally from handle_level_irq() and only for oneshot * interrupts from handle_fasteoi_irq() diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a15b5485b446..d3f24905852c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) { irqreturn_t retval = IRQ_NONE; - unsigned int flags = 0, irq = desc->irq_data.irq; + unsigned int irq = desc->irq_data.irq; struct irqaction *action; for_each_action_of_desc(desc, action) { @@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) /* Fall through to add to randomness */ case IRQ_HANDLED: - flags |= action->flags; + *flags |= action->flags; break; default: @@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval |= res; } - add_interrupt_randomness(irq, flags); + return retval; +} + +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +{ + irqreturn_t retval; + unsigned int flags = 0; + + retval = __handle_irq_event_percpu(desc, &flags); + + add_interrupt_randomness(desc->irq_data.irq, flags); if (!noirqdebug) note_interrupt(desc, retval); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index d5edcdc9382a..0c6f35ba9cc0 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -84,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); -- cgit v1.3-8-gc7d7 From 4e267db135c44d0b18e553899fe7df32b89211a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Apr 2016 07:38:13 -0400 Subject: tracing: Make the pid filtering helper functions global Make the functions used for pid filtering global for tracing, such that the function tracer can use the pid code as well. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +++++++++ kernel/trace/trace_events.c | 34 +++++++++++++++++----------------- 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5167c366d6b7..172330891c6d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -628,6 +628,15 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; +/* PID filtering */ +bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, + pid_t search_pid); +bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct task_struct *task); +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task); + #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3d4155892a1e..b5e514c4dada 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -503,8 +503,8 @@ static void ftrace_clear_events(struct trace_array *tr) extern int pid_max; /* Returns true if found in filter */ -static bool -find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { /* * If pid_max changed after filtered_pids was created, we @@ -516,8 +516,8 @@ find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) return test_bit(search_pid, filtered_pids->pids); } -static bool -ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { /* * Return false, because if filtered_pids does not exist, @@ -526,19 +526,19 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - return !find_filtered_pid(filtered_pids, task->pid); + return !trace_find_filtered_pid(filtered_pids, task->pid); } -static void filter_add_remove_task(struct trace_pid_list *pid_list, - struct task_struct *self, - struct task_struct *task) +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) { if (!pid_list) return; /* For forks, we only add if the forking task is listed */ if (self) { - if (!find_filtered_pid(pid_list, self->pid)) + if (!trace_find_filtered_pid(pid_list, self->pid)) return; } @@ -560,7 +560,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); - filter_add_remove_task(pid_list, NULL, task); + trace_filter_add_remove_task(pid_list, NULL, task); } static void @@ -572,7 +572,7 @@ event_filter_pid_sched_process_fork(void *data, struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); - filter_add_remove_task(pid_list, self, task); + trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) @@ -600,8 +600,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, prev) && - ignore_this_task(pid_list, next)); + trace_ignore_this_task(pid_list, prev) && + trace_ignore_this_task(pid_list, next)); } static void @@ -614,7 +614,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, next)); + trace_ignore_this_task(pid_list, next)); } static void @@ -630,7 +630,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, task)); + trace_ignore_this_task(pid_list, task)); } static void @@ -647,7 +647,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) /* Set tracing if current is enabled */ this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, current)); } static void __ftrace_clear_event_pids(struct trace_array *tr) @@ -1654,7 +1654,7 @@ static void ignore_task_cpu(void *data) mutex_is_locked(&event_mutex)); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, current)); } static ssize_t -- cgit v1.3-8-gc7d7 From d8275c454dcdba296675221b4c12f19d1b6e0ee8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Apr 2016 12:15:22 -0400 Subject: tracing: Move filtered_pid helper functions into trace.c As the filtered_pid functions are going to be used by function tracer as well as trace_events, move the code into the generic trace.c file. The functions moved are: trace_find_filtered_pid() trace_ignore_this_task() trace_filter_add_remove_task() Kernel Doc text was also added. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 78 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 51 ----------------------------- 2 files changed, 78 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a4bd6b68a0b..0b87fe8e6d0b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -319,6 +319,84 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (search_pid >= filtered_pids->pid_max) + return false; + + return test_bit(search_pid, filtered_pids->pids); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ + /* + * Return false, because if filtered_pids does not exist, + * all pids are good to trace. + */ + if (!filtered_pids) + return false; + + return !trace_find_filtered_pid(filtered_pids, task->pid); +} + +/** + * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; + } + + /* Sorry, but we don't support pid_max changing after setting */ + if (task->pid >= pid_list->pid_max) + return; + + /* "self" is set for forks, and NULL for exits */ + if (self) + set_bit(task->pid, pid_list->pids); + else + clear_bit(task->pid, pid_list->pids); +} + static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b5e514c4dada..a11e6d9a3841 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -502,57 +502,6 @@ static void ftrace_clear_events(struct trace_array *tr) /* Shouldn't this be in a header? */ extern int pid_max; -/* Returns true if found in filter */ -bool -trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) -{ - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (search_pid >= filtered_pids->pid_max) - return false; - - return test_bit(search_pid, filtered_pids->pids); -} - -bool -trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) -{ - /* - * Return false, because if filtered_pids does not exist, - * all pids are good to trace. - */ - if (!filtered_pids) - return false; - - return !trace_find_filtered_pid(filtered_pids, task->pid); -} - -void trace_filter_add_remove_task(struct trace_pid_list *pid_list, - struct task_struct *self, - struct task_struct *task) -{ - if (!pid_list) - return; - - /* For forks, we only add if the forking task is listed */ - if (self) { - if (!trace_find_filtered_pid(pid_list, self->pid)) - return; - } - - /* Sorry, but we don't support pid_max changing after setting */ - if (task->pid >= pid_list->pid_max) - return; - - /* "self" is set for forks, and NULL for exits */ - if (self) - set_bit(task->pid, pid_list->pids); - else - clear_bit(task->pid, pid_list->pids); -} - static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { -- cgit v1.3-8-gc7d7 From 5cc8976bd52153678ca37cc1e3000833b20276f3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 20 Apr 2016 15:19:54 -0400 Subject: tracing: Move the pid_list seq_file functions to be global To allow other aspects of ftrace to use the pid_list logic, we need to reuse the seq_file functions. Making the generic part into functions that can be called by other files will help in this regard. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 71 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 3 ++ kernel/trace/trace_events.c | 34 ++-------------------- 3 files changed, 77 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0b87fe8e6d0b..7943e306cc7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -397,6 +397,77 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, clear_bit(task->pid, pid_list->pids); } +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual prevous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ + unsigned long pid; + loff_t l = 0; + + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) + return NULL; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) + ; + return (void *)pid; +} + +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. + */ +int trace_pid_show(struct seq_file *m, void *v) +{ + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); + return 0; +} + static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 172330891c6d..45442d5842f2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -636,6 +636,9 @@ bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, struct task_struct *task); +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); +int trace_pid_show(struct seq_file *m, void *v); #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a11e6d9a3841..fd831a972bae 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -983,18 +983,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); - unsigned long pid = (unsigned long)v; - (*pos)++; - - /* pid already is +1 of the actual prevous bit */ - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); - - /* Return pid + 1 to allow zero to be represented */ - if (pid < pid_list->pid_max) - return (void *)(pid + 1); - - return NULL; + return trace_pid_next(pid_list, v, pos); } static void *p_start(struct seq_file *m, loff_t *pos) @@ -1002,8 +992,6 @@ static void *p_start(struct seq_file *m, loff_t *pos) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; - unsigned long pid; - loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -1019,15 +1007,7 @@ static void *p_start(struct seq_file *m, loff_t *pos) if (!pid_list) return NULL; - pid = find_first_bit(pid_list->pids, pid_list->pid_max); - if (pid >= pid_list->pid_max) - return NULL; - - /* Return pid + 1 so that zero can be the exit value */ - for (pid++; pid && l < *pos; - pid = (unsigned long)p_next(m, (void *)pid, &l)) - ; - return (void *)pid; + return trace_pid_start(pid_list, pos); } static void p_stop(struct seq_file *m, void *p) @@ -1037,14 +1017,6 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static int p_show(struct seq_file *m, void *v) -{ - unsigned long pid = (unsigned long)v - 1; - - seq_printf(m, "%lu\n", pid); - return 0; -} - static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1795,7 +1767,7 @@ static const struct seq_operations show_set_event_seq_ops = { static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, - .show = p_show, + .show = trace_pid_show, .stop = p_stop, }; -- cgit v1.3-8-gc7d7 From 76c813e26606d35ea9d8d6f96e646b3944c730a9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 21 Apr 2016 11:35:30 -0400 Subject: tracing: Move pid_list write processing into its own function The addition of PIDs into a pid_list via the write operation of set_event_pid is a bit complex. The same operation will be needed for function tracing pids. Move the code into its own generic function in trace.c, so that we can avoid duplication of this code. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 109 ++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 7 +++ kernel/trace/trace_events.c | 110 ++++---------------------------------------- 3 files changed, 124 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7943e306cc7f..a8bb7485fd1d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -319,6 +319,12 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } +void trace_free_pid_list(struct trace_pid_list *pid_list) +{ + vfree(pid_list->pids); + kfree(pid_list); +} + /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -468,6 +474,107 @@ int trace_pid_show(struct seq_file *m, void *v) return 0; } +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret = 0; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. + */ + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return -ENOMEM; + + pid_list->pid_max = READ_ONCE(pid_max); + + /* Only truncating will shrink pid_max */ + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) + pid_list->pid_max = filtered_pids->pid_max; + + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + pid = find_first_bit(filtered_pids->pids, + filtered_pids->pid_max); + while (pid < filtered_pids->pid_max) { + set_bit(pid, pid_list->pids); + pid = find_next_bit(filtered_pids->pids, + filtered_pids->pid_max, + pid + 1); + nr_pids++; + } + } + + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0 || !trace_parser_loaded(&parser)) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + parser.buffer[parser.idx] = 0; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + if (val >= pid_list->pid_max) + break; + + pid = (pid_t)val; + + set_bit(pid, pid_list->pids); + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + trace_free_pid_list(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_free_pid_list(pid_list); + read = ret; + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; +} + static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 45442d5842f2..a4dce1ef9e03 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -629,6 +629,9 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; /* PID filtering */ + +extern int pid_max; + bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, @@ -639,6 +642,10 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); int trace_pid_show(struct seq_file *m, void *v); +void trace_free_pid_list(struct trace_pid_list *pid_list); +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt); #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fd831a972bae..fd449eb138cf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -499,9 +498,6 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -/* Shouldn't this be in a header? */ -extern int pid_max; - static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { @@ -634,8 +630,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - vfree(pid_list->pids); - kfree(pid_list); + trace_free_pid_list(pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr) @@ -1587,13 +1582,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; - struct trace_parser parser; - unsigned long val; - loff_t this_pos; - ssize_t read = 0; - ssize_t ret = 0; - pid_t pid; - int nr_pids = 0; + ssize_t ret; if (!cnt) return 0; @@ -1602,93 +1591,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) - return -ENOMEM; - mutex_lock(&event_mutex); + filtered_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); - /* - * Always recreate a new array. The write is an all or nothing - * operation. Always create a new array when adding new pids by - * the user. If the operation fails, then the current list is - * not modified. - */ - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) { - read = -ENOMEM; - goto out; - } - pid_list->pid_max = READ_ONCE(pid_max); - /* Only truncating will shrink pid_max */ - if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) - pid_list->pid_max = filtered_pids->pid_max; - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - kfree(pid_list); - read = -ENOMEM; - goto out; - } - if (filtered_pids) { - /* copy the current bits to the new max */ - pid = find_first_bit(filtered_pids->pids, - filtered_pids->pid_max); - while (pid < filtered_pids->pid_max) { - set_bit(pid, pid_list->pids); - pid = find_next_bit(filtered_pids->pids, - filtered_pids->pid_max, - pid + 1); - nr_pids++; - } - } - - while (cnt > 0) { - - this_pos = 0; - - ret = trace_get_user(&parser, ubuf, cnt, &this_pos); - if (ret < 0 || !trace_parser_loaded(&parser)) - break; - - read += ret; - ubuf += ret; - cnt -= ret; - - parser.buffer[parser.idx] = 0; - - ret = -EINVAL; - if (kstrtoul(parser.buffer, 0, &val)) - break; - if (val >= pid_list->pid_max) - break; - - pid = (pid_t)val; - - set_bit(pid, pid_list->pids); - nr_pids++; - - trace_parser_clear(&parser); - ret = 0; - } - trace_parser_put(&parser); - - if (ret < 0) { - vfree(pid_list->pids); - kfree(pid_list); - read = ret; + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) goto out; - } - if (!nr_pids) { - /* Cleared the list of pids */ - vfree(pid_list->pids); - kfree(pid_list); - read = ret; - if (!filtered_pids) - goto out; - pid_list = NULL; - } rcu_assign_pointer(tr->filtered_pids, pid_list); list_for_each_entry(file, &tr->events, list) { @@ -1697,10 +1608,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_sched(); - - vfree(filtered_pids->pids); - kfree(filtered_pids); - } else { + trace_free_pid_list(filtered_pids); + } else if (pid_list) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. @@ -1738,9 +1647,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&event_mutex); - ret = read; - if (read > 0) - *ppos += read; + if (ret > 0) + *ppos += ret; return ret; } -- cgit v1.3-8-gc7d7 From 345ddcc882d8896dcbdcb3e0ee4a415fc23ec8b0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 22 Apr 2016 18:11:33 -0400 Subject: ftrace: Have set_ftrace_pid use the bitmap like events do Convert set_ftrace_pid to use the bitmap like set_event_pid does. This allows for instances to use the pid filtering as well, and will allow for function-fork option to set if the children of a traced function should be traced or not. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 313 +++++++++++++++-------------------- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 15 +- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 5 files changed, 148 insertions(+), 185 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900dbb1efff2..8b488f4dd8e8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ static struct ftrace_ops *set_function_trace_op; -/* List for set_ftrace_pid's pids. */ -LIST_HEAD(ftrace_pids); -struct ftrace_pid { - struct list_head list; - struct pid *pid; -}; - -static bool ftrace_pids_enabled(void) +static bool ftrace_pids_enabled(struct ftrace_ops *ops) { - return !list_empty(&ftrace_pids); + struct trace_array *tr; + + if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) + return false; + + tr = ops->private; + + return tr->function_pids != NULL; } static void ftrace_update_trampoline(struct ftrace_ops *ops); @@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void) static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (!test_tsk_trace_trace(current)) + struct trace_array *tr = op->private; + + if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid)) return; op->saved_func(ip, parent_ip, op, regs); @@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; - if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) + if (ftrace_pids_enabled(ops)) ops->func = ftrace_pid_func; ftrace_update_trampoline(ops); @@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) static void ftrace_update_pid_func(void) { - bool enabled = ftrace_pids_enabled(); struct ftrace_ops *op; /* Only do something if we are tracing something */ @@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void) do_for_each_ftrace_op(op, ftrace_ops_list) { if (op->flags & FTRACE_OPS_FL_PID) { - op->func = enabled ? ftrace_pid_func : - op->saved_func; + op->func = ftrace_pids_enabled(op) ? + ftrace_pid_func : op->saved_func; ftrace_update_trampoline(op); } } while_for_each_ftrace_op(op); @@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) return ops->func; } -static void clear_ftrace_swapper(void) +static void +ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) { - struct task_struct *p; - int cpu; + struct trace_array *tr = data; + struct trace_pid_list *pid_list; - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - clear_tsk_trace_trace(p); - } - put_online_cpus(); -} - -static void set_ftrace_swapper(void) -{ - struct task_struct *p; - int cpu; + pid_list = rcu_dereference_sched(tr->function_pids); - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - set_tsk_trace_trace(p); - } - put_online_cpus(); + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, next)); } -static void clear_ftrace_pid(struct pid *pid) +static void clear_ftrace_pids(struct trace_array *tr) { - struct task_struct *p; + struct trace_pid_list *pid_list; + int cpu; - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - clear_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); + pid_list = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + if (!pid_list) + return; - put_pid(pid); -} + unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); -static void set_ftrace_pid(struct pid *pid) -{ - struct task_struct *p; + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - set_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); -} + rcu_assign_pointer(tr->function_pids, NULL); -static void clear_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - clear_ftrace_swapper(); - else - clear_ftrace_pid(pid); -} + /* Wait till all users are no longer using pid filtering */ + synchronize_sched(); -static void set_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - set_ftrace_swapper(); - else - set_ftrace_pid(pid); + trace_free_pid_list(pid_list); } -static int ftrace_pid_add(int p) +static void ftrace_pid_reset(struct trace_array *tr) { - struct pid *pid; - struct ftrace_pid *fpid; - int ret = -EINVAL; - mutex_lock(&ftrace_lock); - - if (!p) - pid = ftrace_swapper_pid; - else - pid = find_get_pid(p); - - if (!pid) - goto out; - - ret = 0; - - list_for_each_entry(fpid, &ftrace_pids, list) - if (fpid->pid == pid) - goto out_put; - - ret = -ENOMEM; - - fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); - if (!fpid) - goto out_put; - - list_add(&fpid->list, &ftrace_pids); - fpid->pid = pid; - - set_ftrace_pid_task(pid); + clear_ftrace_pids(tr); ftrace_update_pid_func(); - ftrace_startup_all(0); mutex_unlock(&ftrace_lock); - return 0; - -out_put: - if (pid != ftrace_swapper_pid) - put_pid(pid); - -out: - mutex_unlock(&ftrace_lock); - return ret; } -static void ftrace_pid_reset(void) -{ - struct ftrace_pid *fpid, *safe; - - mutex_lock(&ftrace_lock); - list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { - struct pid *pid = fpid->pid; - - clear_ftrace_pid_task(pid); - - list_del(&fpid->list); - kfree(fpid); - } - - ftrace_update_pid_func(); - ftrace_startup_all(0); - - mutex_unlock(&ftrace_lock); -} +/* Greater than any max PID */ +#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1) static void *fpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) { + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); - if (!ftrace_pids_enabled() && (!*pos)) - return (void *) 1; + pid_list = rcu_dereference_sched(tr->function_pids); - return seq_list_start(&ftrace_pids, *pos); + if (!pid_list) + return !(*pos) ? FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); } static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) { - if (v == (void *)1) + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); + + if (v == FTRACE_NO_PIDS) return NULL; - return seq_list_next(v, &ftrace_pids, pos); + return trace_pid_next(pid_list, v, pos); } static void fpid_stop(struct seq_file *m, void *p) + __releases(RCU) { + rcu_read_unlock_sched(); mutex_unlock(&ftrace_lock); } static int fpid_show(struct seq_file *m, void *v) { - const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); - - if (v == (void *)1) { + if (v == FTRACE_NO_PIDS) { seq_puts(m, "no pid\n"); return 0; } - if (fpid->pid == ftrace_swapper_pid) - seq_puts(m, "swapper tasks\n"); - else - seq_printf(m, "%u\n", pid_vnr(fpid->pid)); - - return 0; + return trace_pid_show(m, v); } static const struct seq_operations ftrace_pid_sops = { @@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = { static int ftrace_pid_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_pid_reset(); + ftrace_pid_reset(tr); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_pid_sops); + ret = seq_open(file, &ftrace_pid_sops); + if (ret < 0) { + trace_array_put(tr); + } else { + m = file->private_data; + /* copy tr over to seq ops */ + m->private = tr; + } return ret; } +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->function_pids, + mutex_is_locked(&ftrace_lock)); + + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, current)); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64], *tmp; - long val; - int ret; + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *pid_list; + ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; + if (!cnt) + return 0; + + mutex_lock(&ftrace_lock); + + filtered_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) + goto out; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + rcu_assign_pointer(tr->function_pids, pid_list); - buf[cnt] = 0; + if (filtered_pids) { + synchronize_sched(); + trace_free_pid_list(filtered_pids); + } else if (pid_list) { + /* Register a probe to set whether to ignore the tracing of a task */ + register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + } /* - * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" - * to clean the filter quietly. + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. */ - tmp = strstrip(buf); - if (strlen(tmp) == 0) - return 1; + on_each_cpu(ignore_task_cpu, tr, 1); - ret = kstrtol(tmp, 10, &val); - if (ret < 0) - return ret; + ftrace_update_pid_func(); + ftrace_startup_all(0); + out: + mutex_unlock(&ftrace_lock); - ret = ftrace_pid_add(val); + if (ret > 0) + *ppos += ret; - return ret ? ret : cnt; + return ret; } static int ftrace_pid_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) - seq_release(inode, file); + struct trace_array *tr = inode->i_private; - return 0; + trace_array_put(tr); + + return seq_release(inode, file); } static const struct file_operations ftrace_pid_fops = { @@ -5571,24 +5537,17 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_tracefs(void) +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) - return 0; - - ftrace_init_dyn_tracefs(d_tracer); + /* Only the top level directory has the dyn_tracefs and profile */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + ftrace_init_dyn_tracefs(d_tracer); + ftrace_profile_tracefs(d_tracer); + } trace_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - - ftrace_profile_tracefs(d_tracer); - - return 0; + tr, &ftrace_pid_fops); } -fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8bb7485fd1d..aa240551fc5d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7233,6 +7233,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); + ftrace_init_tracefs(tr, d_tracer); } static struct vfsmount *trace_automount(void *ingore) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a4dce1ef9e03..eaee458755a4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -156,6 +156,9 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; bool ignore_pid; +#ifdef CONFIG_FUNCTION_TRACER + bool ftrace_ignore_pid; +#endif }; struct tracer; @@ -247,6 +250,7 @@ struct trace_array { int ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; + struct trace_pid_list __rcu *function_pids; /* function tracing enabled */ int function_enabled; #endif @@ -840,12 +844,9 @@ extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { - if (list_empty(&ftrace_pids)) - return 1; - - return test_tsk_trace_trace(task); + return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid); } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, @@ -855,8 +856,9 @@ void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); int using_ftrace_ops_list_func(void); +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); #else -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { return 1; } @@ -871,6 +873,7 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5a095c2e4b69..0efa00d80623 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack verision is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3a0244ff7ea8..67cce7896aeb 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (!ftrace_trace_task(current)) + if (!ftrace_trace_task(tr)) return 0; /* trace it when it is-nested-in or is a function enabled. */ -- cgit v1.3-8-gc7d7 From 35abb67de744b5dbaec54381f2f9e0246089331d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 8 Jun 2016 18:38:02 -0700 Subject: tracing: expose current->comm to [ku]probe events ftrace is very quick to give up on saving the task command line (see `trace_save_cmdline()`). The workaround for events which really care about the command line is to explicitly assign it as part of the entry. However, this doesn't work for kprobe events, as there's no straightforward way to get access to current->comm. Add a kprobe/uprobe event variable $comm which provides exactly that. Link: http://lkml.kernel.org/r/f59b472033b943a370f5f48d0af37698f409108f.1465435894.git.osandov@fb.com Acked-by: Masami Hiramatsu Signed-off-by: Omar Sandoval Signed-off-by: Steven Rostedt --- Documentation/trace/kprobetrace.txt | 3 +++ Documentation/trace/uprobetracer.txt | 3 +++ kernel/trace/trace_kprobe.c | 1 + kernel/trace/trace_probe.c | 33 +++++++++++++++++++++++++++++++++ kernel/trace/trace_probe.h | 10 ++++++++++ 5 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index d68ea5fc812b..ea52ec1f8484 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -40,6 +40,7 @@ Synopsis of kprobe_events $stackN : Fetch Nth entry of stack (N >= 0) $stack : Fetch stack address. $retval : Fetch return value.(*) + $comm : Fetch current task comm. +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types @@ -62,6 +63,8 @@ offset, and container-size (usually 32). The syntax is; b@/ +For $comm, the default type is "string"; any other type is invalid. + Per-Probe Event Filtering ------------------------- diff --git a/Documentation/trace/uprobetracer.txt b/Documentation/trace/uprobetracer.txt index f1cf9a34ad9d..72d1cd4f7bf3 100644 --- a/Documentation/trace/uprobetracer.txt +++ b/Documentation/trace/uprobetracer.txt @@ -36,6 +36,7 @@ Synopsis of uprobe_tracer $stackN : Fetch Nth entry of stack (N >= 0) $stack : Fetch stack address. $retval : Fetch return value.(*) + $comm : Fetch current task comm. +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types @@ -57,6 +58,8 @@ offset, and container-size (usually 32). The syntax is; b@/ +For $comm, the default type is "string"; any other type is invalid. + Event Profiling --------------- diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5546eec0505f..9aedb0b06683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv) * $retval : fetch return value * $stack : fetch stack address * $stackN : fetch Nth of stack (N:0-) + * $comm : fetch current task comm * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1d372fa6fefb..74e80a582c28 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) kfree(data); } +void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, + void *data, void *dest) +{ + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + long ret; + + if (!maxlen) + return; + + ret = strlcpy(dst, current->comm, maxlen); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); + +void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + *(u32 *)dest = strlen(current->comm) + 1; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); + static const struct fetch_type *find_fetch_type(const char *type, const struct fetch_type *ftbl) { @@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else ret = -EINVAL; + } else if (strcmp(arg, "comm") == 0) { + if (strcmp(t->name, "string") != 0 && + strcmp(t->name, "string_size") != 0) + return -EINVAL; + f->fn = t->fetch[FETCH_MTD_comm]; } else ret = -EINVAL; @@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, arg[t - parg->comm] = '\0'; t++; } + /* + * The default type of $comm should be "string", and it can't be + * dereferenced. + */ + if (!t && strcmp(arg, "$comm") == 0) + t = "string"; parg->type = find_fetch_type(t, ftbl); if (!parg->type) { pr_info("Unsupported type: %s\n", t); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f6398db09114..45400ca5ded1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -102,6 +102,7 @@ enum { FETCH_MTD_reg = 0, FETCH_MTD_stack, FETCH_MTD_retval, + FETCH_MTD_comm, FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, @@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield); #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL +/* comm only makes sense as a string */ +#define fetch_comm_u8 NULL +#define fetch_comm_u16 NULL +#define fetch_comm_u32 NULL +#define fetch_comm_u64 NULL +DECLARE_FETCH_FUNC(comm, string); +DECLARE_FETCH_FUNC(comm, string_size); + /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. @@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64) ASSIGN_FETCH_FUNC(reg, ftype), \ ASSIGN_FETCH_FUNC(stack, ftype), \ ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(comm, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ -- cgit v1.3-8-gc7d7 From e2ace001176dc9745a472fe8bda1f0b28a4d7351 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 26 May 2016 12:00:33 -0700 Subject: tracing: Choose static tp_printk buffer by explicit nesting count Currently, the trace_printk code chooses which static buffer to use based on what type of atomic context (NMI, IRQ, etc) it's in. Simplify the code and make it more robust: simply count the nesting depth and choose a buffer based on the current nesting depth. The new code will only drop an event if we nest more than 4 deep, and the old code was guaranteed to malfunction if that happened. Link: http://lkml.kernel.org/r/07ab03aecfba25fcce8f9a211b14c9c5e2865c58.1464289095.git.luto@kernel.org Acked-by: Namhyung Kim Signed-off-by: Andy Lutomirski Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 83 +++++++++++++++------------------------------------- 1 file changed, 24 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aa240551fc5d..45e6747589c6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2339,83 +2339,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) /* created for use with alloc_percpu */ struct trace_buffer_struct { - char buffer[TRACE_BUF_SIZE]; + int nesting; + char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct *trace_percpu_buffer; -static struct trace_buffer_struct *trace_percpu_sirq_buffer; -static struct trace_buffer_struct *trace_percpu_irq_buffer; -static struct trace_buffer_struct *trace_percpu_nmi_buffer; /* - * The buffer used is dependent on the context. There is a per cpu - * buffer for normal context, softirq contex, hard irq context and - * for NMI context. Thise allows for lockless recording. - * - * Note, if the buffers failed to be allocated, then this returns NULL + * Thise allows for lockless recording. If we're nested too deeply, then + * this returns NULL. */ static char *get_trace_buf(void) { - struct trace_buffer_struct *percpu_buffer; - - /* - * If we have allocated per cpu buffers, then we do not - * need to do any locking. - */ - if (in_nmi()) - percpu_buffer = trace_percpu_nmi_buffer; - else if (in_irq()) - percpu_buffer = trace_percpu_irq_buffer; - else if (in_softirq()) - percpu_buffer = trace_percpu_sirq_buffer; - else - percpu_buffer = trace_percpu_buffer; + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!percpu_buffer) + if (!buffer || buffer->nesting >= 4) return NULL; - return this_cpu_ptr(&percpu_buffer->buffer[0]); + return &buffer->buffer[buffer->nesting++][0]; +} + +static void put_trace_buf(void) +{ + this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; - struct trace_buffer_struct *sirq_buffers; - struct trace_buffer_struct *irq_buffers; - struct trace_buffer_struct *nmi_buffers; buffers = alloc_percpu(struct trace_buffer_struct); - if (!buffers) - goto err_warn; - - sirq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!sirq_buffers) - goto err_sirq; - - irq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!irq_buffers) - goto err_irq; - - nmi_buffers = alloc_percpu(struct trace_buffer_struct); - if (!nmi_buffers) - goto err_nmi; + if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; trace_percpu_buffer = buffers; - trace_percpu_sirq_buffer = sirq_buffers; - trace_percpu_irq_buffer = irq_buffers; - trace_percpu_nmi_buffer = nmi_buffers; - return 0; - - err_nmi: - free_percpu(irq_buffers); - err_irq: - free_percpu(sirq_buffers); - err_sirq: - free_percpu(buffers); - err_warn: - WARN(1, "Could not allocate percpu trace_printk buffer"); - return -ENOMEM; } static int buffers_allocated; @@ -2506,7 +2464,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); @@ -2532,6 +2490,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) } out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -2563,7 +2524,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); @@ -2582,7 +2543,11 @@ __trace_array_vprintk(struct ring_buffer *buffer, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } - out: + +out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); -- cgit v1.3-8-gc7d7 From e947841c0dce9db675a957182214ef8091ac3d61 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 17 Jun 2016 17:40:58 -0400 Subject: tracing: Show the preempt count of when the event was called Because tracepoint callbacks are done with preemption enabled, the trace events are always called with preempt disable due to the rcu_read_lock_sched_notrace() in __DO_TRACE(). This causes the preempt count shown in the recorded trace event to be inaccurate. It is always one more that what the preempt_count was when the tracepoint was called. If CONFIG_PREEMPT is enabled, subtract 1 from the preempt_count before recording it in the trace buffer. Link: http://lkml.kernel.org/r/20160525132537.GA10808@linutronix.de Reported-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fd449eb138cf..03c0a48c3ac4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -261,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); + /* + * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we need to subtract one to offset the increment. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + fbuffer->pc--; fbuffer->trace_file = trace_file; fbuffer->event = -- cgit v1.3-8-gc7d7 From 9a51933e360897d9b3867c9b09dd5ccf7493e97e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 11 May 2016 14:06:57 -0500 Subject: tracing: Expose CPU physical addresses (resource values) for PCI devices Previously, mmio_print_pcidev() put "user" addresses in the trace buffer. On most architectures, these are the same as CPU physical addresses, but on microblaze, mips, powerpc, and sparc, they may be something else, typically a raw BAR value (a bus address as opposed to a CPU address). Always expose the CPU physical address to avoid this arch-dependent behavior. This change should have no user-visible effect because this file currently depends on CONFIG_HAVE_MMIOTRACE_SUPPORT, which is only defined for x86, and pci_resource_to_user() is a no-op on x86. Link: http://lkml.kernel.org/r/20160511190657.5898.4248.stgit@bhelgaas-glaptop2.roam.corp.google.com Signed-off-by: Bjorn Helgaas Signed-off-by: Steven Rostedt --- kernel/trace/trace_mmiotrace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 68f376ca6d3f..cd7480d0a201 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", dev->bus->number, dev->devfn, dev->vendor, dev->device, dev->irq); - /* - * XXX: is pci_resource_to_user() appropriate, since we are - * supposed to interpret the __ioremap() phys_addr argument based on - * these printed values? - */ for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; trace_seq_printf(s, " %llx", (unsigned long long)(start | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); } for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; + end = dev->resource[i].end; trace_seq_printf(s, " %llx", dev->resource[i].start < dev->resource[i].end ? (unsigned long long)(end - start) + 1 : 0); -- cgit v1.3-8-gc7d7 From 0fb71d340d355156818bb53eb36ae79a3f88bda9 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Mon, 25 Apr 2016 17:20:28 +0800 Subject: clocksource: Make clocksource insert entry more efficient In clocksource_enqueue(), it is unnecessary to continue looping the list, if we find there is an entry that the value of rating is smaller than the new one. It is safe to be out the loop, because all of entry are inserted in descending order. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Minfei Huang Signed-off-by: John Stultz --- kernel/time/clocksource.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56ece145a814..6a5a310a1a53 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs) struct list_head *entry = &clocksource_list; struct clocksource *tmp; - list_for_each_entry(tmp, &clocksource_list, list) + list_for_each_entry(tmp, &clocksource_list, list) { /* Keep track of the place, where to insert */ - if (tmp->rating >= cs->rating) - entry = &tmp->list; + if (tmp->rating < cs->rating) + break; + entry = &tmp->list; + } list_add(&cs->list, entry); } -- cgit v1.3-8-gc7d7 From 0209b937569a133dedfe930cdfff3a0d1d68c9e9 Mon Sep 17 00:00:00 2001 From: Thomas Graziadei Date: Tue, 31 May 2016 15:06:06 +0200 Subject: timekeeping: Fix 1ns/tick drift with GENERIC_TIME_VSYSCALL_OLD The user notices the problem in a raw and real time drift, calling clock_gettime with CLOCK_REALTIME / CLOCK_MONOTONIC_RAW on a system with no ntp correction taking place (no ntpd or ptp stuff running). The problem is, that old_vsyscall_fixup adds an extra 1ns even though xtime_nsec is already held in full nsecs and the remainder in this case is 0. Do the rounding up buisness only if needed. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Thomas Graziadei Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..a196e08324e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * users are removed, this can be killed. */ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); - tk->tkr_mono.xtime_nsec -= remainder; - tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; + if (remainder != 0) { + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; + } } #else #define old_vsyscall_fixup(tk) -- cgit v1.3-8-gc7d7 From af4afb40085f04f8b2ea8d28df878f7f0be02f89 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Tue, 14 Jun 2016 11:00:42 +0200 Subject: alarmtimer: Fix comments describing structure fields Updated struct alarm and struct alarm_timer descriptions. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Pratyush Patel Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 6 +++--- kernel/time/alarmtimer.c | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 52f3b7da4f2d..9d8031257a90 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -26,10 +26,10 @@ enum alarmtimer_restart { * struct alarm - Alarm timer structure * @node: timerqueue node for adding to the event list this value * also includes the expiration time. - * @period: Period for recuring alarms + * @timer: hrtimer used to schedule events while running * @function: Function pointer to be executed when the timer fires. - * @type: Alarm type (BOOTTIME/REALTIME) - * @enabled: Flag that represents if the alarm is set to fire or not + * @type: Alarm type (BOOTTIME/REALTIME). + * @state: Flag that represents if the alarm is set to fire or not. * @data: Internal data value. */ struct alarm { diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e840ed867a5d..c3aad685bbc0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -30,7 +30,6 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events - * @timer: hrtimer used to schedule events while running * @gettime: Function to read the time correlating to the base * @base_clockid: clockid for the base */ -- cgit v1.3-8-gc7d7 From e6c2682a1da36a2e79d9bab470412374434ce89e Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 8 Jun 2016 22:04:59 -0700 Subject: time: Add time64_to_tm() time_to_tm() takes time_t as an argument. time_t is not y2038 safe. Add time64_to_tm() that takes time64_t as an argument which is y2038 safe. The plan is to eventually replace all calls to time_to_tm() by time64_to_tm(). Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Deepa Dinamani Signed-off-by: John Stultz --- include/linux/time.h | 15 ++++++++++++++- kernel/time/timeconv.c | 11 ++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 297f09f23896..4cea09d94208 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -205,7 +205,20 @@ struct tm { int tm_yday; }; -void time_to_tm(time_t totalsecs, int offset, struct tm *result); +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result); + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + time64_to_tm(totalsecs, offset, result); +} /** * timespec_to_ns - Convert timespec to nanoseconds diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 86628e755f38..7142580ad94f 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = { #define SECS_PER_DAY (SECS_PER_HOUR * 24) /** - * time_to_tm - converts the calendar time to local broken-down time + * time64_to_tm - converts the calendar time to local broken-down time * * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset offset seconds adding to totalsecs. * @result pointer to struct tm variable to receive broken-down time */ -void time_to_tm(time_t totalsecs, int offset, struct tm *result) +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { long days, rem, y; + int remainder; const unsigned short *ip; - days = totalsecs / SECS_PER_DAY; - rem = totalsecs % SECS_PER_DAY; + days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); + rem = remainder; rem += offset; while (rem < 0) { rem += SECS_PER_DAY; @@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result) result->tm_mon = y; result->tm_mday = days + 1; } -EXPORT_SYMBOL(time_to_tm); +EXPORT_SYMBOL(time64_to_tm); -- cgit v1.3-8-gc7d7 From 4a19bd3d22d51a0c89db10879dacaffa0f52aecf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 18:03:02 +0200 Subject: time: Avoid timespec in udelay_test udelay_test_single() uses ktime_get_ts() to get two timespec values and calculate the difference between them, while udelay_test_show() uses the same to printk() the current monotonic time. Both of these are y2038 safe on all machines, but we want to get rid of struct timespec anyway, so this converts the code to use ktime_get_ns() and ktime_get_ts64() respectively. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/test_udelay.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index e622ba365a13..b0928ab3270f 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) int allowed_error_ns = usecs * 5; for (i = 0; i < iters; ++i) { - struct timespec ts1, ts2; + s64 kt1, kt2; int time_passed; - ktime_get_ts(&ts1); + kt1 = ktime_get_ns(); udelay(usecs); - ktime_get_ts(&ts2); - time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + kt2 = ktime_get_ns(); + time_passed = kt2 - kt1; if (i == 0 || time_passed < min) min = time_passed; @@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v) if (usecs > 0 && iters > 0) { return udelay_test_single(s, usecs, iters); } else if (usecs == 0) { - struct timespec ts; + struct timespec64 ts; - ktime_get_ts(&ts); - seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", - loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + ktime_get_ts64(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", + loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec); seq_puts(s, "usage:\n"); seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); -- cgit v1.3-8-gc7d7 From 7c71feb0a6766c7c3a262e3cc33ae231f3953cb6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 17:30:47 +0200 Subject: timer: Avoid using timespec The tstats_show() function prints a ktime_t variable by converting it to struct timespec first. The algorithm is ok, but we want to stop using timespec in general because of the 32-bit time_t overflow problem. This changes the code to use struct timespec64, without any functional change. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/timer_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1adecb4b87c8..087204c733eb 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr) static int tstats_show(struct seq_file *m, void *v) { - struct timespec period; + struct timespec64 period; struct entry *entry; unsigned long ms; long events = 0; @@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v) time = ktime_sub(time_stop, time_start); - period = ktime_to_timespec(time); + period = ktime_to_timespec64(time); ms = period.tv_nsec / 1000000; seq_puts(m, "Timer Stats Version: v0.3\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms); if (atomic_read(&overflow_count)) seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); -- cgit v1.3-8-gc7d7 From e7e15b87f86d4a48c270b81cf027eafd801e5b89 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Jun 2016 13:06:24 -0400 Subject: cgroup: allow NULL return from ss->css_alloc() cgroup core expected css_alloc to return an ERR_PTR value on failure and caused NULL deref if it returned NULL. It's an easy mistake to make from an alloc function and there's no ambiguity in what's being indicated. Update css_create() so that it interprets NULL return from css_alloc as -ENOMEM. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 78f6d18ff0af..dd26e1bb7222 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5133,6 +5133,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); + if (!css) + css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; -- cgit v1.3-8-gc7d7 From 135b8b37bd91cc82f83e98fca109b80375f5317e Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Tue, 21 Jun 2016 14:04:36 -0400 Subject: cgroup: Add pids controller event when fork fails because of pid limit This patch adds more visibility into the pids controller when the controller rejects a fork request. Whenever fork fails because the limit on the number of pids in the cgroup is reached, the controller will log this and also notify the newly added cgroups events file. The `max` key in the events file represents the number of times fork failed because of the pids controller. This change also logs only the first time the `max` event counter is incremented. This is to provide a hint to the user to understand why fork failed, as users are not yet used to seeing fork failures because of the pids controller. Signed-off-by: Kenny Yu Acked-by: Johannes Weiner cmpxchg.org> Signed-off-by: Tejun Heo --- kernel/cgroup_pids.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 303097b37429..9740ea6762de 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -49,6 +49,12 @@ struct pids_cgroup { */ atomic64_t counter; int64_t limit; + + /* Handle for "pids.events" */ + struct cgroup_file events_file; + + /* Number of times fork failed because limit was hit. */ + atomic64_t events_limit; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; + int err; css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - return pids_try_charge(pids, 1); + err = pids_try_charge(pids, 1); + if (err) { + /* Only log the first time events_limit is incremented. */ + if (atomic64_inc_return(&pids->events_limit) == 1) { + pr_info("cgroup: fork rejected by pids controller in "); + pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); + pr_cont("\n"); + } + cgroup_file_notify(&pids->events_file); + } + return err; } static void pids_cancel_fork(struct task_struct *task) @@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css, return atomic64_read(&pids->counter); } +static int pids_events_show(struct seq_file *sf, void *v) +{ + struct pids_cgroup *pids = css_pids(seq_css(sf)); + + seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit)); + return 0; +} + static struct cftype pids_files[] = { { .name = "max", @@ -300,6 +326,12 @@ static struct cftype pids_files[] = { .read_s64 = pids_current_read, .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "events", + .seq_show = pids_events_show, + .file_offset = offsetof(struct pids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, { } /* terminate */ }; -- cgit v1.3-8-gc7d7 From 9f6870dd9790dd87da1d0cf9e43e60113f3a278d Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Tue, 21 Jun 2016 11:55:35 -0700 Subject: cgroup: Use lld instead of ld when printing pids controller events_limit The `events_limit` variable needs to be formatted with %lld and not %ld. This fixes the following warning discovered by kbuild test robot: kernel/cgroup_pids.c: In function 'pids_events_show': kernel/cgroup_pids.c:313:24: warning: format '%ld' expects argument of type 'long int', but argument 3 has type 'long long int' [-Wformat=] seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit)); ^ tj: Added explicit (s64) cast as atomic64 switches between long long and long depending on 32 or 64. Signed-off-by: Kenny Yu Signed-off-by: Tejun Heo --- kernel/cgroup_pids.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 9740ea6762de..2bd673783f1a 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -310,7 +310,7 @@ static int pids_events_show(struct seq_file *sf, void *v) { struct pids_cgroup *pids = css_pids(seq_css(sf)); - seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit)); + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); return 0; } -- cgit v1.3-8-gc7d7 From d16dcd3d18759eb955e0325572d07457f93494f5 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 21 Jun 2016 10:23:22 +0100 Subject: irqdomain: Fix disposal of mappings for interrupt hierarchies The function irq_create_of_mapping() is used to create an interrupt mapping. However, depending on whether the irqdomain, to which the interrupt belongs, is part of a hierarchy, determines whether the mapping is created via calling irq_domain_alloc_irqs() or irq_create_mapping(). To dispose of the interrupt mapping, drivers call irq_dispose_mapping(). However, this function does not check to see if the irqdomain is part of a hierarchy or not and simply assumes that it was mapped via calling irq_create_mapping() so calls irq_domain_disassociate() to unmap the interrupt. Fix this by checking to see if the irqdomain is part of a hierarchy and if so call irq_domain_free_irqs() to free/unmap the interrupt. Signed-off-by: Jon Hunter Cc: Marc Zyngier Cc: Jiang Liu Link: http://lkml.kernel.org/r/1466501002-16368-1-git-send-email-jonathanh@nvidia.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index caa6a63d26f0..5d89d724a02a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -680,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - irq_domain_disassociate(domain, virq); - irq_free_desc(virq); + if (irq_domain_is_hierarchy(domain)) { + irq_domain_free_irqs(virq, 1); + } else { + irq_domain_disassociate(domain, virq); + irq_free_desc(virq); + } } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -- cgit v1.3-8-gc7d7 From be54f69c26193de31053190761e521903b89d098 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 Jun 2016 14:03:47 -0400 Subject: tracing: Skip more functions when doing stack tracing of events # echo 1 > options/stacktrace # echo 1 > events/sched/sched_switch/enable # cat trace -0 [002] d..2 1982.525169: => save_stack_trace => __ftrace_trace_stack => trace_buffer_unlock_commit_regs => event_trigger_unlock_commit => trace_event_buffer_commit => trace_event_raw_event_sched_switch => __schedule => schedule => schedule_preempt_disabled => cpu_startup_entry => start_secondary The above shows that we are seeing 6 functions before ever making it to the caller of the sched_switch event. # echo stacktrace > events/sched/sched_switch/trigger # cat trace -0 [002] d..3 2146.335208: => trace_event_buffer_commit => trace_event_raw_event_sched_switch => __schedule => schedule => schedule_preempt_disabled => cpu_startup_entry => start_secondary The stacktrace trigger isn't as bad, because it adds its own skip to the stacktracing, but still has two events extra. One issue is that if the stacktrace passes its own "regs" then there should be no addition to the skip, as the regs will not include the functions being called. This was an issue that was fixed by commit 7717c6be6999 ("tracing: Fix stacktrace skip depth in trace_buffer_unlock_commit_regs()" as adding the skip number for kprobes made the probes not have any stack at all. But since this is only an issue when regs is being used, a skip should be added if regs is NULL. Now we have: # echo 1 > options/stacktrace # echo 1 > events/sched/sched_switch/enable # cat trace -0 [000] d..2 1297.676333: => __schedule => schedule => schedule_preempt_disabled => cpu_startup_entry => rest_init => start_kernel => x86_64_start_reservations => x86_64_start_kernel # echo stacktrace > events/sched/sched_switch/trigger # cat trace -0 [002] d..3 1370.759745: => __schedule => schedule => schedule_preempt_disabled => cpu_startup_entry => start_secondary And kprobes are not touched. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 45e6747589c6..3d9f31b576f3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2118,7 +2118,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); + /* + * If regs is not set, then skip the following callers: + * trace_buffer_unlock_commit_regs + * event_trigger_unlock_commit + * trace_event_buffer_commit + * trace_event_raw_event_sched_switch + * Note, we can still get here via blktrace, wakeup tracer + * and mmiotrace, but that's ok if they lose a function or + * two. They are that meaningful. + */ + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -2168,6 +2178,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.nr_entries = 0; trace.skip = skip; + /* + * Add two, for this function and the call to save_stack_trace() + * If regs is set, then these functions will not be in the way. + */ + if (!regs) + trace.skip += 2; + /* * Since events can happen in NMIs there's no safe way to * use the per cpu ftrace_stacks. We reserve it and if an interrupt -- cgit v1.3-8-gc7d7 From d07b846f6200454c50d791796edb82660192513d Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 23 Sep 2015 15:16:04 -0500 Subject: fs: Limit file caps to the user namespace of the super block Capability sets attached to files must be ignored except in the user namespaces where the mounter is privileged, i.e. s_user_ns and its descendants. Otherwise a vector exists for gaining privileges in namespaces where a user is not already privileged. Add a new helper function, current_in_user_ns(), to test whether a user namespace is the same as or a descendant of another namespace. Use this helper to determine whether a file's capability set should be applied to the caps constructed during exec. --EWB Replaced in_userns with the simpler current_in_userns. Acked-by: Serge Hallyn Signed-off-by: Seth Forshee Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 6 ++++++ kernel/user_namespace.c | 14 ++++++++++++++ security/commoncap.c | 2 ++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8297e5b341d8..9217169c64cb 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -72,6 +72,7 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); +extern bool current_in_userns(const struct user_namespace *target_ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) @@ -100,6 +101,11 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } + +static inline bool current_in_userns(const struct user_namespace *target_ns) +{ + return true; +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c..68f594212759 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns) return allowed; } +/* + * Returns true if @ns is the same namespace as or a descendant of + * @target_ns. + */ +bool current_in_userns(const struct user_namespace *target_ns) +{ + struct user_namespace *ns; + for (ns = current_user_ns(); ns; ns = ns->parent) { + if (ns == target_ns) + return true; + } + return false; +} + static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); diff --git a/security/commoncap.c b/security/commoncap.c index e7fadde737f4..e109e6dac858 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -455,6 +455,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) return 0; + if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns)) + return 0; rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); if (rc < 0) { -- cgit v1.3-8-gc7d7 From f295e53b60eb93ee53ed5ac610374ed293caa57b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 17 Jun 2016 11:08:06 -0700 Subject: libnvdimm, pmem: allow nfit_test to override pmem_direct_access() Currently phys_to_pfn_t() is an exported symbol to allow nfit_test to override it and indicate that nfit_test-pmem is not device-mapped. Now, we want to enable nfit_test to operate without DMA_CMA and the pmem it provides will no longer be physically contiguous, i.e. won't be capable of supporting direct_access requests larger than a page. Make pmem_direct_access() a weak symbol so that it can be replaced by the tools/testing/nvdimm/ version, and move phys_to_pfn_t() to a static inline now that it no longer needs to be overridden. Acked-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 18 +++------------ drivers/nvdimm/pmem.h | 24 ++++++++++++++++++++ include/linux/pfn_t.h | 5 ++++- kernel/memremap.c | 6 ----- tools/testing/nvdimm/Kbuild | 3 ++- tools/testing/nvdimm/pmem-dax.c | 42 +++++++++++++++++++++++++++++++++++ tools/testing/nvdimm/test/iomap.c | 3 ++- tools/testing/nvdimm/test/nfit_test.h | 2 ++ 8 files changed, 79 insertions(+), 24 deletions(-) create mode 100644 drivers/nvdimm/pmem.h create mode 100644 tools/testing/nvdimm/pmem-dax.c (limited to 'kernel') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index ba253df6233f..b6fcb97a601c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -29,23 +29,10 @@ #include #include #include +#include "pmem.h" #include "pfn.h" #include "nd.h" -struct pmem_device { - /* One contiguous memory region per device */ - phys_addr_t phys_addr; - /* when non-zero this device is hosting a 'pfn' instance */ - phys_addr_t data_offset; - u64 pfn_flags; - void __pmem *virt_addr; - /* immutable base size of the namespace */ - size_t size; - /* trim size when namespace capacity has been section aligned */ - u32 pfn_pad; - struct badblocks bb; -}; - static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, unsigned int len) { @@ -163,7 +150,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, return rc; } -static long pmem_direct_access(struct block_device *bdev, sector_t sector, +/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ +__weak long pmem_direct_access(struct block_device *bdev, sector_t sector, void __pmem **kaddr, pfn_t *pfn, long size) { struct pmem_device *pmem = bdev->bd_queue->queuedata; diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h new file mode 100644 index 000000000000..c48d4e3aa346 --- /dev/null +++ b/drivers/nvdimm/pmem.h @@ -0,0 +1,24 @@ +#ifndef __NVDIMM_PMEM_H__ +#define __NVDIMM_PMEM_H__ +#include +#include +#include +#include + +long pmem_direct_access(struct block_device *bdev, sector_t sector, + void __pmem **kaddr, pfn_t *pfn, long size); +/* this definition is in it's own header for tools/testing/nvdimm to consume */ +struct pmem_device { + /* One contiguous memory region per device */ + phys_addr_t phys_addr; + /* when non-zero this device is hosting a 'pfn' instance */ + phys_addr_t data_offset; + u64 pfn_flags; + void __pmem *virt_addr; + /* immutable base size of the namespace */ + size_t size; + /* trim size when namespace capacity has been section aligned */ + u32 pfn_pad; + struct badblocks bb; +}; +#endif /* __NVDIMM_PMEM_H__ */ diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 94994810c7c0..a3d90b9da18d 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -28,7 +28,10 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags); +static inline pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) +{ + return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); +} static inline bool pfn_t_has_page(pfn_t pfn) { diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..852e5266124a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) -{ - return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); -} -EXPORT_SYMBOL(phys_to_pfn_t); - #ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 785985677159..4adfa9cdf26f 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -11,12 +11,12 @@ ldflags-y += --wrap=__devm_release_region ldflags-y += --wrap=__request_region ldflags-y += --wrap=__release_region ldflags-y += --wrap=devm_memremap_pages -ldflags-y += --wrap=phys_to_pfn_t DRIVERS := ../../../drivers NVDIMM_SRC := $(DRIVERS)/nvdimm ACPI_SRC := $(DRIVERS)/acpi DAX_SRC := $(DRIVERS)/dax +ccflags-y := -I$(src)/$(NVDIMM_SRC)/ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o @@ -31,6 +31,7 @@ nfit-y := $(ACPI_SRC)/nfit.o nfit-y += config_check.o nd_pmem-y := $(NVDIMM_SRC)/pmem.o +nd_pmem-y += pmem-dax.o nd_pmem-y += config_check.o nd_btt-y := $(NVDIMM_SRC)/btt.o diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c new file mode 100644 index 000000000000..fdba77f2bb06 --- /dev/null +++ b/tools/testing/nvdimm/pmem-dax.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2014-2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include "test/nfit_test.h" +#include +#include +#include + +long pmem_direct_access(struct block_device *bdev, sector_t sector, + void __pmem **kaddr, pfn_t *pfn, long size) +{ + struct pmem_device *pmem = bdev->bd_queue->queuedata; + resource_size_t offset = sector * 512 + pmem->data_offset; + + /* disable DAX for nfit_test pmem devices */ + if (get_nfit_res(pmem->phys_addr + offset)) { + dev_info_once(pmem->bb.dev, "dax is disabled for nfit_test\n"); + return -EIO; + } + + if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) + return -EIO; + *kaddr = pmem->virt_addr + offset; + *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); + + /* + * If badblocks are present, limit known good range to the + * requested range. + */ + if (unlikely(pmem->bb.count)) + return size; + return pmem->size - pmem->pfn_pad - offset; +} diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index c842095f2801..9966f093b4ff 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -52,7 +52,7 @@ static struct nfit_test_resource *__get_nfit_res(resource_size_t resource) return NULL; } -static struct nfit_test_resource *get_nfit_res(resource_size_t resource) +struct nfit_test_resource *get_nfit_res(resource_size_t resource) { struct nfit_test_resource *res; @@ -62,6 +62,7 @@ static struct nfit_test_resource *get_nfit_res(resource_size_t resource) return res; } +EXPORT_SYMBOL(get_nfit_res); void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, void __iomem *(*fallback_fn)(resource_size_t, unsigned long)) diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index 96c5e16d7db9..9f18e2a4a862 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -12,6 +12,7 @@ */ #ifndef __NFIT_TEST_H__ #define __NFIT_TEST_H__ +#include struct nfit_test_resource { struct list_head list; @@ -26,4 +27,5 @@ void __iomem *__wrap_ioremap_nocache(resource_size_t offset, void __wrap_iounmap(volatile void __iomem *addr); void nfit_test_setup(nfit_test_lookup_fn lookup); void nfit_test_teardown(void); +struct nfit_test_resource *get_nfit_res(resource_size_t resource); #endif -- cgit v1.3-8-gc7d7 From 65fe935dd2387a4faf15314c73f5e6d31ef0217e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Jun 2016 15:10:02 -0700 Subject: x86/KASLR, x86/power: Remove x86 hibernation restrictions With the following fix: 70595b479ce1 ("x86/power/64: Fix crash whan the hibernation code passes control to the image kernel") ... there is no longer a problem with hibernation resuming a KASLR-booted kernel image, so remove the restriction. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jonathan Corbet Cc: Len Brown Cc: Linus Torvalds Cc: Linux PM list Cc: Logan Gunthorpe Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Yinghai Lu Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20160613221002.GA29719@www.outflux.net Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 10 ++++------ arch/x86/boot/compressed/kaslr.c | 7 ------- kernel/power/hibernate.c | 6 ------ 3 files changed, 4 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82b42c958d1c..fa8c6d470ad2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1803,12 +1803,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. js= [HW,JOY] Analog joystick See Documentation/input/joystick.txt. - kaslr/nokaslr [X86] - Enable/disable kernel and module base offset ASLR - (Address Space Layout Randomization) if built into - the kernel. When CONFIG_HIBERNATION is selected, - kASLR is disabled by default. When kASLR is enabled, - hibernation will be disabled. + nokaslr [KNL] + When CONFIG_RANDOMIZE_BASE is set, this disables + kernel and module base offset ASLR (Address Space + Layout Randomization). keepinitrd [HW,ARM] diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index cfeb0259ed81..dff42177cb0c 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -471,17 +471,10 @@ unsigned char *choose_random_location(unsigned long input, unsigned long choice = output; unsigned long random_addr; -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); - goto out; - } -#else if (cmdline_find_option_bool("nokaslr")) { warn("KASLR disabled: 'nokaslr' on cmdline."); goto out; } -#endif boot_params->hdr.loadflags |= KASLR_FLAG; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254280ee..9021387c6ff4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1154,11 +1154,6 @@ static int __init nohibernate_setup(char *str) return 1; } -static int __init kaslr_nohibernate_setup(char *str) -{ - return nohibernate_setup(str); -} - static int __init page_poison_nohibernate_setup(char *str) { #ifdef CONFIG_PAGE_POISONING_ZERO @@ -1182,5 +1177,4 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); -__setup("kaslr", kaslr_nohibernate_setup); __setup("page_poison=", page_poison_nohibernate_setup); -- cgit v1.3-8-gc7d7 From 0dceeaf599e6d9b8bd908ba4bd3dfee84aa26be2 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Tue, 14 Jun 2016 14:37:27 +0800 Subject: locking/qspinlock: Use __this_cpu_dec() instead of full-blown this_cpu_dec() queued_spin_lock_slowpath() should not worry about another queued_spin_lock_slowpath() running in interrupt context and changing node->count by accident, because node->count keeps the same value every time we enter/leave queued_spin_lock_slowpath(). On some architectures this_cpu_dec() will save/restore irq flags, which has high overhead. Use the much cheaper __this_cpu_dec() instead. Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Link: http://lkml.kernel.org/r/1465886247-3773-1-git-send-email-xinhui.pan@linux.vnet.ibm.com [ Rewrote changelog. ] Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 730655533440..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -619,7 +619,7 @@ release: /* * release the node */ - this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); -- cgit v1.3-8-gc7d7 From e210bffd39d01b649c94b820c28ff112673266dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 18:51:48 +0200 Subject: sched/fair: Fix and optimize the fork() path The task_fork_fair() callback already calls __set_task_cpu() and takes rq->lock. If we move the sched_class::task_fork callback in sched_fork() under the existing p->pi_lock, right after its set_task_cpu() call, we can avoid doing two such calls and omit the IRQ disabling on the rq->lock. Change to __set_task_cpu() to skip the migration bits, this is a new task, not a migration. Similarly, make wake_up_new_task() use __set_task_cpu() for the same reason, the task hasn't actually migrated as it hasn't ever ran. This cures the problem of calling migrate_task_rq_fair(), which does remove_entity_from_load_avg() on tasks that have never been added to the load avg to begin with. This bug would result in transiently messed up load_avg values, averaged out after a few dozen milliseconds. This is probably the reason why this bug was not found for such a long time. Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 16 +++++++++++----- kernel/sched/fair.c | 27 ++++++--------------------- 2 files changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e406ba0c6891..fa3434dffbbb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2383,9 +2383,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2394,7 +2391,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2534,8 +2537,11 @@ void wake_up_new_task(struct task_struct *p) * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 73063560b9ec..994f5493ee5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4448,7 +4448,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -8289,31 +8289,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. - */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -8326,8 +8312,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* -- cgit v1.3-8-gc7d7 From 010114739d294c474764c94196d32fb92e233657 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Jun 2016 11:20:46 +0200 Subject: sched/fair: Fix PELT integrity for new groups Vincent reported that when a new task is moved into a new cgroup it gets attached twice to the load tracking: sched_move_task() task_move_group_fair() detach_task_cfs_rq() set_task_rq() attach_task_cfs_rq() attach_entity_load_avg() se->avg.last_load_update = cfs_rq->avg.last_load_update // == 0 enqueue_entity() enqueue_entity_load_avg() update_cfs_rq_load_avg() now = clock() __update_load_avg(&cfs_rq->avg) cfs_rq->avg.last_load_update = now // ages load/util for: now - 0, load/util -> 0 if (migrated) attach_entity_load_avg() se->avg.last_load_update = cfs_rq->avg.last_load_update; // now != 0 The problem is that we don't update cfs_rq load_avg before all entity attach/detach operations. Only enqueue_task() and migrate_task() do this. By fixing this, the above will not happen, because the sched_move_task() attach will have updated cfs_rq's last_load_update time before attach, and in turn the attach will have set the entity's last_load_update stamp. Note that there is a further problem with sched_move_task() calling detach on a task that hasn't yet been attached; this will be taken care of in a subsequent patch. Reported-by: Vincent Guittot Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 994f5493ee5b..dda0ccbd0f3d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3083,6 +3083,12 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + static inline void update_load_avg(struct sched_entity *se, int not_used) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -8368,6 +8374,7 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); if (!vruntime_normalized(p)) { /* @@ -8379,6 +8386,7 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); } @@ -8386,6 +8394,7 @@ static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8396,6 +8405,7 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); if (!vruntime_normalized(p)) -- cgit v1.3-8-gc7d7 From ea86cb4b7621e1298a37197005bf0abcc86348d4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 17 Jun 2016 13:38:55 +0200 Subject: sched/cgroup: Fix cpu_cgroup_fork() handling A new fair task is detached and attached from/to task_group with: cgroup_post_fork() ss->fork(child) := cpu_cgroup_fork() sched_move_task() task_move_group_fair() Which is wrong, because at this point in fork() the task isn't fully initialized and it cannot 'move' to another group, because its not attached to any group as yet. In fact, cpu_cgroup_fork() needs a small part of sched_move_task() so we can just call this small part directly instead sched_move_task(). And the task doesn't really migrate because it is not yet attached so we need the following sequence: do_fork() sched_fork() __set_task_cpu() cgroup_post_fork() set_task_rq() # set task group and runqueue wake_up_new_task() select_task_rq() can select a new cpu __set_task_cpu post_init_entity_util_avg attach_task_cfs_rq() activate_task enqueue_task This patch makes that happen. Signed-off-by: Vincent Guittot [ Added TASK_SET_GROUP to set depth properly. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 63 ++++++++++++++++++++++++++++++++++------------------ kernel/sched/fair.c | 23 ++++++++++++++++++- kernel/sched/sched.h | 5 ++++- 3 files changed, 67 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa3434dffbbb..3d856c46f6d8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7744,27 +7744,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. - */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(tsk, &rf); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -7777,11 +7759,37 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8209,9 +8217,20 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task) { - sched_move_task(task); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &rf); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dda0ccbd0f3d..64f26bc436eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8466,6 +8466,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -8478,6 +8486,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -8706,7 +8727,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 71ce9862abc3..307bd0418095 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1246,8 +1246,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group) (struct task_struct *p, int type); #endif }; -- cgit v1.3-8-gc7d7 From 7dc603c9028ea5d4354e0e317e8481df99b06d7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 13:29:28 +0200 Subject: sched/fair: Fix PELT integrity for new tasks Vincent and Yuyang found another few scenarios in which entity tracking goes wobbly. The scenarios are basically due to the fact that new tasks are not immediately attached and thereby differ from the normal situation -- a task is always attached to a cfs_rq load average (such that it includes its blocked contribution) and are explicitly detached/attached on migration to another cfs_rq. Scenario 1: switch to fair class p->sched_class = fair_class; if (queued) enqueue_task(p); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() check_class_changed() switched_from() (!fair) switched_to() (fair) switched_to_fair() attach_entity_load_avg() If @p is a new task that hasn't been fair before, it will have !last_update_time and, per the above, end up in attach_entity_load_avg() _twice_. Scenario 2: change between cgroups sched_move_group(p) if (queued) dequeue_task() task_move_group_fair() detach_task_cfs_rq() detach_entity_load_avg() set_task_rq() attach_task_cfs_rq() attach_entity_load_avg() if (queued) enqueue_task(); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() Similar as with scenario 1, if @p is a new task, it will have !load_update_time and we'll end up in attach_entity_load_avg() _twice_. Furthermore, notice how we do a detach_entity_load_avg() on something that wasn't attached to begin with. As stated above; the problem is that the new task isn't yet attached to the load tracking and thereby violates the invariant assumption. This patch remedies this by ensuring a new task is indeed properly attached to the load tracking on creation, through post_init_entity_util_avg(). Of course, this isn't entirely as straightforward as one might think, since the task is hashed before we call wake_up_new_task() and thus can be poked at. We avoid this by adding TASK_NEW and teaching cpu_cgroup_can_attach() to refuse such tasks. Reported-by: Yuyang Du Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++-- kernel/sched/core.c | 28 +++++++++++++++++++++++----- kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 63 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b45acfd18f4e..d99218a1e043 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p); #define TASK_WAKING 256 #define TASK_PARKED 512 #define TASK_NOLOAD 1024 -#define TASK_STATE_MAX 2048 +#define TASK_NEW 2048 +#define TASK_STATE_MAX 4096 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3d856c46f6d8..14afa518948c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2383,6 +2383,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } + init_entity_runnable_average(&p->se); + /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2529,9 +2531,8 @@ void wake_up_new_task(struct task_struct *p) struct rq_flags rf; struct rq *rq; - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -8237,6 +8238,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8247,8 +8249,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. + */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 64f26bc436eb..0c21a12c0205 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,6 +690,10 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -720,6 +724,7 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -733,16 +738,37 @@ void post_init_entity_util_avg(struct sched_entity *se) } sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = now; + return; + } + } + + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -2840,8 +2866,6 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -2951,6 +2975,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. + * + * Or we're fresh through post_init_entity_util_avg(). */ if (se->avg.last_update_time) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -3056,11 +3082,14 @@ void remove_entity_load_avg(struct sched_entity *se) u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; last_update_time = cfs_rq_last_update_time(cfs_rq); -- cgit v1.3-8-gc7d7 From 3d30544f02120b884bba2a9466c87dba980e3be5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2016 14:27:50 +0200 Subject: sched/fair: Apply more PELT fixes One additional 'rule' for using update_cfs_rq_load_avg() is that one should call update_tg_load_avg() if it returns true. Add a bunch of comments to hopefully clarify some of the rules: o You need to update cfs_rq _before_ any entity attach/detach, this is important, because while for mathmatical consisency this isn't strictly needed, it is required for the physical interpretation of the model, you attach/detach _now_. o When you modify the cfs_rq avg, you have to then call update_tg_load_avg() in order to propagate changes upwards. o (Fair) entities are always attached, switched_{to,from}_fair() deal with !fair. This directly follows from the definition of the cfs_rq averages, namely that they are a direct sum of all (runnable or blocked) entities on that rq. It is the second rule that this patch enforces, but it adds comments pertaining to all of them. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0c21a12c0205..781788d54736 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -692,6 +692,7 @@ void init_entity_runnable_average(struct sched_entity *se) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); /* @@ -725,6 +726,7 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -757,8 +759,10 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } #else /* !CONFIG_SMP */ @@ -768,6 +772,9 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct sched_entity *se) { } +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} #endif /* CONFIG_SMP */ /* @@ -2912,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { @@ -2967,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) update_tg_load_avg(cfs_rq, 0); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2998,6 +3029,14 @@ skip_aging: cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -8404,6 +8443,7 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -8415,8 +8455,10 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -8424,6 +8466,7 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8434,8 +8477,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; -- cgit v1.3-8-gc7d7 From 8663e24d56dc1f093232783c23ea17f2a6f61c03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jun 2016 14:58:02 +0200 Subject: sched/fair: Reorder cgroup creation code A future patch needs rq->lock held _after_ we link the task_group into the hierarchy. In order to avoid taking every rq->lock twice, reorder things a little and create online_fair_sched_group() to be called after we link the task_group. All this code is still ran from css_alloc() so css_online() isn't in fact used for this. Signed-off-by: Peter Zijlstra (Intel) Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 22 ++++++++++++++++++---- kernel/sched/sched.h | 1 + 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 14afa518948c..4ede4fc65653 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7717,6 +7717,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); + + online_fair_sched_group(tg); } /* rcu callback to free various structures associated with a task group */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 781788d54736..62d5e7dcc7f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8624,10 +8624,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); - - raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); - raw_spin_unlock_irq(&rq->lock); } return 1; @@ -8638,6 +8634,22 @@ err: return 0; } +void online_fair_sched_group(struct task_group *tg) +{ + struct sched_entity *se; + struct rq *rq; + int i; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + se = tg->se[i]; + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); + } +} + void unregister_fair_sched_group(struct task_group *tg) { unsigned long flags; @@ -8742,6 +8754,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } +void online_fair_sched_group(struct task_group *tg) { } + void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 307bd0418095..28c42b789f70 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, -- cgit v1.3-8-gc7d7 From 599b4840b0ea453c7d11e1798dcc8f494dcfd58a Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Sun, 26 Jun 2016 16:13:23 -0500 Subject: sched/core: Fix sched_getaffinity() return value kerneldoc comment Previous version was probably written referencing the man page for glibc's wrapper, but the wrapper's behavior differs from that of the syscall itself in this case. Signed-off-by: Zev Weiss Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1466975603-25408-1-git-send-email-zev@bewilderbeest.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4ede4fc65653..28da50a5bc76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4759,7 +4759,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) -- cgit v1.3-8-gc7d7 From 55e16d30bd99510900caec913c90f53bc2b35cba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jun 2016 15:14:26 +0200 Subject: sched/fair: Rework throttle_count sync Since we already take rq->lock when creating a cgroup, use it to also sync the throttle_count and avoid the extra state and enqueue path branch. Signed-off-by: Peter Zijlstra (Intel) Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: linux-kernel@vger.kernel.org [ Fixed build warning. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 +++++++++++++++++++-------------------- kernel/sched/sched.h | 2 +- 2 files changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62d5e7dcc7f8..4088eedea763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4241,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* Synchronize hierarchical throttle counter: */ - if (unlikely(!cfs_rq->throttle_uptodate)) { - struct rq *rq = rq_of(cfs_rq); - struct cfs_rq *pcfs_rq; - struct task_group *tg; - - cfs_rq->throttle_uptodate = 1; - - /* Get closest up-to-date node, because leaves go first: */ - for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { - pcfs_rq = tg->cfs_rq[cpu_of(rq)]; - if (pcfs_rq->throttle_uptodate) - break; - } - if (tg) { - cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(rq); - } - } - /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4275,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } +static void sync_throttle(struct task_group *tg, int cpu) +{ + struct cfs_rq *pcfs_rq, *cfs_rq; + + if (!cfs_bandwidth_used()) + return; + + if (!tg->parent) + return; + + cfs_rq = tg->cfs_rq[cpu]; + pcfs_rq = tg->parent->cfs_rq[cpu]; + + cfs_rq->throttle_count = pcfs_rq->throttle_count; + pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); +} + /* conditionally throttle active cfs_rq's from put_prev_entity() */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4414,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -8646,6 +8644,7 @@ void online_fair_sched_group(struct task_group *tg) raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 28c42b789f70..f44da95c70cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -438,7 +438,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count, throttle_uptodate; + int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.3-8-gc7d7 From 86b2efbe3a390e07dbba725ef700b0d143e9a385 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 24 Jun 2016 16:35:46 -0400 Subject: audit: add fields to exclude filter by reusing user filter RFE: add additional fields for use in audit filter exclude rules https://github.com/linux-audit/audit-kernel/issues/5 Re-factor and combine audit_filter_type() with audit_filter_user() to use audit_filter_user_rules() to enable the exclude filter to additionally filter on PID, UID, GID, AUID, LOGINUID_SET, SUBJ_*. The process of combining the similar audit_filter_user() and audit_filter_type() functions, required inverting the meaning and including the ALWAYS action of the latter. Include audit_filter_user_rules() into audit_filter(), removing unneeded logic in the process. Keep the check to quit early if the list is empty. Signed-off-by: Richard Guy Briggs [PM: checkpatch.pl fixes - whitespace damage, wrapped description] Signed-off-by: Paul Moore --- include/linux/audit.h | 2 - kernel/audit.c | 4 +- kernel/audit.h | 2 + kernel/auditfilter.c | 151 ++++++++++++++++++-------------------------------- 4 files changed, 57 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index e38e3fc13ea8..9d4443f93db6 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -163,8 +163,6 @@ extern void audit_log_task_info(struct audit_buffer *ab, extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ -extern int audit_filter_user(int type); -extern int audit_filter_type(int type); extern int audit_rule_change(int type, __u32 portid, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); diff --git a/kernel/audit.c b/kernel/audit.c index 678c3f000191..994588ef9489 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -934,7 +934,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(msg_type); + err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { @@ -1382,7 +1382,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (audit_initialized != AUDIT_INITIALIZED) return NULL; - if (unlikely(audit_filter_type(type))) + if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; if (gfp_mask & __GFP_DIRECT_RECLAIM) { diff --git a/kernel/audit.h b/kernel/audit.h index cbbe6bb6496e..1879f02cb2c3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -327,6 +327,8 @@ extern pid_t audit_sig_pid; extern kuid_t audit_sig_uid; extern u32 audit_sig_sid; +extern int audit_filter(int msgtype, unsigned int listtype); + #ifdef CONFIG_AUDITSYSCALL extern int __audit_signal_info(int sig, struct task_struct *t); static inline int audit_signal_info(int sig, struct task_struct *t) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ff59a5eed691..85d9cac497e4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1290,117 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct audit_krule *rule, int type, - enum audit_state *state) -{ - int i; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - pid_t pid; - int result = 0; - u32 sid; - - switch (f->type) { - case AUDIT_PID: - pid = task_pid_nr(current); - result = audit_comparator(pid, f->op, f->val); - break; - case AUDIT_UID: - result = audit_uid_comparator(current_uid(), f->op, f->uid); - break; - case AUDIT_GID: - result = audit_gid_comparator(current_gid(), f->op, f->gid); - break; - case AUDIT_LOGINUID: - result = audit_uid_comparator(audit_get_loginuid(current), - f->op, f->uid); - break; - case AUDIT_LOGINUID_SET: - result = audit_comparator(audit_loginuid_set(current), - f->op, f->val); - break; - case AUDIT_MSGTYPE: - result = audit_comparator(type, f->op, f->val); - break; - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - if (f->lsm_rule) { - security_task_getsecid(current, &sid); - result = security_audit_rule_match(sid, - f->type, - f->op, - f->lsm_rule, - NULL); - } - break; - } - - if (result <= 0) - return result; - } - switch (rule->action) { - case AUDIT_NEVER: - *state = AUDIT_DISABLED; - break; - case AUDIT_ALWAYS: - *state = AUDIT_RECORD_CONTEXT; - break; - } - return 1; -} - -int audit_filter_user(int type) -{ - enum audit_state state = AUDIT_DISABLED; - struct audit_entry *e; - int rc, ret; - - ret = 1; /* Audit by default */ - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - rc = audit_filter_user_rules(&e->rule, type, &state); - if (rc) { - if (rc > 0 && state == AUDIT_DISABLED) - ret = 0; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -int audit_filter_type(int type) +int audit_filter(int msgtype, unsigned int listtype) { struct audit_entry *e; - int result = 0; + int ret = 1; /* Audit by default */ rcu_read_lock(); - if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) + if (list_empty(&audit_filter_list[listtype])) goto unlock_and_return; + list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { + int i, result = 0; - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], - list) { - int i; for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - if (f->type == AUDIT_MSGTYPE) { - result = audit_comparator(type, f->op, f->val); - if (!result) - break; + pid_t pid; + u32 sid; + + switch (f->type) { + case AUDIT_PID: + pid = task_pid_nr(current); + result = audit_comparator(pid, f->op, f->val); + break; + case AUDIT_UID: + result = audit_uid_comparator(current_uid(), f->op, f->uid); + break; + case AUDIT_GID: + result = audit_gid_comparator(current_gid(), f->op, f->gid); + break; + case AUDIT_LOGINUID: + result = audit_uid_comparator(audit_get_loginuid(current), + f->op, f->uid); + break; + case AUDIT_LOGINUID_SET: + result = audit_comparator(audit_loginuid_set(current), + f->op, f->val); + break; + case AUDIT_MSGTYPE: + result = audit_comparator(msgtype, f->op, f->val); + break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + if (f->lsm_rule) { + security_task_getsecid(current, &sid); + result = security_audit_rule_match(sid, + f->type, f->op, f->lsm_rule, NULL); + } + break; + default: + goto unlock_and_return; } + if (result < 0) /* error */ + goto unlock_and_return; + if (!result) + break; + } + if (result > 0) { + if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE) + ret = 0; + break; } - if (result) - goto unlock_and_return; } unlock_and_return: rcu_read_unlock(); - return result; + return ret; } static int update_lsm_rule(struct audit_krule *r) -- cgit v1.3-8-gc7d7 From 7fa8b7171a638ad896acabd9a17183b75b70aeb4 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 17 Jun 2016 22:44:54 -0700 Subject: tracing/function_graph: Fix filters for function_graph threshold Function graph tracer currently ignores filters if tracing_thresh is set. For example, even if set_ftrace_pid is set, then its ignored if tracing_thresh set, resulting in all processes being traced. To fix this, we reuse the same entry function as when tracing_thresh is not set and do everything as in the regular case except for writing the function entry to the ring buffer. Link: http://lkml.kernel.org/r/1466228694-2677-1-git-send-email-agnel.joel@gmail.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 67cce7896aeb..7363ccf79512 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_notrace_addr(trace->func)) return 1; + /* + * Stop here if tracing_threshold is set. We only write function return + * events to the ring buffer. + */ + if (tracing_thresh) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) -{ - if (tracing_thresh) - return 1; - else - return trace_graph_entry(trace); -} - static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long flags, int pc) @@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr) set_graph_array(tr); if (tracing_thresh) ret = register_ftrace_graph(&trace_graph_thresh_return, - &trace_graph_thresh_entry); + &trace_graph_entry); else ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); -- cgit v1.3-8-gc7d7 From ea00f4f4f00cc2bc3b63ad512a4e6df3b20832b9 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Sun, 19 Jun 2016 23:52:27 -0700 Subject: PM / sleep: make PM notifiers called symmetrically This makes pm notifier PREPARE/POST symmetrical: if PREPARE fails, we will only undo what ever happened on PREPARE. It fixes the unbalanced CPU hotplug enable in CPU PM notifier. Signed-off-by: Lianwei Wang Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 20 ++++++++++++-------- kernel/power/main.c | 11 +++++++++-- kernel/power/power.h | 2 ++ kernel/power/suspend.c | 10 ++++++---- kernel/power/user.c | 14 ++++++++------ 5 files changed, 37 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254280ee..126e24caa82e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -647,7 +647,7 @@ static void power_down(void) */ int hibernate(void) { - int error; + int error, nr_calls = 0; if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); @@ -662,9 +662,11 @@ int hibernate(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Exit; + } printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); @@ -714,7 +716,7 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: @@ -740,7 +742,7 @@ int hibernate(void) */ static int software_resume(void) { - int error; + int error, nr_calls = 0; unsigned int flags; /* @@ -827,9 +829,11 @@ static int software_resume(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Close_Finish; + } pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); @@ -855,7 +859,7 @@ static int software_resume(void) unlock_device_hotplug(); thaw_processes(); Finish: - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ diff --git a/kernel/power/main.c b/kernel/power/main.c index 27946975eff0..5ea50b1b7595 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int pm_notifier_call_chain(unsigned long val) +int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) { - int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + int ret; + + ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, + nr_to_call, nr_calls); return notifier_to_errno(ret); } +int pm_notifier_call_chain(unsigned long val) +{ + return __pm_notifier_call_chain(val, -1, NULL); +} /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; diff --git a/kernel/power/power.h b/kernel/power/power.h index efe1b3b17c88..51f02ecaf125 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -200,6 +200,8 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ +extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, + int *nr_calls); extern int pm_notifier_call_chain(unsigned long val); #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5b70d64b871e..0acab9d7f96f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -266,16 +266,18 @@ static int suspend_test(int level) */ static int suspend_prepare(suspend_state_t state) { - int error; + int error, nr_calls = 0; if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Finish; + } trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); @@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state) suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); + __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); pm_restore_console(); return error; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 526e8911460a..35310b627388 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1); static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - int error; + int error, nr_calls = 0; if (!hibernation_available()) return -EPERM; @@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; data->free_bitmaps = false; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); } else { /* * Resuming. We may need to wait for the image device to @@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; - } + } else + nr_calls--; + if (error) - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) atomic_inc(&snapshot_device_available); -- cgit v1.3-8-gc7d7 From 1ca1cc98bf7418c680415bfce05699f67510a7fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:23 +0200 Subject: bpf: minor cleanups on fd maps and helpers Some minor cleanups: i) Remove the unlikely() from fd array map lookups and let the CPU branch predictor do its job, scenarios where there is not always a map entry are very well valid. ii) Move the attribute type check in the bpf_perf_event_read() helper a bit earlier so it's consistent wrt checks with bpf_perf_event_output() helper as well. iii) remove some comments that are self-documenting in kprobe_prog_is_valid_access() and therefore make it consistent to tp_prog_is_valid_access() as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 +-- kernel/trace/bpf_trace.c | 18 ++++++------------ 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b94a36550591..d638062f66d6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -719,14 +719,13 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); - if (unlikely(!prog)) + if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3de25fbed785..4e61f74a5d73 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -199,19 +199,19 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); - if (unlikely(!ee)) + if (!ee) return -ENOENT; event = ee->event; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) return -EINVAL; - if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && - event->attr.type != PERF_TYPE_RAW)) - return -EINVAL; - /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as @@ -251,7 +251,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); - if (unlikely(!ee)) + if (!ee) return -ENOENT; event = ee->event; @@ -354,18 +354,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) return false; - - /* only read is allowed */ if (type != BPF_READ) return false; - - /* disallow misaligned access */ if (off % size != 0) return false; - return true; } -- cgit v1.3-8-gc7d7 From d79313303181d357d293453fb8486bdc87bfd2f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:24 +0200 Subject: bpf, trace: fetch current cpu only once We currently have two invocations, which is unnecessary. Fetch it only once and use the smp_processor_id() variant, so we also get preemption checks along with it when DEBUG_PREEMPT is set. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e61f74a5d73..505f9e9cdb3b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -233,6 +233,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; @@ -246,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; if (index == BPF_F_CURRENT_CPU) - index = raw_smp_processor_id(); + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; @@ -259,7 +260,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; - if (unlikely(event->oncpu != smp_processor_id())) + if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); -- cgit v1.3-8-gc7d7 From 6816a7ffce32e999601825ddfd887f36d3052932 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:25 +0200 Subject: bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_read Follow-up commit to 1e33759c788c ("bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_output") to add the same functionality into bpf_perf_event_read() helper. The split of index into flags and index component is also safe here, since such large maps are rejected during map allocation time. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 +- kernel/trace/bpf_trace.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 406459b935a2..58df2da3e9bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -347,7 +347,7 @@ enum bpf_func_id { #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) -/* BPF_FUNC_perf_event_output flags. */ +/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 505f9e9cdb3b..19c5b4a5c3eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -188,13 +188,19 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; struct perf_event *event; + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + if (index == BPF_F_CURRENT_CPU) + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; @@ -208,8 +214,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return -EINVAL; /* make sure event is local and doesn't have pmu::count */ - if (event->oncpu != smp_processor_id() || - event->pmu->count) + if (unlikely(event->oncpu != cpu || event->pmu->count)) return -EINVAL; /* -- cgit v1.3-8-gc7d7 From 80b48c445797a634d869c7e5a53e182ba2688931 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:26 +0200 Subject: bpf: don't use raw processor id in generic helper Use smp_processor_id() for the generic helper bpf_get_smp_processor_id() instead of the raw variant. This allows for preemption checks when we have DEBUG_PREEMPT, and otherwise uses the raw variant anyway. We only need to keep the raw variant for socket filters, but we can reuse the helper that is already there from cBPF side. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/helpers.c | 2 +- net/core/filter.c | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ad7a0573f71b..1ea3afba1a4f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { - return raw_smp_processor_id(); + return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { diff --git a/net/core/filter.c b/net/core/filter.c index cb9fc16cac46..46c88d9cec5c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -150,6 +150,12 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return raw_smp_processor_id(); } +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = __get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -2037,7 +2043,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; + return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: @@ -2086,6 +2092,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_perf_event_output: return bpf_get_event_output_proto(); + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.3-8-gc7d7 From eaaa7ec71bff4cb34d9025ed89068d4b3cac3df0 Mon Sep 17 00:00:00 2001 From: Gregor Boirie Date: Wed, 9 Mar 2016 19:05:48 +0100 Subject: timekeeping: export get_monotonic_coarse64 symbol EXPORT_SYMBOL() get_monotonic_coarse64 for new IIO timestamping clock selection usage. This provides user apps the ability to request a particular IIO device to timestamp samples using a monotonic coarse clock granularity. Signed-off-by: Gregor Boirie Signed-off-by: Jonathan Cameron --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..255e225393ac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2186,6 +2186,7 @@ struct timespec64 get_monotonic_coarse64(void) return now; } +EXPORT_SYMBOL(get_monotonic_coarse64); /* * Must hold jiffies_lock -- cgit v1.3-8-gc7d7 From 5f65e5ca286126a60f62c8421b77c2018a482b8a Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 26 Apr 2016 14:36:24 -0500 Subject: cred: Reject inodes with invalid ids in set_create_file_as() Using INVALID_[UG]ID for the LSM file creation context doesn't make sense, so return an error if the inode passed to set_create_file_as() has an invalid id. Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/cred.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 0c0cd8a62285..5f264fb5737d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx); */ int set_create_files_as(struct cred *new, struct inode *inode) { + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); -- cgit v1.3-8-gc7d7 From 6168f8ed01dc46a277908938294f1132d723f58d Mon Sep 17 00:00:00 2001 From: Wei Jiangang Date: Wed, 29 Jun 2016 12:51:50 +0800 Subject: timers/nohz: Fix several typos Signed-off-by: Wei Jiangang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Link: http://lkml.kernel.org/r/1467175910-2966-2-git-send-email-weijg.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..6d83e9c4a302 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta.tv64 < tick_period.tv64) return; - /* Reevalute with jiffies_lock held */ + /* Reevaluate with jiffies_lock held */ write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); @@ -117,7 +117,7 @@ static void tick_sched_do_timer(ktime_t now) /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to + * into a long sleep. If two cpus happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. */ @@ -571,7 +571,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) * @last_update_time: variable to store update time in. Do not update * counters if NULL. * - * Return the cummulative idle time (since boot) for a given + * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, @@ -612,7 +612,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * @last_update_time: variable to store update time in. Do not update * counters if NULL. * - * Return the cummulative iowait time (since boot) for a given + * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, @@ -733,7 +733,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * do_timer() never invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. If this cpu * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferement value. + * sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); -- cgit v1.3-8-gc7d7 From 0de7611a1031f25b713fda7d36de44f17c2ed790 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 1 Jul 2016 12:42:35 +0200 Subject: timers/nohz: Capitalize 'CPU' consistently While reviewing another patch I noticed that kernel/time/tick-sched.c had a charmingly (confusingly, annoyingly) rich set of variants for spelling 'CPU': cpu cpus CPU CPUs per CPU per-CPU per cpu ... sometimes these were mixed even within the same comment block! Compress these variants down to a single consistent set of: CPU CPUs per-CPU Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6d83e9c4a302..db57d1ba73eb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -31,7 +31,7 @@ #include /* - * Per cpu nohz control structure + * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); @@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now) #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themselves to + * concurrency: This happens only when the CPU in charge went + * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. */ @@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: - * perf events, posix cpu timers, ... + * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { @@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void) * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy - * value. We do this unconditionally on any cpu, as we don't know whether the - * cpu, which has the update task assigned is in a long sleep. + * value. We do this unconditionally on any CPU, as we don't know whether the + * CPU, which has the update task assigned is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { @@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now) } /* - * Updates the per cpu time idle statistics counters + * Updates the per-CPU time idle statistics counters */ static void update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) @@ -566,7 +566,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) } /** - * get_cpu_idle_time_us - get the total idle time of a cpu + * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. @@ -607,7 +607,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** - * get_cpu_iowait_time_us - get the total iowait time of a cpu + * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. @@ -726,12 +726,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } /* - * If this cpu is the one which updates jiffies, then give up - * the assignment and let it be taken by the cpu which runs - * the tick timer next, which might be this cpu as well. If we + * If this CPU is the one which updates jiffies, then give up + * the assignment and let it be taken by the CPU which runs + * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this cpu + * was the one which had the do_timer() duty last. If this CPU * is the one which had the do_timer() duty last, we limit the * sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. @@ -841,9 +841,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* - * If this cpu is offline and it is the one which updates + * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop + * the CPU which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ @@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void) WARN_ON_ONCE(irqs_disabled()); /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. - */ + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ set_cpu_sd_state_idle(); local_irq_disable(); @@ -1211,7 +1211,7 @@ void tick_setup_sched_timer(void) hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - /* Get the next period (per cpu) */ + /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert jiffies_lock contention. */ -- cgit v1.3-8-gc7d7 From 1aacde3d22c42281236155c1ef6d7a5aa32a826b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jun 2016 17:24:43 +0200 Subject: bpf: generally move prog destruction to RCU deferral Jann Horn reported following analysis that could potentially result in a very hard to trigger (if not impossible) UAF race, to quote his event timeline: - Set up a process with threads T1, T2 and T3 - Let T1 set up a socket filter F1 that invokes another filter F2 through a BPF map [tail call] - Let T1 trigger the socket filter via a unix domain socket write, don't wait for completion - Let T2 call PERF_EVENT_IOC_SET_BPF with F2, don't wait for completion - Now T2 should be behind bpf_prog_get(), but before bpf_prog_put() - Let T3 close the file descriptor for F2, dropping the reference count of F2 to 2 - At this point, T1 should have looked up F2 from the map, but not finished executing it - Let T3 remove F2 from the BPF map, dropping the reference count of F2 to 1 - Now T2 should call bpf_prog_put() (wrong BPF program type), dropping the reference count of F2 to 0 and scheduling bpf_prog_free_deferred() via schedule_work() - At this point, the BPF program could be freed - BPF execution is still running in a freed BPF program While at PERF_EVENT_IOC_SET_BPF time it's only guaranteed that the perf event fd we're doing the syscall on doesn't disappear from underneath us for whole syscall time, it may not be the case for the bpf fd used as an argument only after we did the put. It needs to be a valid fd pointing to a BPF program at the time of the call to make the bpf_prog_get() and while T2 gets preempted, F2 must have dropped reference to 1 on the other CPU. The fput() from the close() in T3 should also add additionally delay to the reference drop via exit_task_work() when bpf_prog_release() gets called as well as scheduling bpf_prog_free_deferred(). That said, it makes nevertheless sense to move the BPF prog destruction generally after RCU grace period to guarantee that such scenario above, but also others as recently fixed in ceb56070359b ("bpf, perf: delay release of BPF prog after grace period") with regards to tail calls won't happen. Integrating bpf_prog_free_deferred() directly into the RCU callback is not allowed since the invocation might happen from either softirq or process context, so we're not permitted to block. Reviewing all bpf_prog_put() invocations from eBPF side (note, cBPF -> eBPF progs don't use this for their destruction) with call_rcu() look good to me. Since we don't know whether at the time of attaching the program, we're already part of a tail call map, we need to use RCU variant. However, due to this, there won't be severely more stress on the RCU callback queue: situations with above bpf_prog_get() and bpf_prog_put() combo in practice normally won't lead to releases, but even if they would, enough effort/ cycles have to be put into loading a BPF program into the kernel already. Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 ----- kernel/bpf/arraymap.c | 4 +--- kernel/bpf/syscall.c | 13 +++---------- kernel/events/core.c | 2 +- 4 files changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8411032ac90d..749549888b86 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -220,7 +220,6 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); -void bpf_prog_put_rcu(struct bpf_prog *prog); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); @@ -281,10 +280,6 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd) static inline void bpf_prog_put(struct bpf_prog *prog) { } - -static inline void bpf_prog_put_rcu(struct bpf_prog *prog) -{ -} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5af30732697b..4ec57a649b1f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -390,9 +390,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, static void prog_fd_array_put_ptr(void *ptr) { - struct bpf_prog *prog = ptr; - - bpf_prog_put_rcu(prog); + bpf_prog_put(ptr); } /* decrement refcnt of all bpf_progs that are stored in this map */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c23a4e9311b3..f6806a1d7ed9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -623,7 +623,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } -static void __prog_put_common(struct rcu_head *rcu) +static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -632,17 +632,10 @@ static void __prog_put_common(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -/* version of bpf_prog_put() that is called after a grace period */ -void bpf_prog_put_rcu(struct bpf_prog *prog) -{ - if (atomic_dec_and_test(&prog->aux->refcnt)) - call_rcu(&prog->aux->rcu, __prog_put_common); -} - void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) - __prog_put_common(&prog->aux->rcu); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -650,7 +643,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 85cd41878a74..9c51ec3f0f44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7529,7 +7529,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) prog = event->tp_event->prog; if (prog) { event->tp_event->prog = NULL; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); } } -- cgit v1.3-8-gc7d7 From 113214be7f6c98dd6d0435e4765aea8dea91662c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jun 2016 17:24:44 +0200 Subject: bpf: refactor bpf_prog_get and type check into helper Since bpf_prog_get() and program type check is used in a couple of places, refactor this into a small helper function that we can make use of. Since the non RO prog->aux part is not used in performance critical paths and a program destruction via RCU is rather very unlikley when doing the put, we shouldn't have an issue just doing the bpf_prog_get() + prog->type != type check, but actually not taking the ref at all (due to being in fdget() / fdput() section of the bpf fd) is even cleaner and makes the diff smaller as well, so just go for that. Callsites are changed to make use of the new helper where possible. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 +++++++ kernel/bpf/syscall.c | 27 +++++++++++++++++++-------- net/core/filter.c | 13 +------------ net/kcm/kcmsock.c | 8 +------- net/packet/af_packet.c | 6 +----- net/sched/act_bpf.c | 7 +------ net/sched/cls_bpf.c | 7 +------ 7 files changed, 31 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 749549888b86..b3336b4f5d04 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -218,6 +218,7 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); +struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); @@ -277,6 +278,12 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } +static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, + enum bpf_prog_type type) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline void bpf_prog_put(struct bpf_prog *prog) { } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f6806a1d7ed9..22863d9872b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -657,7 +657,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog) O_RDWR | O_CLOEXEC); } -static struct bpf_prog *__bpf_prog_get(struct fd f) +static struct bpf_prog *____bpf_prog_get(struct fd f) { if (!f.file) return ERR_PTR(-EBADF); @@ -678,24 +678,35 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) return prog; } -/* called by sockets/tracing/seccomp before attaching program to an event - * pairs with bpf_prog_put() - */ -struct bpf_prog *bpf_prog_get(u32 ufd) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = __bpf_prog_get(f); + prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; + if (type && prog->type != *type) { + prog = ERR_PTR(-EINVAL); + goto out; + } prog = bpf_prog_inc(prog); +out: fdput(f); - return prog; } -EXPORT_SYMBOL_GPL(bpf_prog_get); + +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + return __bpf_prog_get(ufd, NULL); +} + +struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) +{ + return __bpf_prog_get(ufd, &type); +} +EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD kern_version diff --git a/net/core/filter.c b/net/core/filter.c index 76f9a4938be4..76fee35da244 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1301,21 +1301,10 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog; - if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); - prog = bpf_prog_get(ufd); - if (IS_ERR(prog)) - return prog; - - if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(prog); - return ERR_PTR(-EINVAL); - } - - return prog; + return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 0b68ba730a06..cb39e05b166c 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1765,18 +1765,12 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) if (!csock) return -ENOENT; - prog = bpf_prog_get(info->bpf_fd); + prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } - if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(prog); - err = -EINVAL; - goto out; - } - err = kcm_attach(sock, csock, prog); if (err) { bpf_prog_put(prog); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d1f3b9e977e5..48b58957adf4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1588,13 +1588,9 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, if (copy_from_user(&fd, data, len)) return -EFAULT; - new = bpf_prog_get(fd); + new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); - if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) { - bpf_prog_put(new); - return -EINVAL; - } __fanout_set_data_bpf(po->fanout, new); return 0; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index f7b6cf49ea6f..ef74bffa6101 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -223,15 +223,10 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); - fp = bpf_prog_get(bpf_fd); + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT); if (IS_ERR(fp)) return PTR_ERR(fp); - if (fp->type != BPF_PROG_TYPE_SCHED_ACT) { - bpf_prog_put(fp); - return -EINVAL; - } - if (tb[TCA_ACT_BPF_NAME]) { name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]), nla_len(tb[TCA_ACT_BPF_NAME]), diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 7b342c779da7..c3002c2c68bb 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -272,15 +272,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); - fp = bpf_prog_get(bpf_fd); + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); if (IS_ERR(fp)) return PTR_ERR(fp); - if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { - bpf_prog_put(fp); - return -EINVAL; - } - if (tb[TCA_BPF_NAME]) { name = kmemdup(nla_data(tb[TCA_BPF_NAME]), nla_len(tb[TCA_BPF_NAME]), -- cgit v1.3-8-gc7d7 From 1f3fe7ebf6136c341012db9f554d4caa566fcbaa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:42 -0700 Subject: cgroup: Add cgroup_get_from_fd Add a helper function to get a cgroup2 from a fd. It will be stored in a bpf array (BPF_MAP_TYPE_CGROUP_ARRAY) which will be introduced in the later patch. Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a20320c666fd..984f73b719a9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -87,6 +87,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); +struct cgroup *cgroup_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75c0ff00aca6..50787cd61da2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -62,6 +62,7 @@ #include #include #include +#include #include /* @@ -6209,6 +6210,40 @@ struct cgroup *cgroup_get_from_path(const char *path) } EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory. Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + struct file *f; + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + fput(f); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. -- cgit v1.3-8-gc7d7 From 4ed8ec521ed57c4e207ad464ca0388776de74d4b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:43 -0700 Subject: cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY Add a BPF_MAP_TYPE_CGROUP_ARRAY and its bpf_map_ops's implementations. To update an element, the caller is expected to obtain a cgroup2 backed fd by open(cgroup2_dir) and then update the array with that fd. Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 2 ++ 4 files changed, 48 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index be6ac1291680..26c04be32003 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,6 +84,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4ec57a649b1f..db1a743e3db2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -537,3 +537,46 @@ static int __init register_perf_event_array_map(void) return 0; } late_initcall(register_perf_event_array_map); + +#ifdef CONFIG_SOCK_CGROUP_DATA +static void *cgroup_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int fd) +{ + return cgroup_get_from_fd(fd); +} + +static void cgroup_fd_array_put_ptr(void *ptr) +{ + /* cgroup_put free cgrp after a rcu grace period */ + cgroup_put(ptr); +} + +static void cgroup_fd_array_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static const struct bpf_map_ops cgroup_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = cgroup_fd_array_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = cgroup_fd_array_get_ptr, + .map_fd_put_ptr = cgroup_fd_array_put_ptr, +}; + +static struct bpf_map_type_list cgroup_array_type __read_mostly = { + .ops = &cgroup_array_ops, + .type = BPF_MAP_TYPE_CGROUP_ARRAY, +}; + +static int __init register_cgroup_array_map(void) +{ + bpf_register_map_type(&cgroup_array_type); + return 0; +} +late_initcall(register_cgroup_array_map); +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 22863d9872b1..96d938a22050 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -393,7 +393,8 @@ static int map_update_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eec9f90ba030..69ba2251a22b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1035,6 +1035,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (func_id != BPF_FUNC_get_stackid) goto error; break; + case BPF_MAP_TYPE_CGROUP_ARRAY: + goto error; default: break; } -- cgit v1.3-8-gc7d7 From 4a482f34afcc162d8456f449b137ec2a95be60d8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:44 -0700 Subject: cgroup: bpf: Add bpf_skb_in_cgroup_proto Adds a bpf helper, bpf_skb_in_cgroup, to decide if a skb->sk belongs to a descendant of a cgroup2. It is similar to the feature added in netfilter: commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match") The user is expected to populate a BPF_MAP_TYPE_CGROUP_ARRAY which will be used by the bpf_skb_in_cgroup. Modifications to the bpf verifier is to ensure BPF_MAP_TYPE_CGROUP_ARRAY and bpf_skb_in_cgroup() are always used together. Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/verifier.c | 8 +++++++- net/core/filter.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 26c04be32003..f44504d875e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -337,6 +337,17 @@ enum bpf_func_id { */ BPF_FUNC_skb_change_type, + /** + * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb + * @skb: pointer to skb + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 skb failed the cgroup2 descendant test + * == 1 skb succeeded the cgroup2 descendant test + * < 0 error + */ + BPF_FUNC_skb_in_cgroup, __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 69ba2251a22b..e206c2181412 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1036,7 +1036,9 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - goto error; + if (func_id != BPF_FUNC_skb_in_cgroup) + goto error; + break; default: break; } @@ -1056,6 +1058,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_skb_in_cgroup: + if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index 76fee35da244..54071cf70fb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2239,6 +2239,40 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } +#ifdef CONFIG_SOCK_CGROUP_DATA +static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long)r1; + struct bpf_map *map = (struct bpf_map *)(long)r2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + struct sock *sk; + u32 i = (u32)r3; + + sk = skb->sk; + if (!sk || !sk_fullsock(sk)) + return -ENOENT; + + if (unlikely(i >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[i]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); +} + +static const struct bpf_func_proto bpf_skb_in_cgroup_proto = { + .func = bpf_skb_in_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; +#endif + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -2307,6 +2341,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return bpf_get_event_output_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_in_cgroup: + return &bpf_skb_in_cgroup_proto; +#endif default: return sk_filter_func_proto(func_id); } -- cgit v1.3-8-gc7d7 From 7b776af66dc462caa7e839cc5c950a61db1f8551 Mon Sep 17 00:00:00 2001 From: Roger Lu Date: Fri, 1 Jul 2016 11:05:02 +0800 Subject: PM / suspend: show workqueue state in suspend flow If freezable workqueue aborts suspend flow, show workqueue state for debug purpose. Signed-off-by: Roger Lu Acked-by: Tejun Heo Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 3 +++ kernel/workqueue.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..6eef250a5705 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); + if (wq_busy) + show_workqueue_state(); + if (!wakeup) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1c0e996b5ae..619e80ce4a59 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq) /** * show_workqueue_state - dump workqueue state * - * Called from a sysrq handler and prints out all busy workqueues and - * pools. + * Called from a sysrq handler or try_to_freeze_tasks() and prints out + * all busy workqueues and pools. */ void show_workqueue_state(void) { -- cgit v1.3-8-gc7d7 From 9c744481c003697de453e8fc039468143ba604aa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Jun 2016 03:00:51 +0200 Subject: PM / hibernate: Do not free preallocated safe pages during image restore The core image restoration code preallocates some safe pages (ie. pages that weren't used by the image kernel before hibernation) for future use before allocating the bulk of memory for loading the image data. Those safe pages are then freed so they can be allocated again (with the memory management subsystem's help). That's done to ensure that there will be enough safe pages for temporary data structures needed during image restoration. However, it is not really necessary to free those pages after they have been allocated. They can be added to the (global) list of safe pages right away and then picked up from there when needed without freeing. That reduces the overhead related to using safe pages, especially in the arch-specific code, so modify the code accordingly. Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 66 ++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..d9476ff877b8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -74,6 +74,22 @@ void __init hibernate_image_size_init(void) */ struct pbe *restore_pblist; +/* struct linked_page is used to build chains of pages */ + +#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) + +struct linked_page { + struct linked_page *next; + char data[LINKED_PAGE_DATA_SIZE]; +} __packed; + +/* + * List of "safe" pages (ie. pages that were not used by the image kernel + * before hibernation) that may be used as temporary storage for image kernel + * memory contents. + */ +static struct linked_page *safe_pages_list; + /* Pointer to an auxiliary buffer (1 page) */ static void *buffer; @@ -113,9 +129,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) return res; } +static void *__get_safe_page(gfp_t gfp_mask) +{ + if (safe_pages_list) { + void *ret = safe_pages_list; + + safe_pages_list = safe_pages_list->next; + memset(ret, 0, PAGE_SIZE); + return ret; + } + return get_image_page(gfp_mask, PG_SAFE); +} + unsigned long get_safe_page(gfp_t gfp_mask) { - return (unsigned long)get_image_page(gfp_mask, PG_SAFE); + return (unsigned long)__get_safe_page(gfp_mask); } static struct page *alloc_image_page(gfp_t gfp_mask) @@ -150,15 +178,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free) __free_page(page); } -/* struct linked_page is used to build chains of pages */ - -#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) - -struct linked_page { - struct linked_page *next; - char data[LINKED_PAGE_DATA_SIZE]; -} __packed; - static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) { @@ -208,7 +227,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { struct linked_page *lp; - lp = get_image_page(ca->gfp_mask, ca->safe_needed); + lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) : + get_image_page(ca->gfp_mask, PG_ANY); if (!lp) return NULL; @@ -2104,11 +2124,6 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) return 0; } -/* List of "safe" pages that may be used to store data loaded from the suspend - * image - */ -static struct linked_page *safe_pages_list; - #ifdef CONFIG_HIGHMEM /* struct highmem_pbe is used for creating the list of highmem pages that * should be restored atomically during the resume from disk, because the page @@ -2334,7 +2349,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) { unsigned int nr_pages, nr_highmem; - struct linked_page *sp_list, *lp; + struct linked_page *lp; int error; /* If there is no highmem, the buffer will not be necessary */ @@ -2362,9 +2377,9 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) * NOTE: This way we make sure there will be enough safe pages for the * chain_alloc() in get_buffer(). It is a bit wasteful, but * nr_copy_pages cannot be greater than 50% of the memory anyway. + * + * nr_copy_pages cannot be less than allocated_unsafe_pages too. */ - sp_list = NULL; - /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { @@ -2373,12 +2388,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) error = -ENOMEM; goto Free; } - lp->next = sp_list; - sp_list = lp; + lp->next = safe_pages_list; + safe_pages_list = lp; nr_pages--; } /* Preallocate memory for the image */ - safe_pages_list = NULL; nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); @@ -2396,12 +2410,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) swsusp_set_page_free(virt_to_page(lp)); nr_pages--; } - /* Free the reserved safe pages so that chain_alloc() can use them */ - while (sp_list) { - lp = sp_list->next; - free_image_page(sp_list, PG_UNSAFE_CLEAR); - sp_list = lp; - } return 0; Free: @@ -2491,6 +2499,8 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + safe_pages_list = NULL; + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); if (error) return error; -- cgit v1.3-8-gc7d7 From 6dbecfd345a617888da370b13d5b190c9ff3df53 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Jun 2016 03:02:16 +0200 Subject: PM / hibernate: Simplify mark_unsafe_pages() Rework mark_unsafe_pages() to use a simpler method of clearing all bits in free_pages_map and to set the bits for the "unsafe" pages (ie. pages that were used by the image kernel before hibernation) with the help of duplicate_memory_bitmap(). For this purpose, move the pfn_valid() check from mark_unsafe_pages() to unpack_orig_pfns() where the "unsafe" pages are discovered. Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 64 +++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d9476ff877b8..39bbad5fac5a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2019,53 +2019,41 @@ int snapshot_read_next(struct snapshot_handle *handle) return PAGE_SIZE; } +static void duplicate_memory_bitmap(struct memory_bitmap *dst, + struct memory_bitmap *src) +{ + unsigned long pfn; + + memory_bm_position_reset(src); + pfn = memory_bm_next_pfn(src); + while (pfn != BM_END_OF_MAP) { + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); + } +} + /** * mark_unsafe_pages - mark the pages that cannot be used for storing * the image during resume, because they conflict with the pages that * had been used before suspend */ -static int mark_unsafe_pages(struct memory_bitmap *bm) +static void mark_unsafe_pages(struct memory_bitmap *bm) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long pfn; - /* Clear page flags */ - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) - swsusp_unset_page_free(pfn_to_page(pfn)); + /* Clear the "free"/"unsafe" bit for all PFNs */ + memory_bm_position_reset(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); + while (pfn != BM_END_OF_MAP) { + memory_bm_clear_current(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); } - /* Mark pages that correspond to the "original" pfns as "unsafe" */ - memory_bm_position_reset(bm); - do { - pfn = memory_bm_next_pfn(bm); - if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) - swsusp_set_page_free(pfn_to_page(pfn)); - else - return -EFAULT; - } - } while (pfn != BM_END_OF_MAP); + /* Mark pages that correspond to the "original" PFNs as "unsafe" */ + duplicate_memory_bitmap(free_pages_map, bm); allocated_unsafe_pages = 0; - - return 0; -} - -static void -duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) -{ - unsigned long pfn; - - memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src); - while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, pfn); - pfn = memory_bm_next_pfn(src); - } } static int check_header(struct swsusp_info *info) @@ -2115,7 +2103,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) /* Extract and buffer page key for data page (s390 only). */ page_key_memorize(buf + j); - if (memory_bm_pfn_present(bm, buf[j])) + if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) memory_bm_set_bit(bm, buf[j]); else return -EFAULT; @@ -2357,9 +2345,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) buffer = NULL; nr_highmem = count_highmem_image_pages(bm); - error = mark_unsafe_pages(bm); - if (error) - goto Free; + mark_unsafe_pages(bm); error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); if (error) -- cgit v1.3-8-gc7d7 From 307c5971c972ef2bfd541d2850b36a692c6354c9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 29 Jun 2016 03:05:10 +0200 Subject: PM / hibernate: Recycle safe pages after image restoration One of the memory bitmaps used by the hibernation image restoration code is freed after the image has been loaded. That is not quite efficient, though, because the memory pages used for building that bitmap are known to be safe (ie. they were not used by the image kernel before hibernation) and the arch-specific code finalizing the image restoration may need them. In that case it needs to allocate those pages again via the memory management subsystem, check if they are really safe again by consulting the other bitmaps and so on. To avoid that, recycle those pages by putting them into the global list of known safe pages so that they can be given to the arch code right away when necessary. Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 39bbad5fac5a..94b6fe6c9ae3 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -158,6 +158,14 @@ static struct page *alloc_image_page(gfp_t gfp_mask) return page; } +static void recycle_safe_page(void *page_address) +{ + struct linked_page *lp = page_address; + + lp->next = safe_pages_list; + safe_pages_list = lp; +} + /** * free_image_page - free page represented by @addr, allocated with * get_image_page (page flags set by it must be cleared) @@ -852,6 +860,34 @@ struct nosave_region { static LIST_HEAD(nosave_regions); +static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + recycle_safe_page(node->data); + + list_for_each_entry(node, &zone->leaves, list) + recycle_safe_page(node->data); +} + +static void memory_bm_recycle(struct memory_bitmap *bm) +{ + struct mem_zone_bm_rtree *zone; + struct linked_page *p_list; + + list_for_each_entry(zone, &bm->zones, list) + recycle_zone_bm_rtree(zone); + + p_list = bm->p_list; + while (p_list) { + struct linked_page *lp = p_list; + + p_list = lp->next; + recycle_safe_page(lp); + } +} + /** * register_nosave_region - register a range of page frames the contents * of which should not be saved during the suspend (to be used in the early @@ -2542,9 +2578,9 @@ void snapshot_write_finalize(struct snapshot_handle *handle) /* Restore page key for data page (s390 only). */ page_key_write(handle->buffer); page_key_free(); - /* Free only if we have loaded the image entirely */ + /* Do that only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { - memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); + memory_bm_recycle(&orig_bm); free_highmem_data(); } } -- cgit v1.3-8-gc7d7 From b6140914fd079e43ea75a53429b47128584f033a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:22 +0900 Subject: genirq/msi: Remove unused MSI_FLAG_IDENTITY_MAP No user and we definitely don't want to grow one. Signed-off-by: Thomas Gleixner Reviewed-by: Bart Van Assche Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-2-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/msi.h | 6 ++---- kernel/irq/msi.c | 8 ++------ 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 8b425c66305a..c33abfa0f5a7 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -264,12 +264,10 @@ enum { * callbacks. */ MSI_FLAG_USE_DEF_CHIP_OPS = (1 << 1), - /* Build identity map between hwirq and irq */ - MSI_FLAG_IDENTITY_MAP = (1 << 2), /* Support multiple PCI MSI interrupts */ - MSI_FLAG_MULTI_PCI_MSI = (1 << 3), + MSI_FLAG_MULTI_PCI_MSI = (1 << 2), /* Support PCI MSIX interrupts */ - MSI_FLAG_PCI_MSIX = (1 << 4), + MSI_FLAG_PCI_MSIX = (1 << 3), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 38e89ce7b071..eb5bf2b50b07 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq = -1; + int i, ret, virq; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -332,12 +332,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, for_each_msi_entry(desc, dev) { ops->set_desc(&arg, desc); - if (info->flags & MSI_FLAG_IDENTITY_MAP) - virq = (int)ops->get_hwirq(info, &arg); - else - virq = -1; - virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, + virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false); if (virq < 0) { ret = -ENOSPC; -- cgit v1.3-8-gc7d7 From 9c2555835bb3d34dfac52a0be943dcc4bedd650f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:23 +0900 Subject: genirq: Introduce IRQD_AFFINITY_MANAGED flag Interupts marked with this flag are excluded from user space interrupt affinity changes. Contrary to the IRQ_NO_BALANCING flag, the kernel internal affinity mechanism is not blocked. This flag will be used for multi-queue device interrupts. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-3-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 7 +++++++ kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 21 ++++++++++++++++++--- kernel/irq/proc.c | 2 +- 4 files changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 4d758a7c604a..f6074813688d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -197,6 +197,7 @@ struct irq_data { * IRQD_IRQ_INPROGRESS - In progress state of the interrupt * IRQD_WAKEUP_ARMED - Wakeup mode armed * IRQD_FORWARDED_TO_VCPU - The interrupt is forwarded to a VCPU + * IRQD_AFFINITY_MANAGED - Affinity is auto-managed by the kernel */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -212,6 +213,7 @@ enum { IRQD_IRQ_INPROGRESS = (1 << 18), IRQD_WAKEUP_ARMED = (1 << 19), IRQD_FORWARDED_TO_VCPU = (1 << 20), + IRQD_AFFINITY_MANAGED = (1 << 21), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -305,6 +307,11 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } +static inline bool irqd_affinity_is_managed(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..b15aa3b617a2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -105,6 +105,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern bool irq_can_set_affinity_usr(unsigned int irq); + extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef0bc02c3a70..30658e9827f0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; -static int __irq_can_set_affinity(struct irq_desc *desc) +static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - return 1; + return false; + return true; } /** @@ -133,6 +133,21 @@ int irq_can_set_affinity(unsigned int irq) return __irq_can_set_affinity(irq_to_desc(irq)); } +/** + * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space + * @irq: Interrupt to check + * + * Like irq_can_set_affinity() above, but additionally checks for the + * AFFINITY_MANAGED flag. + */ +bool irq_can_set_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __irq_can_set_affinity(desc) && + !irqd_affinity_is_managed(&desc->irq_data); +} + /** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affitnity changed diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..40bdcdc1f700 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, cpumask_var_t new_value; int err; - if (!irq_can_set_affinity(irq) || no_irq_affinity) + if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) -- cgit v1.3-8-gc7d7 From 06ee6d571f0e350253a8fc3492316b2be007fae2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:24 +0900 Subject: genirq: Add affinity hint to irq allocation Add an extra argument to the irq(domain) allocation functions, so we can hand down affinity hints to the allocator. Thats necessary to implement proper support for multiqueue devices. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-4-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- arch/sparc/kernel/irq_64.c | 2 +- arch/x86/kernel/apic/io_apic.c | 5 +++-- include/linux/irq.h | 4 ++-- include/linux/irqdomain.h | 9 ++++++--- kernel/irq/ipi.c | 2 +- kernel/irq/irqdesc.c | 12 ++++++++---- kernel/irq/irqdomain.c | 22 ++++++++++++++-------- kernel/irq/manage.c | 7 ++++--- kernel/irq/msi.c | 3 ++- 9 files changed, 41 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index e22416ce56ea..34a7930b76ef 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -242,7 +242,7 @@ unsigned int irq_alloc(unsigned int dev_handle, unsigned int dev_ino) { int irq; - irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL); + irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL, NULL); if (irq <= 0) goto out; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 446702ed99dc..7c4f90dd4c2a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -981,7 +981,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), - info, legacy); + info, legacy, NULL); } /* @@ -1014,7 +1014,8 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return -ENOMEM; } else { - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, + NULL); if (irq >= 0) { irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; diff --git a/include/linux/irq.h b/include/linux/irq.h index f6074813688d..39ce46ac5c18 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -708,11 +708,11 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner); + struct module *owner, const struct cpumask *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ - __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE) + __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) #define irq_alloc_desc(node) \ irq_alloc_descs(-1, 0, 1, node) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f1f36e04d885..1aee0fbe900e 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -39,6 +39,7 @@ struct irq_domain; struct of_device_id; struct irq_chip; struct irq_data; +struct cpumask; /* Number of irqs reserved for a legacy isa controller */ #define NUM_ISA_INTERRUPTS 16 @@ -217,7 +218,8 @@ extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); extern void irq_set_default_host(struct irq_domain *host); extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); + irq_hw_number_t hwirq, int node, + const struct cpumask *affinity); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -389,7 +391,7 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc); + bool realloc, const struct cpumask *affinity); extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); extern void irq_domain_activate_irq(struct irq_data *irq_data); extern void irq_domain_deactivate_irq(struct irq_data *irq_data); @@ -397,7 +399,8 @@ extern void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) { - return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false); + return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false, + NULL); } extern int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 89b49f6773f0..4fd23510d5f2 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -76,7 +76,7 @@ int irq_reserve_ipi(struct irq_domain *domain, } } - virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc descs\n"); return -ENOMEM; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8731e1c5d1e7..b8df4fcdbb5f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -223,7 +223,7 @@ static void free_desc(unsigned int irq) } static int alloc_descs(unsigned int start, unsigned int cnt, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { struct irq_desc *desc; int i; @@ -333,6 +333,7 @@ static void free_desc(unsigned int irq) } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, + const struct cpumask *affinity, struct module *owner) { u32 i; @@ -453,12 +454,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) + * @affinity: Optional pointer to an affinity mask which hints where the + * irq descriptors should be allocated and which default + * affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner) + struct module *owner, const struct cpumask *affinity) { int start, ret; @@ -494,7 +498,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, owner); + return alloc_descs(start, cnt, node, affinity, owner); err: mutex_unlock(&sparse_irq_lock); @@ -512,7 +516,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); */ unsigned int irq_alloc_hwirqs(int cnt, int node) { - int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); if (irq < 0) return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..79459b732dc9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -835,19 +835,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, - int node) + int node, const struct cpumask *affinity) { unsigned int hint; if (virq >= 0) { - virq = irq_alloc_descs(virq, virq, cnt, node); + virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, + affinity); } else { hint = hwirq % nr_irqs; if (hint == 0) hint++; - virq = irq_alloc_descs_from(hint, cnt, node); - if (virq <= 0 && hint > 1) - virq = irq_alloc_descs_from(1, cnt, node); + virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, + affinity); + if (virq <= 0 && hint > 1) { + virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, + affinity); + } } return virq; @@ -1160,6 +1164,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, * @node: NUMA node id for memory allocation * @arg: domain specific argument * @realloc: IRQ descriptors have already been allocated if true + * @affinity: Optional irq affinity mask for multiqueue devices * * Allocate IRQ numbers and initialized all data structures to support * hierarchy IRQ domains. @@ -1175,7 +1180,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc) + bool realloc, const struct cpumask *affinity) { int i, ret, virq; @@ -1193,7 +1198,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, if (realloc && irq_base >= 0) { virq = irq_base; } else { - virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, + affinity); if (virq < 0) { pr_debug("cannot allocate IRQ(base %d, count %d)\n", irq_base, nr_irqs); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 30658e9827f0..ad0aac6d1248 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -353,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) return 0; /* - * Preserve an userspace affinity setup, but make sure that - * one of the targets is online. + * Preserve the managed affinity setting and an userspace affinity + * setup, but make sure that one of the targets is online. */ - if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (irqd_affinity_is_managed(&desc->irq_data) || + irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index eb5bf2b50b07..58dbbacc6fbb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -334,7 +334,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, - dev_to_node(dev), &arg, false); + dev_to_node(dev), &arg, false, + NULL); if (virq < 0) { ret = -ENOSPC; if (ops->handle_error) -- cgit v1.3-8-gc7d7 From 45ddcecbfa947f1dd8e8019bad9e90d6c9f2665c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:25 +0900 Subject: genirq: Use affinity hint in irqdesc allocation Use the affinity hint in the irqdesc allocator. The hint is used to determine the node for the allocation and to set the affinity of the interrupt. If multiple interrupts are allocated (multi-MSI) then the allocator iterates over the cpumask and for each set cpu it allocates on their node and sets the initial affinity to that cpu. If a single interrupt is allocated (MSI-X) then the allocator uses the first cpu in the mask to compute the allocation node and uses the mask for the initial affinity setting. Interrupts set up this way are marked with the AFFINITY_MANAGED flag to prevent userspace from messing with their affinity settings. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-5-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b8df4fcdbb5f..a623b44f2d4b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) return 0; } -static void desc_smp_init(struct irq_desc *desc, int node) +static void desc_smp_init(struct irq_desc *desc, int node, + const struct cpumask *affinity) { - cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); + if (!affinity) + affinity = irq_default_affinity; + cpumask_copy(desc->irq_common_data.affinity, affinity); + #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif @@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node) #else static inline int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } -static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline void +desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { int cpu; @@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; - desc_smp_init(desc, node); + desc_smp_init(desc, node, affinity); } int nr_irqs = NR_IRQS; @@ -158,7 +163,9 @@ void irq_unlock_sparse(void) mutex_unlock(&sparse_irq_lock); } -static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) +static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, + const struct cpumask *affinity, + struct module *owner) { struct irq_desc *desc; gfp_t gfp = GFP_KERNEL; @@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_rcu_head(&desc->rcu); - desc_set_defaults(irq, desc, node, owner); + desc_set_defaults(irq, desc, node, affinity, owner); + irqd_set(&desc->irq_data, flags); return desc; @@ -225,11 +233,30 @@ static void free_desc(unsigned int irq) static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct cpumask *affinity, struct module *owner) { + const struct cpumask *mask = NULL; struct irq_desc *desc; - int i; + unsigned int flags; + int i, cpu = -1; + + if (affinity && cpumask_empty(affinity)) + return -EINVAL; + + flags = affinity ? IRQD_AFFINITY_MANAGED : 0; for (i = 0; i < cnt; i++) { - desc = alloc_desc(start + i, node, owner); + if (affinity) { + cpu = cpumask_next(cpu, affinity); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(affinity); + node = cpu_to_node(cpu); + + /* + * For single allocations we use the caller provided + * mask otherwise we use the mask of the target cpu + */ + mask = cnt == 1 ? affinity : cpumask_of(cpu); + } + desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; mutex_lock(&sparse_irq_lock); @@ -277,7 +304,7 @@ int __init early_irq_init(void) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { - desc = alloc_desc(i, node, NULL); + desc = alloc_desc(i, node, 0, NULL, NULL); set_bit(i, allocated_irqs); irq_insert_desc(i, desc); } @@ -311,7 +338,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], GFP_KERNEL, node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - desc_set_defaults(i, &desc[i], node, NULL); + desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); } @@ -328,7 +355,7 @@ static void free_desc(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v1.3-8-gc7d7 From 0972fa57f53525ffa6ced12d703750fc2791e3ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 17:39:26 +0900 Subject: genirq/msi: Make use of affinity aware allocations Allow the MSI code to provide affinity hints per MSI descriptor. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-6-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/msi.h | 2 ++ kernel/irq/msi.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index c33abfa0f5a7..4f0bfe5912b2 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -47,6 +47,7 @@ struct fsl_mc_msi_desc { * @nvec_used: The number of vectors used * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse + * @affinity: Optional pointer to a cpu affinity mask for this descriptor * * @masked: [PCI MSI/X] Mask bits * @is_msix: [PCI MSI/X] True if MSI-X @@ -67,6 +68,7 @@ struct msi_desc { unsigned int nvec_used; struct device *dev; struct msi_msg msg; + const struct cpumask *affinity; union { /* PCI MSI/X specific data */ diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 58dbbacc6fbb..0e2a736b14a7 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -335,7 +335,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false, - NULL); + desc->affinity); if (virq < 0) { ret = -ENOSPC; if (ops->handle_error) -- cgit v1.3-8-gc7d7 From 5e385a6ef31fbbf2acbda770aecc2bc2ff933d17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Jul 2016 17:39:27 +0900 Subject: genirq: Add a helper to spread an affinity mask for MSI/MSI-X vectors This is lifted from the blk-mq code and adopted to use the affinity mask concept just introduced in the irq handling code. It tries to keep the algorithm the same as the one current used by blk-mq, but improvements like assining vectors on a per-node basis instead of just per sibling are possible with this simple move and refactoring. Signed-off-by: Christoph Hellwig Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: axboe@fb.com Cc: agordeev@redhat.com Link: http://lkml.kernel.org/r/1467621574-8277-7-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 8 +++++++ kernel/irq/Makefile | 1 + kernel/irq/affinity.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 kernel/irq/affinity.c (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9fcabeb07787..b6683f0ffc9f 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -278,6 +278,8 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); +struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs); + #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) @@ -308,6 +310,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { return 0; } + +static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +{ + *nr_vecs = 1; + return NULL; +} #endif /* CONFIG_SMP */ /* diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2ee42e95a3ce..1d3ee3169202 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o +obj-$(CONFIG_SMP) += affinity.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c new file mode 100644 index 000000000000..f68959341c0f --- /dev/null +++ b/kernel/irq/affinity.c @@ -0,0 +1,61 @@ + +#include +#include +#include +#include + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_sibling_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + return cpu; +} + +/* + * Take a map of online CPUs and the number of available interrupt vectors + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors + * so that they are distributed as good as possible around the CPUs. If + * more vectors than CPUs are available we'll map one to each CPU, + * otherwise we map one to the first sibling of each socket. + * + * If there are more vectors than CPUs we will still only have one bit + * set per CPU, but interrupt code will keep on assigning the vectors from + * the start of the bitmap until we run out of vectors. + */ +struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +{ + struct cpumask *affinity_mask; + unsigned int max_vecs = *nr_vecs; + + if (max_vecs == 1) + return NULL; + + affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!affinity_mask) { + *nr_vecs = 1; + return NULL; + } + + if (max_vecs >= num_online_cpus()) { + cpumask_copy(affinity_mask, cpu_online_mask); + *nr_vecs = num_online_cpus(); + } else { + unsigned int vecs = 0, cpu; + + for_each_online_cpu(cpu) { + if (cpu == get_first_sibling(cpu)) { + cpumask_set_cpu(cpu, affinity_mask); + vecs++; + } + + if (--max_vecs == 0) + break; + } + *nr_vecs = vecs; + } + + return affinity_mask; +} -- cgit v1.3-8-gc7d7 From 4364e1a29be16b2783c0bcbc263f61236af64281 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 15:32:25 +0200 Subject: genirq/msi: Fix broken debug output virq is not required to be the same for all msi descs. Use the base irq number from the desc in the debug printk. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 0e2a736b14a7..54999350162c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -353,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->msi_finish(&arg, 0); for_each_msi_entry(desc, dev) { + virq = desc->irq; if (desc->nvec_used == 1) dev_dbg(dev, "irq %d for MSI\n", virq); else -- cgit v1.3-8-gc7d7 From 501c2375253c0795048f48368e0b3e8b2f6646dc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 Jul 2016 10:04:34 -0400 Subject: ftrace: Move toplevel init out of ftrace_init_tracefs() Commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do") placed ftrace_init_tracefs into the instance creation, and encapsulated the top level updating with an if conditional, as the top level only gets updated at boot up. Unfortunately, this triggers section mismatch errors as the init functions are called from a function that can be called later, and the section mismatch logic is unaware of the if conditional that would prevent it from happening at run time. To make everyone happy, create a separate ftrace_init_tracefs_toplevel() routine that only gets called by init functions, and this will be what calls other init functions for the toplevel directory. Link: http://lkml.kernel.org/r/20160704102139.19cbc0d9@gandalf.local.home Reported-by: kbuild test robot Reported-by: Arnd Bergmann Fixes: 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do") Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 ++++++++++------ kernel/trace/trace.c | 1 + kernel/trace/trace.h | 3 +++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b488f4dd8e8..84752c8e28b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5539,16 +5539,20 @@ static const struct file_operations ftrace_pid_fops = { void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - /* Only the top level directory has the dyn_tracefs and profile */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { - ftrace_init_dyn_tracefs(d_tracer); - ftrace_profile_tracefs(d_tracer); - } - trace_create_file("set_ftrace_pid", 0644, d_tracer, tr, &ftrace_pid_fops); } +void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer) +{ + /* Only the top level directory has the dyn_tracefs and profile */ + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + + ftrace_init_dyn_tracefs(d_tracer); + ftrace_profile_tracefs(d_tracer); +} + /** * ftrace_kill - kill ftrace * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3d9f31b576f3..5fd53a7847bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7369,6 +7369,7 @@ static __init int tracer_init_tracefs(void) return 0; init_tracer_tracefs(&global_trace, d_tracer); + ftrace_init_tracefs_toplevel(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, &global_trace, &tracing_thresh_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index eaee458755a4..c1de3f493cd3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -857,6 +857,8 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer); #else static inline int ftrace_trace_task(struct trace_array *tr) { @@ -874,6 +876,7 @@ static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ -- cgit v1.3-8-gc7d7 From 5130213721d01b6632c255d4295a8102cbb58379 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 5 Jul 2016 16:57:51 +0800 Subject: tick/broadcast-hrtimer: Set name of the ce_broadcast_hrtimer This is to avoid the "null" name when we either ~ # cat /sys/devices/system/clockevents/broadcast/current_device (null) or ~ # cat /proc/timer_list ... Tick Device: mode: 1 Broadcast device Clock Event Device: (null) ... Signed-off-by: Jisheng Zhang Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1467709071-3667-1-git-send-email-jszhang@marvell.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast-hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 53d7184da0be..690b797f522e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) } static struct clock_event_device ce_broadcast_hrtimer = { + .name = "bc_hrtimer", .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | -- cgit v1.3-8-gc7d7 From 67f20b084574def586ecba68508acd5d054ccc88 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 4 Jul 2016 15:10:04 +0000 Subject: tracing: Using for_each_set_bit() to simplify trace_pid_write() Using for_each_set_bit() to simplify the code. Link: http://lkml.kernel.org/r/1467645004-11169-1-git-send-email-weiyj_lk@163.com Signed-off-by: Wei Yongjun Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5fd53a7847bc..dade4c9559cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -517,13 +517,9 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, if (filtered_pids) { /* copy the current bits to the new max */ - pid = find_first_bit(filtered_pids->pids, - filtered_pids->pid_max); - while (pid < filtered_pids->pid_max) { + for_each_set_bit(pid, filtered_pids->pids, + filtered_pids->pid_max) { set_bit(pid, pid_list->pids); - pid = find_next_bit(filtered_pids->pids, - filtered_pids->pid_max, - pid + 1); nr_pids++; } } -- cgit v1.3-8-gc7d7 From 7ad8fb61c4abf589596f0a4da34d987471481569 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 3 Jul 2016 08:51:34 -0500 Subject: tracing: Have HIST_TRIGGERS select TRACING The kbuild test robot reported a compile error if HIST_TRIGGERS was enabled but nothing else that selected TRACING was configured in. HIST_TRIGGERS should directly select it and not rely on anything else to do it. Link: http://lkml.kernel.org/r/57791866.8080505@linux.intel.com Reported-by: kbuild test robot Fixes: 7ef224d1d0e3a ("tracing: Add 'hist' event trigger command") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fafeaf803bd0..f4b86e8ca1e7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -542,6 +542,7 @@ config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP + select TRACING default n help Hist triggers allow one or more arbitrary trace event fields -- cgit v1.3-8-gc7d7 From a4a551b8f1d4c4ebffd0f49dfef44df3128546f8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 29 Jun 2016 19:56:48 +0900 Subject: ftrace: Reduce size of function graph entries Currently ftrace_graph_ent{,_entry} and ftrace_graph_ret{,_entry} struct can have padding bytes at the end due to alignment in 64-bit data type. As these data are recorded so frequently, those paddings waste non-negligible space. As the ring buffer maintains alignment properly for each architecture, just to remove the extra padding using 'packed' attribute. ftrace_graph_ent_entry: 24 -> 20 ftrace_graph_ret_entry: 48 -> 44 Also I moved the 'overrun' field in struct ftrace_graph_ret to minimize the padding in the middle. Tested on x86_64 only. Link: http://lkml.kernel.org/r/1467197808-13578-1-git-send-email-namhyung@kernel.org Cc: Ingo Molnar Cc: linux-arch@vger.kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 12 ++++++++---- kernel/trace/trace.h | 11 +++++++++++ kernel/trace/trace_entries.h | 4 ++-- 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 66a36a815f0a..7d565afe35d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -754,23 +754,27 @@ static inline void ftrace_init(void) { } /* * Structure that defines an entry function trace. + * It's already packed but the attribute "packed" is needed + * to remove extra padding at the end. */ struct ftrace_graph_ent { unsigned long func; /* Current function */ int depth; -}; +} __packed; /* * Structure that defines a return function trace. + * It's already packed but the attribute "packed" is needed + * to remove extra padding at the end. */ struct ftrace_graph_ret { unsigned long func; /* Current function */ - unsigned long long calltime; - unsigned long long rettime; /* Number of functions that overran the depth limit for current task */ unsigned long overrun; + unsigned long long calltime; + unsigned long long rettime; int depth; -}; +} __packed; /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c1de3f493cd3..f783df416726 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -80,6 +80,12 @@ enum trace_type { FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ + filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) __packed + #include "trace_entries.h" /* @@ -1625,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee7b94a4810a..5c30efcda5e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, @@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ -FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, -- cgit v1.3-8-gc7d7 From ecb23dc6f2eff0ce64dd60351a81f376f13b12cc Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 20 May 2016 09:26:48 +0200 Subject: xen: add steal_clock support on x86 The pv_time_ops structure contains a function pointer for the "steal_clock" functionality used only by KVM and Xen on ARM. Xen on x86 uses its own mechanism to account for the "stolen" time a thread wasn't able to run due to hypervisor scheduling. Add support in Xen arch independent time handling for this feature by moving it out of the arm arch into drivers/xen and remove the x86 Xen hack. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Reviewed-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/xen/enlighten.c | 18 ++---------------- arch/x86/xen/time.c | 44 ++------------------------------------------ drivers/xen/time.c | 20 ++++++++++++++++++++ include/linux/kernel_stat.h | 1 - include/xen/xen-ops.h | 1 + kernel/sched/cputime.c | 10 ---------- 6 files changed, 25 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index d4f36eba3499..47acb3613f40 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -86,19 +85,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); -static unsigned long long xen_stolen_accounting(int cpu) -{ - struct vcpu_runstate_info state; - - BUG_ON(cpu != smp_processor_id()); - - xen_get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; -} - static void xen_read_wallclock(struct timespec64 *ts) { u32 version; @@ -432,8 +418,8 @@ static int __init xen_guest_init(void) register_cpu_notifier(&xen_cpu_notifier); - pv_time_ops.steal_clock = xen_stolen_accounting; - static_key_slow_inc(¶virt_steal_enabled); + xen_time_setup_guest(); + if (xen_initial_domain()) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 6deba5bc7e34..c31006f0f37f 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -11,8 +11,6 @@ #include #include #include -#include -#include #include #include #include @@ -31,44 +29,6 @@ /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 -#define NS_PER_TICK (1000000000LL / HZ) - -/* snapshots of runstate info */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); - -/* unused ns of stolen time */ -static DEFINE_PER_CPU(u64, xen_residual_stolen); - -static void do_stolen_accounting(void) -{ - struct vcpu_runstate_info state; - struct vcpu_runstate_info *snap; - s64 runnable, offline, stolen; - cputime_t ticks; - - xen_get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - snap = this_cpu_ptr(&xen_runstate_snapshot); - - /* work out how much time the VCPU has not been runn*ing* */ - runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; - offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; - - *snap = state; - - /* Add the appropriate number of ticks of stolen time, - including any left-overs from last time. */ - stolen = runnable + offline + __this_cpu_read(xen_residual_stolen); - - if (stolen < 0) - stolen = 0; - - ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); - __this_cpu_write(xen_residual_stolen, stolen); - account_steal_ticks(ticks); -} /* Get the TSC speed from Xen */ static unsigned long xen_tsc_khz(void) @@ -335,8 +295,6 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) ret = IRQ_HANDLED; } - do_stolen_accounting(); - return ret; } @@ -431,6 +389,8 @@ static void __init xen_time_init(void) xen_setup_timer(cpu); xen_setup_cpu_clockevents(); + xen_time_setup_guest(); + if (xen_initial_domain()) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } diff --git a/drivers/xen/time.c b/drivers/xen/time.c index 71078425c9ea..2257b6663766 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -75,6 +76,15 @@ bool xen_vcpu_stolen(int vcpu) return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } +static u64 xen_steal_clock(int cpu) +{ + struct vcpu_runstate_info state; + + BUG_ON(cpu != smp_processor_id()); + xen_get_runstate_snapshot(&state); + return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline]; +} + void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; @@ -86,3 +96,13 @@ void xen_setup_runstate_info(int cpu) BUG(); } +void __init xen_time_setup_guest(void) +{ + pv_time_ops.steal_clock = xen_steal_clock; + + static_key_slow_inc(¶virt_steal_enabled); + /* + * We can't set paravirt_steal_rq_enabled as this would require the + * capability to read another cpu's runstate info. + */ +} diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 25a822f6f000..44fda64ad434 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -92,7 +92,6 @@ static inline void account_process_tick(struct task_struct *tsk, int user) extern void account_process_tick(struct task_struct *, int user); #endif -extern void account_steal_ticks(unsigned long ticks); extern void account_idle_ticks(unsigned long ticks); #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 3491582bf50a..355275bad2cf 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -21,6 +21,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb); bool xen_vcpu_stolen(int vcpu); void xen_setup_runstate_info(int cpu); +void xen_time_setup_guest(void); void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); int xen_setup_shutdown_event(void); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..8c4c6dcc052c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -489,16 +489,6 @@ void account_process_tick(struct task_struct *p, int user_tick) account_idle_time(cputime_one_jiffy); } -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - /* * Account multiple ticks of idle time. * @ticks: number of stolen ticks -- cgit v1.3-8-gc7d7 From 885885f6b88d22f81e67ee6a61561e480b27d27a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 17 Jun 2016 17:19:40 +0000 Subject: locking/static_keys: Fix non static symbol Sparse warning Fix the following sparse warnings: kernel/jump_label.c:473:23: warning: symbol 'jump_label_module_nb' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1466183980-8903-1-git-send-email-weiyj_lk@163.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..ac4ab953b49c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -422,7 +422,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, return notifier_from_errno(ret); } -struct notifier_block jump_label_module_nb = { +static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; -- cgit v1.3-8-gc7d7 From e675447bda51c1ea72d1ac9132ce3bed974f1da3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:15 +0000 Subject: timers: Make 'pinned' a timer property We want to move the timer migration logic from a 'push' to a 'pull' model. Under the current 'push' model pinned timers are handled via a runtime API variant: mod_timer_pinned(). The 'pull' model requires us to store the pinned attribute of a timer in the timer_list structure itself, as a new TIMER_PINNED bit in timer->flags. This flag must be set at initialization time and the timer APIs recognize the flag. This patch: - Implements the new flag and associated new-style initialization methods - makes mod_timer() recognize new-style pinned timers, - and adds some migration helper facility to allow step by step conversion of old-style to new-style pinned timers. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.049338558@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 25 ++++++++++++++++++++++--- kernel/time/timer.c | 10 +++++----- 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index 20ac746f3eb3..046d6cf26498 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -62,7 +62,8 @@ struct timer_list { #define TIMER_MIGRATING 0x00080000 #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) #define TIMER_DEFERRABLE 0x00100000 -#define TIMER_IRQSAFE 0x00200000 +#define TIMER_PINNED 0x00200000 +#define TIMER_IRQSAFE 0x00400000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ @@ -78,9 +79,15 @@ struct timer_list { #define TIMER_INITIALIZER(_function, _expires, _data) \ __TIMER_INITIALIZER((_function), (_expires), (_data), 0) +#define TIMER_PINNED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED) + #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) \ __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE) +#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED) + #define DEFINE_TIMER(_name, _function, _expires, _data) \ struct timer_list _name = \ TIMER_INITIALIZER(_function, _expires, _data) @@ -124,8 +131,12 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define init_timer(timer) \ __init_timer((timer), 0) +#define init_timer_pinned(timer) \ + __init_timer((timer), TIMER_PINNED) #define init_timer_deferrable(timer) \ __init_timer((timer), TIMER_DEFERRABLE) +#define init_timer_pinned_deferrable(timer) \ + __init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED) #define init_timer_on_stack(timer) \ __init_timer_on_stack((timer), 0) @@ -145,12 +156,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), 0) +#define setup_pinned_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_PINNED) #define setup_deferrable_timer(timer, fn, data) \ __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE) +#define setup_pinned_deferrable_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) #define setup_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), 0) +#define setup_pinned_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED) #define setup_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE) +#define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) /** * timer_pending - is a timer pending? @@ -175,8 +194,8 @@ extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); extern void set_timer_slack(struct timer_list *time, int slack_hz); -#define TIMER_NOT_PINNED 0 -#define TIMER_PINNED 1 +#define MOD_TIMER_NOT_PINNED 0 +#define MOD_TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..693f6d14058e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -782,7 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned); + new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED); if (base != new_base) { /* @@ -825,7 +825,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); + return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer_pending); @@ -900,7 +900,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); + return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); @@ -928,7 +928,7 @@ int mod_timer_pinned(struct timer_list *timer, unsigned long expires) if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires, false, TIMER_PINNED); + return __mod_timer(timer, expires, false, MOD_TIMER_PINNED); } EXPORT_SYMBOL(mod_timer_pinned); @@ -1512,7 +1512,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); + __mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.3-8-gc7d7 From 177ec0a0a531695210b277d734b2f92ee5796303 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:24 +0000 Subject: timers: Remove the deprecated mod_timer_pinned() API We switched all users to initialize the timers as pinned and call mod_timer(). Remove the now unused timer API function. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.706205231@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 3 --- kernel/time/timer.c | 39 +++++---------------------------------- 2 files changed, 5 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index 046d6cf26498..a8f6c70eb414 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -190,12 +190,9 @@ extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); -extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); extern void set_timer_slack(struct timer_list *time, int slack_hz); -#define MOD_TIMER_NOT_PINNED 0 -#define MOD_TIMER_PINNED 1 /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 693f6d14058e..ba49c1cf80f5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -764,8 +764,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base, *new_base; unsigned long flags; @@ -782,7 +781,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED); + new_base = get_target_base(base, timer->flags & TIMER_PINNED); if (base != new_base) { /* @@ -825,7 +824,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED); + return __mod_timer(timer, expires, true); } EXPORT_SYMBOL(mod_timer_pending); @@ -900,38 +899,10 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED); + return __mod_timer(timer, expires, false); } EXPORT_SYMBOL(mod_timer); -/** - * mod_timer_pinned - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pinned() is a way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * and to ensure that the timer is scheduled on the current CPU. - * - * Note that this does not prevent the timer from being migrated - * when the current CPU goes offline. If this is a problem for - * you, use CPU-hotplug notifiers to handle it correctly, for - * example, cancelling the timer when the corresponding CPU goes - * offline. - * - * mod_timer_pinned(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - */ -int mod_timer_pinned(struct timer_list *timer, unsigned long expires) -{ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires, false, MOD_TIMER_PINNED); -} -EXPORT_SYMBOL(mod_timer_pinned); - /** * add_timer - start a timer * @timer: the timer to be added @@ -1512,7 +1483,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.3-8-gc7d7 From 2b1ecc3d1a6b10f8fbac7f83d80db30b5a2c2791 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:25 +0000 Subject: signals: Use hrtimer for sigtimedwait() We've converted most timeout related syscalls to hrtimers, but sigtimedwait() did not get this treatment. Convert it so we get a reasonable accuracy and remove the user space exposure to the timer wheel properties. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Al Viro Cc: Arjan van de Ven Cc: Chris Mason Cc: Cyril Hrubis Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.787164909@linutronix.de Signed-off-by: Ingo Molnar --- kernel/signal.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 96e9bc40667f..af21afc00d08 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * @ts: upper bound on process time suspension */ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, - const struct timespec *ts) + const struct timespec *ts) { + ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX }; struct task_struct *tsk = current; - long timeout = MAX_SCHEDULE_TIMEOUT; sigset_t mask = *which; - int sig; + int sig, ret = 0; if (ts) { if (!timespec_valid(ts)) return -EINVAL; - timeout = timespec_to_jiffies(ts); - /* - * We can be close to the next tick, add another one - * to ensure we will wait at least the time asked for. - */ - if (ts->tv_sec || ts->tv_nsec) - timeout++; + timeout = timespec_to_ktime(*ts); + to = &timeout; } /* @@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); sig = dequeue_signal(tsk, &mask, info); - if (!sig && timeout) { + if (!sig && timeout.tv64) { /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when @@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - timeout = freezable_schedule_timeout_interruptible(timeout); - + __set_current_state(TASK_INTERRUPTIBLE); + ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, + HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); @@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, if (sig) return sig; - return timeout ? -EINTR : -EAGAIN; + return ret ? -EINTR : -EAGAIN; } /** -- cgit v1.3-8-gc7d7 From 494af3ed7848de08640d98ee5aff57a45c137c3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:28 +0000 Subject: timers: Give a few structs and members proper names Some of the names in the internal implementation of the timer code are not longer correct and others are simply too long to type. Clean it up before we switch the wheel implementation over to the new scheme. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094341.948752516@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 118 ++++++++++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ba49c1cf80f5..f259a3ef4577 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -77,10 +77,10 @@ struct tvec_root { struct hlist_head vec[TVR_SIZE]; }; -struct tvec_base { +struct timer_base { spinlock_t lock; struct timer_list *running_timer; - unsigned long timer_jiffies; + unsigned long clk; unsigned long next_timer; unsigned long active_timers; unsigned long all_timers; @@ -95,7 +95,7 @@ struct tvec_base { } ____cacheline_aligned; -static DEFINE_PER_CPU(struct tvec_base, tvec_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; @@ -106,15 +106,15 @@ void timers_update_migration(bool update_nohz) unsigned int cpu; /* Avoid the loop, if nothing to update */ - if (this_cpu_read(tvec_bases.migration_enabled) == on) + if (this_cpu_read(timer_bases.migration_enabled) == on) return; for_each_possible_cpu(cpu) { - per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(timer_bases.migration_enabled, cpu) = on; per_cpu(hrtimer_bases.migration_enabled, cpu) = on; if (!update_nohz) continue; - per_cpu(tvec_bases.nohz_active, cpu) = true; + per_cpu(timer_bases.nohz_active, cpu) = true; per_cpu(hrtimer_bases.nohz_active, cpu) = true; } } @@ -134,18 +134,18 @@ int timer_migration_handler(struct ctl_table *table, int write, return ret; } -static inline struct tvec_base *get_target_base(struct tvec_base *base, +static inline struct timer_base *get_target_base(struct timer_base *base, int pinned) { if (pinned || !base->migration_enabled) - return this_cpu_ptr(&tvec_bases); - return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); + return this_cpu_ptr(&timer_bases); + return per_cpu_ptr(&timer_bases, get_nohz_timer_target()); } #else -static inline struct tvec_base *get_target_base(struct tvec_base *base, +static inline struct timer_base *get_target_base(struct timer_base *base, int pinned) { - return this_cpu_ptr(&tvec_bases); + return this_cpu_ptr(&timer_bases); } #endif @@ -371,10 +371,10 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) EXPORT_SYMBOL_GPL(set_timer_slack); static void -__internal_add_timer(struct tvec_base *base, struct timer_list *timer) +__internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; + unsigned long idx = expires - base->clk; struct hlist_head *vec; if (idx < TVR_SIZE) { @@ -394,7 +394,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) * Can happen if you add a timer with expires == jiffies, * or you set a timer to go off in the past */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + vec = base->tv1.vec + (base->clk & TVR_MASK); } else { int i; /* If the timeout is larger than MAX_TVAL (on 64-bit @@ -403,7 +403,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) */ if (idx > MAX_TVAL) { idx = MAX_TVAL; - expires = idx + base->timer_jiffies; + expires = idx + base->clk; } i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; @@ -412,11 +412,11 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) hlist_add_head(&timer->entry, vec); } -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { /* Advance base->jiffies, if the base is empty */ if (!base->all_timers++) - base->timer_jiffies = jiffies; + base->clk = jiffies; __internal_add_timer(base, timer); /* @@ -707,7 +707,7 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) } static inline void -detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +detach_expired_timer(struct timer_list *timer, struct timer_base *base) { detach_timer(timer, true); if (!(timer->flags & TIMER_DEFERRABLE)) @@ -715,7 +715,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) base->all_timers--; } -static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, +static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { if (!timer_pending(timer)) @@ -725,16 +725,16 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, if (!(timer->flags & TIMER_DEFERRABLE)) { base->active_timers--; if (timer->expires == base->next_timer) - base->next_timer = base->timer_jiffies; + base->next_timer = base->clk; } /* If this was the last timer, advance base->jiffies */ if (!--base->all_timers) - base->timer_jiffies = jiffies; + base->clk = jiffies; return 1; } /* - * We are using hashed locking: holding per_cpu(tvec_bases).lock + * We are using hashed locking: holding per_cpu(timer_bases).lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * @@ -744,16 +744,16 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, * When the timer's base is locked and removed from the list, the * TIMER_MIGRATING flag is set, FIXME */ -static struct tvec_base *lock_timer_base(struct timer_list *timer, +static struct timer_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { for (;;) { u32 tf = timer->flags; - struct tvec_base *base; + struct timer_base *base; if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); + base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -766,7 +766,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { - struct tvec_base *base, *new_base; + struct timer_base *base, *new_base; unsigned long flags; int ret = 0; @@ -933,8 +933,8 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); - struct tvec_base *base; + struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu); + struct timer_base *base; unsigned long flags; timer_stats_timer_set_start_info(timer); @@ -975,7 +975,7 @@ EXPORT_SYMBOL_GPL(add_timer_on); */ int del_timer(struct timer_list *timer) { - struct tvec_base *base; + struct timer_base *base; unsigned long flags; int ret = 0; @@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(del_timer); */ int try_to_del_timer_sync(struct timer_list *timer) { - struct tvec_base *base; + struct timer_base *base; unsigned long flags; int ret = -1; @@ -1085,7 +1085,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static int cascade(struct tvec_base *base, struct tvec *tv, int index) +static int cascade(struct timer_base *base, struct tvec *tv, int index) { /* cascade all the timers from tv up one level */ struct timer_list *timer; @@ -1149,7 +1149,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), } } -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) +#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) /** * __run_timers - run all expired timers (if any) on this CPU. @@ -1158,23 +1158,23 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), * This function cascades all vectors and executes all expired timer * vectors. */ -static inline void __run_timers(struct tvec_base *base) +static inline void __run_timers(struct timer_base *base) { struct timer_list *timer; spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { + while (time_after_eq(jiffies, base->clk)) { struct hlist_head work_list; struct hlist_head *head = &work_list; int index; if (!base->all_timers) { - base->timer_jiffies = jiffies; + base->clk = jiffies; break; } - index = base->timer_jiffies & TVR_MASK; + index = base->clk & TVR_MASK; /* * Cascade timers: @@ -1184,7 +1184,7 @@ static inline void __run_timers(struct tvec_base *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; + ++base->clk; hlist_move_list(base->tv1.vec + index, head); while (!hlist_empty(head)) { void (*fn)(unsigned long); @@ -1222,16 +1222,16 @@ static inline void __run_timers(struct tvec_base *base) * is used on S/390 to stop all activity when a CPU is idle. * This function needs to be called with interrupts disabled. */ -static unsigned long __next_timer_interrupt(struct tvec_base *base) +static unsigned long __next_timer_interrupt(struct timer_base *base) { - unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + unsigned long clk = base->clk; + unsigned long expires = clk + NEXT_TIMER_MAX_DELTA; int index, slot, array, found = 0; struct timer_list *nte; struct tvec *varray[4]; /* Look for timer events in tv1. */ - index = slot = timer_jiffies & TVR_MASK; + index = slot = clk & TVR_MASK; do { hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { if (nte->flags & TIMER_DEFERRABLE) @@ -1250,8 +1250,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base) cascade: /* Calculate the next cascade event */ if (index) - timer_jiffies += TVR_SIZE - index; - timer_jiffies >>= TVR_BITS; + clk += TVR_SIZE - index; + clk >>= TVR_BITS; /* Check tv2-tv5. */ varray[0] = &base->tv2; @@ -1262,7 +1262,7 @@ cascade: for (array = 0; array < 4; array++) { struct tvec *varp = varray[array]; - index = slot = timer_jiffies & TVN_MASK; + index = slot = clk & TVN_MASK; do { hlist_for_each_entry(nte, varp->vec + slot, entry) { if (nte->flags & TIMER_DEFERRABLE) @@ -1286,8 +1286,8 @@ cascade: } while (slot != index); if (index) - timer_jiffies += TVN_SIZE - index; - timer_jiffies >>= TVN_BITS; + clk += TVN_SIZE - index; + clk >>= TVN_BITS; } return expires; } @@ -1335,7 +1335,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases); u64 expires = KTIME_MAX; unsigned long nextevt; @@ -1348,7 +1348,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); if (base->active_timers) { - if (time_before_eq(base->next_timer, base->timer_jiffies)) + if (time_before_eq(base->next_timer, base->clk)) base->next_timer = __next_timer_interrupt(base); nextevt = base->next_timer; if (time_before_eq(nextevt, basej)) @@ -1387,9 +1387,9 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases); - if (time_after_eq(jiffies, base->timer_jiffies)) + if (time_after_eq(jiffies, base->clk)) __run_timers(base); } @@ -1534,7 +1534,7 @@ signed long __sched schedule_timeout_idle(signed long timeout) EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; int cpu = new_base->cpu; @@ -1550,13 +1550,13 @@ static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *he static void migrate_timers(int cpu) { - struct tvec_base *old_base; - struct tvec_base *new_base; + struct timer_base *old_base; + struct timer_base *new_base; int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu_ptr(&tvec_bases, cpu); - new_base = get_cpu_ptr(&tvec_bases); + old_base = per_cpu_ptr(&timer_bases, cpu); + new_base = get_cpu_ptr(&timer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1580,7 +1580,7 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_ptr(&tvec_bases); + put_cpu_ptr(&timer_bases); } static int timer_cpu_notify(struct notifier_block *self, @@ -1608,13 +1608,13 @@ static inline void timer_register_cpu_notifier(void) { } static void __init init_timer_cpu(int cpu) { - struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); + struct timer_base *base = per_cpu_ptr(&timer_bases, cpu); base->cpu = cpu; spin_lock_init(&base->lock); - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; + base->clk = jiffies; + base->next_timer = base->clk; } static void __init init_timer_cpus(void) -- cgit v1.3-8-gc7d7 From 500462a9de657f86edaa102f8ab6bff7f7e43fc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:30 +0000 Subject: timers: Switch to a non-cascading wheel The current timer wheel has some drawbacks: 1) Cascading: Cascading can be an unbound operation and is completely pointless in most cases because the vast majority of the timer wheel timers are canceled or rearmed before expiration. (They are used as timeout safeguards, not as real timers to measure time.) 2) No fast lookup of the next expiring timer: In NOHZ scenarios the first timer soft interrupt after a long NOHZ period must fast forward the base time to the current value of jiffies. As we have no way to find the next expiring timer fast, the code loops linearly and increments the base time one by one and checks for expired timers in each step. This causes unbound overhead spikes exactly in the moment when we should wake up as fast as possible. After a thorough analysis of real world data gathered on laptops, workstations, webservers and other machines (thanks Chris!) I came to the conclusion that the current 'classic' timer wheel implementation can be modified to address the above issues. The vast majority of timer wheel timers is canceled or rearmed before expiry. Most of them are timeouts for networking and other I/O tasks. The nature of timeouts is to catch the exception from normal operation (TCP ack timed out, disk does not respond, etc.). For these kinds of timeouts the accuracy of the timeout is not really a concern. Timeouts are very often approximate worst-case values and in case the timeout fires, we already waited for a long time and performance is down the drain already. The few timers which actually expire can be split into two categories: 1) Short expiry times which expect halfways accurate expiry 2) Long term expiry times are inaccurate today already due to the batching which is done for NOHZ automatically and also via the set_timer_slack() API. So for long term expiry timers we can avoid the cascading property and just leave them in the less granular outer wheels until expiry or cancelation. Timers which are armed with a timeout larger than the wheel capacity are no longer cascaded. We expire them with the longest possible timeout (6+ days). We have not observed such timeouts in our data collection, but at least we handle them, applying the rule of the least surprise. To avoid extending the wheel levels for HZ=1000 so we can accomodate the longest observed timeouts (5 days in the network conntrack code) we reduce the first level granularity on HZ=1000 to 4ms, which effectively is the same as the HZ=250 behaviour. From our data analysis there is nothing which relies on that 1ms granularity and as a side effect we get better batching and timer locality for the networking code as well. Contrary to the classic wheel the granularity of the next wheel is not the capacity of the first wheel. The granularities of the wheels are in the currently chosen setting 8 times the granularity of the previous wheel. So for HZ=250 we end up with the following granularity levels: Level Offset Granularity Range 0 0 4 ms 0 ms - 252 ms 1 64 32 ms 256 ms - 2044 ms (256ms - ~2s) 2 128 256 ms 2048 ms - 16380 ms (~2s - ~16s) 3 192 2048 ms (~2s) 16384 ms - 131068 ms (~16s - ~2m) 4 256 16384 ms (~16s) 131072 ms - 1048572 ms (~2m - ~17m) 5 320 131072 ms (~2m) 1048576 ms - 8388604 ms (~17m - ~2h) 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) That's a worst case inaccuracy of 12.5% for the timers which are queued at the beginning of a level. So the new wheel concept addresses the old issues: 1) Cascading is avoided completely 2) By keeping the timers in the bucket until expiry/cancelation we can track the buckets which have timers enqueued in a bucket bitmap and therefore can look up the next expiring timer very fast and O(1). A further benefit of the concept is that the slack calculation which is done on every timer start is no longer necessary because the granularity levels provide natural batching already. Our extensive testing with various loads did not show any performance degradation vs. the current wheel implementation. This patch does not address the 'fast lookup' issue as we wanted to make sure that there is no regression introduced by the wheel redesign. The optimizations are in follow up patches. This patch contains fixes from Anna-Maria Gleixner and Richard Cochran. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.108621834@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/timer.h | 2 + kernel/time/timer.c | 829 ++++++++++++++++++++++++++++---------------------- 2 files changed, 469 insertions(+), 362 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index 989f33d16ebf..5869ab9848fe 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -64,6 +64,8 @@ struct timer_list { #define TIMER_DEFERRABLE 0x00080000 #define TIMER_PINNED 0x00100000 #define TIMER_IRQSAFE 0x00200000 +#define TIMER_ARRAYSHIFT 22 +#define TIMER_ARRAYMASK 0xFFC00000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f259a3ef4577..86e95b72665d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -59,43 +59,151 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* - * per-CPU timer vector definitions: + * The timer wheel has LVL_DEPTH array levels. Each level provides an array of + * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * level has a different granularity. + * + * The level granularity is: LVL_CLK_DIV ^ lvl + * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) + * + * The array level of a newly armed timer depends on the relative expiry + * time. The farther the expiry time is away the higher the array level and + * therefor the granularity becomes. + * + * Contrary to the original timer wheel implementation, which aims for 'exact' + * expiry of the timers, this implementation removes the need for recascading + * the timers into the lower array levels. The previous 'classic' timer wheel + * implementation of the kernel already violated the 'exact' expiry by adding + * slack to the expiry time to provide batched expiration. The granularity + * levels provide implicit batching. + * + * This is an optimization of the original timer wheel implementation for the + * majority of the timer wheel use cases: timeouts. The vast majority of + * timeout timers (networking, disk I/O ...) are canceled before expiry. If + * the timeout expires it indicates that normal operation is disturbed, so it + * does not matter much whether the timeout comes with a slight delay. + * + * The only exception to this are networking timers with a small expiry + * time. They rely on the granularity. Those fit into the first wheel level, + * which has HZ granularity. + * + * We don't have cascading anymore. timers with a expiry time above the + * capacity of the last wheel level are force expired at the maximum timeout + * value of the last wheel level. From data sampling we know that the maximum + * value observed is 5 days (network connection tracking), so this should not + * be an issue. + * + * The currently chosen array constants values are a good compromise between + * array size and granularity. + * + * This results in the following granularity and range levels: + * + * HZ 1000 steps + * Level Offset Granularity Range + * 0 0 1 ms 0 ms - 63 ms + * 1 64 8 ms 64 ms - 511 ms + * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) + * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) + * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) + * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) + * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) + * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) + * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) + * + * HZ 300 + * Level Offset Granularity Range + * 0 0 3 ms 0 ms - 210 ms + * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) + * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) + * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) + * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) + * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) + * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) + * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) + * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) + * + * HZ 250 + * Level Offset Granularity Range + * 0 0 4 ms 0 ms - 255 ms + * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) + * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) + * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) + * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) + * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) + * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) + * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) + * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) + * + * HZ 100 + * Level Offset Granularity Range + * 0 0 10 ms 0 ms - 630 ms + * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) + * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) + * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) + * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) + * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) + * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) + * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) -#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) - -struct tvec { - struct hlist_head vec[TVN_SIZE]; -}; -struct tvec_root { - struct hlist_head vec[TVR_SIZE]; -}; +/* Clock divisor for the next level */ +#define LVL_CLK_SHIFT 3 +#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) +#define LVL_CLK_MASK (LVL_CLK_DIV - 1) +#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) +#define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) + +/* + * The time start value for each level to select the bucket at enqueue + * time. + */ +#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +/* Size of each clock level */ +#define LVL_BITS 6 +#define LVL_SIZE (1UL << LVL_BITS) +#define LVL_MASK (LVL_SIZE - 1) +#define LVL_OFFS(n) ((n) * LVL_SIZE) + +/* Level depth */ +#if HZ > 100 +# define LVL_DEPTH 9 +# else +# define LVL_DEPTH 8 +#endif + +/* The cutoff (max. capacity of the wheel) */ +#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) +#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) + +/* + * The resulting wheel size. If NOHZ is configured we allocate two + * wheels so we have a separate storage for the deferrable timers. + */ +#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) + +#ifdef CONFIG_NO_HZ_COMMON +# define NR_BASES 2 +# define BASE_STD 0 +# define BASE_DEF 1 +#else +# define NR_BASES 1 +# define BASE_STD 0 +# define BASE_DEF 0 +#endif struct timer_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long clk; - unsigned long next_timer; - unsigned long active_timers; - unsigned long all_timers; - int cpu; - bool migration_enabled; - bool nohz_active; - struct tvec_root tv1; - struct tvec tv2; - struct tvec tv3; - struct tvec tv4; - struct tvec tv5; + spinlock_t lock; + struct timer_list *running_timer; + unsigned long clk; + unsigned int cpu; + bool migration_enabled; + bool nohz_active; + DECLARE_BITMAP(pending_map, WHEEL_SIZE); + struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; - -static DEFINE_PER_CPU(struct timer_base, timer_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; @@ -106,15 +214,17 @@ void timers_update_migration(bool update_nohz) unsigned int cpu; /* Avoid the loop, if nothing to update */ - if (this_cpu_read(timer_bases.migration_enabled) == on) + if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) return; for_each_possible_cpu(cpu) { - per_cpu(timer_bases.migration_enabled, cpu) = on; + per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; + per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; per_cpu(hrtimer_bases.migration_enabled, cpu) = on; if (!update_nohz) continue; - per_cpu(timer_bases.nohz_active, cpu) = true; + per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; + per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; per_cpu(hrtimer_bases.nohz_active, cpu) = true; } } @@ -133,20 +243,6 @@ int timer_migration_handler(struct ctl_table *table, int write, mutex_unlock(&mutex); return ret; } - -static inline struct timer_base *get_target_base(struct timer_base *base, - int pinned) -{ - if (pinned || !base->migration_enabled) - return this_cpu_ptr(&timer_bases); - return per_cpu_ptr(&timer_bases, get_nohz_timer_target()); -} -#else -static inline struct timer_base *get_target_base(struct timer_base *base, - int pinned) -{ - return this_cpu_ptr(&timer_bases); -} #endif static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -370,78 +466,91 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); +static inline unsigned int timer_get_idx(struct timer_list *timer) +{ + return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; +} + +static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) +{ + timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | + idx << TIMER_ARRAYSHIFT; +} + +/* + * Helper function to calculate the array index for a given expiry + * time. + */ +static inline unsigned calc_index(unsigned expires, unsigned lvl) +{ + expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + return LVL_OFFS(lvl) + (expires & LVL_MASK); +} + static void __internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; - unsigned long idx = expires - base->clk; + unsigned long delta = expires - base->clk; struct hlist_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->clk & TVR_MASK); + unsigned int idx; + + if (delta < LVL_START(1)) { + idx = calc_index(expires, 0); + } else if (delta < LVL_START(2)) { + idx = calc_index(expires, 1); + } else if (delta < LVL_START(3)) { + idx = calc_index(expires, 2); + } else if (delta < LVL_START(4)) { + idx = calc_index(expires, 3); + } else if (delta < LVL_START(5)) { + idx = calc_index(expires, 4); + } else if (delta < LVL_START(6)) { + idx = calc_index(expires, 5); + } else if (delta < LVL_START(7)) { + idx = calc_index(expires, 6); + } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { + idx = calc_index(expires, 7); + } else if ((long) delta < 0) { + idx = base->clk & LVL_MASK; } else { - int i; - /* If the timeout is larger than MAX_TVAL (on 64-bit - * architectures or with CONFIG_BASE_SMALL=1) then we - * use the maximum timeout. + /* + * Force expire obscene large timeouts to expire at the + * capacity limit of the wheel. */ - if (idx > MAX_TVAL) { - idx = MAX_TVAL; - expires = idx + base->clk; - } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; - } + if (expires >= WHEEL_TIMEOUT_CUTOFF) + expires = WHEEL_TIMEOUT_MAX; + idx = calc_index(expires, LVL_DEPTH - 1); + } + /* + * Enqueue the timer into the array bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ + vec = base->vectors + idx; hlist_add_head(&timer->entry, vec); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - /* Advance base->jiffies, if the base is empty */ - if (!base->all_timers++) - base->clk = jiffies; - __internal_add_timer(base, timer); - /* - * Update base->active_timers and base->next_timer - */ - if (!(timer->flags & TIMER_DEFERRABLE)) { - if (!base->active_timers++ || - time_before(timer->expires, base->next_timer)) - base->next_timer = timer->expires; - } /* * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. - * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. + * to be triggered to reevaluate the timer wheel. We are + * protected against the other CPU fiddling with the timer by + * holding the timer base lock. This also makes sure that a + * CPU on the way to stop its tick can not evaluate the timer + * wheel. * * Spare the IPI for deferrable timers on idle targets though. * The next busy ticks will take care of it. Except full dynticks * require special care against races with idle_cpu(), lets deal * with that later. */ - if (base->nohz_active) { + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); @@ -706,54 +815,87 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) entry->next = LIST_POISON2; } -static inline void -detach_expired_timer(struct timer_list *timer, struct timer_base *base) -{ - detach_timer(timer, true); - if (!(timer->flags & TIMER_DEFERRABLE)) - base->active_timers--; - base->all_timers--; -} - static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { + unsigned idx = timer_get_idx(timer); + if (!timer_pending(timer)) return 0; + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + __clear_bit(idx, base->pending_map); + detach_timer(timer, clear_pending); - if (!(timer->flags & TIMER_DEFERRABLE)) { - base->active_timers--; - if (timer->expires == base->next_timer) - base->next_timer = base->clk; - } - /* If this was the last timer, advance base->jiffies */ - if (!--base->all_timers) - base->clk = jiffies; return 1; } +static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) +{ + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + + /* + * If the timer is deferrable and nohz is active then we need to use + * the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && + (tflags & TIMER_DEFERRABLE)) + base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); + return base; +} + +static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * If the timer is deferrable and nohz is active then we need to use + * the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && + (tflags & TIMER_DEFERRABLE)) + base = this_cpu_ptr(&timer_bases[BASE_DEF]); + return base; +} + +static inline struct timer_base *get_timer_base(u32 tflags) +{ + return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); +} + +static inline struct timer_base *get_target_base(struct timer_base *base, + unsigned tflags) +{ +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) + if ((tflags & TIMER_PINNED) || !base->migration_enabled) + return get_timer_this_cpu_base(tflags); + return get_timer_cpu_base(tflags, get_nohz_timer_target()); +#else + return get_timer_this_cpu_base(tflags); +#endif +} + /* - * We are using hashed locking: holding per_cpu(timer_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself + * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. + * be found in the base->vectors array. * - * When the timer's base is locked and removed from the list, the - * TIMER_MIGRATING flag is set, FIXME + * When a timer is migrating then the TIMER_MIGRATING flag is set and we need + * to wait until the migration is done. */ static struct timer_base *lock_timer_base(struct timer_list *timer, - unsigned long *flags) + unsigned long *flags) __acquires(timer->base->lock) { for (;;) { - u32 tf = timer->flags; struct timer_base *base; + u32 tf = timer->flags; if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK); + base = get_timer_base(tf); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -770,6 +912,27 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long flags; int ret = 0; + /* + * TODO: Calculate the array bucket of the timer right here w/o + * holding the base lock. This allows to check not only + * timer->expires == expires below, but also whether the timer + * ends up in the same bucket. If we really need to requeue + * the timer then we check whether base->clk have + * advanced between here and locking the timer base. If + * jiffies advanced we have to recalc the array bucket with the + * lock held. + */ + + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer_pending(timer)) { + if (timer->expires == expires) + return 1; + } + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -781,15 +944,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags & TIMER_PINNED); + new_base = get_target_base(base, timer->flags); if (base != new_base) { /* - * We are trying to schedule the timer on the local CPU. + * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. + * handler yet has not finished. This also guarantees that the + * timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ @@ -828,45 +991,6 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) } EXPORT_SYMBOL(mod_timer_pending); -/* - * Decide where to put the timer while taking the slack into account - * - * Algorithm: - * 1) calculate the maximum (absolute) time - * 2) calculate the highest bit where the expires and new max are different - * 3) use this bit to make a mask - * 4) use the bitmask to round down the maximum time, so that all last - * bits are zeros - */ -static inline -unsigned long apply_slack(struct timer_list *timer, unsigned long expires) -{ - unsigned long expires_limit, mask; - int bit; - - if (timer->slack >= 0) { - expires_limit = expires + timer->slack; - } else { - long delta = expires - jiffies; - - if (delta < 256) - return expires; - - expires_limit = expires + delta / 256; - } - mask = expires ^ expires_limit; - if (mask == 0) - return expires; - - bit = __fls(mask); - - mask = (1UL << bit) - 1; - - expires_limit = expires_limit & ~(mask); - - return expires_limit; -} - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -889,16 +1013,6 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) */ int mod_timer(struct timer_list *timer, unsigned long expires) { - expires = apply_slack(timer, expires); - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer_pending(timer) && timer->expires == expires) - return 1; - return __mod_timer(timer, expires, false); } EXPORT_SYMBOL(mod_timer); @@ -933,13 +1047,14 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu); - struct timer_base *base; + struct timer_base *new_base, *base; unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); + new_base = get_timer_cpu_base(timer->flags, cpu); + /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the @@ -1085,27 +1200,6 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static int cascade(struct timer_base *base, struct tvec *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer; - struct hlist_node *tmp; - struct hlist_head tv_list; - - hlist_move_list(tv->vec + index, &tv_list); - - /* - * We are removing _all_ timers from the list, so we - * don't have to detach them individually. - */ - hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { - /* No accounting, while moving them */ - __internal_add_timer(base, timer); - } - - return index; -} - static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), unsigned long data) { @@ -1149,68 +1243,80 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), } } -#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) +static void expire_timers(struct timer_base *base, struct hlist_head *head) +{ + while (!hlist_empty(head)) { + struct timer_list *timer; + void (*fn)(unsigned long); + unsigned long data; + + timer = hlist_entry(head->first, struct timer_list, entry); + timer_stats_account_timer(timer); + + base->running_timer = timer; + detach_timer(timer, true); + + fn = timer->function; + data = timer->data; + + if (timer->flags & TIMER_IRQSAFE) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } + } +} + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + unsigned long clk = base->clk; + struct hlist_head *vec; + int i, levels = 0; + unsigned int idx; + + for (i = 0; i < LVL_DEPTH; i++) { + idx = (clk & LVL_MASK) + i * LVL_SIZE; + + if (__test_and_clear_bit(idx, base->pending_map)) { + vec = base->vectors + idx; + hlist_move_list(vec, heads++); + levels++; + } + /* Is it time to look at the next level? */ + if (clk & LVL_CLK_MASK) + break; + /* Shift clock for the next level granularity */ + clk >>= LVL_CLK_SHIFT; + } + return levels; +} /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. */ static inline void __run_timers(struct timer_base *base) { - struct timer_list *timer; + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (!time_after_eq(jiffies, base->clk)) + return; spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->clk)) { - struct hlist_head work_list; - struct hlist_head *head = &work_list; - int index; - if (!base->all_timers) { - base->clk = jiffies; - break; - } - - index = base->clk & TVR_MASK; + levels = collect_expired_timers(base, heads); + base->clk++; - /* - * Cascade timers: - */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->clk; - hlist_move_list(base->tv1.vec + index, head); - while (!hlist_empty(head)) { - void (*fn)(unsigned long); - unsigned long data; - bool irqsafe; - - timer = hlist_entry(head->first, struct timer_list, entry); - fn = timer->function; - data = timer->data; - irqsafe = timer->flags & TIMER_IRQSAFE; - - timer_stats_account_timer(timer); - - base->running_timer = timer; - detach_expired_timer(timer, base); - - if (irqsafe) { - spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock(&base->lock); - } else { - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); - } - } + while (levels--) + expire_timers(base, heads + levels); } base->running_timer = NULL; spin_unlock_irq(&base->lock); @@ -1218,78 +1324,87 @@ static inline void __run_timers(struct timer_base *base) #ifdef CONFIG_NO_HZ_COMMON /* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a CPU is idle. - * This function needs to be called with interrupts disabled. + * Find the next pending bucket of a level. Search from @offset + @clk upwards + * and if nothing there, search from start of the level (@offset) up to + * @offset + clk. + */ +static int next_pending_bucket(struct timer_base *base, unsigned offset, + unsigned clk) +{ + unsigned pos, start = offset + clk; + unsigned end = offset + LVL_SIZE; + + pos = find_next_bit(base->pending_map, end, start); + if (pos < end) + return pos - start; + + pos = find_next_bit(base->pending_map, start, offset); + return pos < start ? pos + LVL_SIZE - start : -1; +} + +/* + * Search the first expiring timer in the various clock levels. */ static unsigned long __next_timer_interrupt(struct timer_base *base) { - unsigned long clk = base->clk; - unsigned long expires = clk + NEXT_TIMER_MAX_DELTA; - int index, slot, array, found = 0; - struct timer_list *nte; - struct tvec *varray[4]; - - /* Look for timer events in tv1. */ - index = slot = clk & TVR_MASK; - do { - hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (nte->flags & TIMER_DEFERRABLE) - continue; - - found = 1; - expires = nte->expires; - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - goto cascade; - return expires; + unsigned long clk, next, adj; + unsigned lvl, offset = 0; + + spin_lock(&base->lock); + next = base->clk + NEXT_TIMER_MAX_DELTA; + clk = base->clk; + for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { + int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + + if (pos >= 0) { + unsigned long tmp = clk + (unsigned long) pos; + + tmp <<= LVL_SHIFT(lvl); + if (time_before(tmp, next)) + next = tmp; } - slot = (slot + 1) & TVR_MASK; - } while (slot != index); - -cascade: - /* Calculate the next cascade event */ - if (index) - clk += TVR_SIZE - index; - clk >>= TVR_BITS; - - /* Check tv2-tv5. */ - varray[0] = &base->tv2; - varray[1] = &base->tv3; - varray[2] = &base->tv4; - varray[3] = &base->tv5; - - for (array = 0; array < 4; array++) { - struct tvec *varp = varray[array]; - - index = slot = clk & TVN_MASK; - do { - hlist_for_each_entry(nte, varp->vec + slot, entry) { - if (nte->flags & TIMER_DEFERRABLE) - continue; - - found = 1; - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - /* - * Do we still search for the first timer or are - * we looking up the cascade buckets ? - */ - if (found) { - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - break; - return expires; - } - slot = (slot + 1) & TVN_MASK; - } while (slot != index); - - if (index) - clk += TVN_SIZE - index; - clk >>= TVN_BITS; + /* + * Clock for the next level. If the current level clock lower + * bits are zero, we look at the next level as is. If not we + * need to advance it by one because that's going to be the + * next expiring bucket in that level. base->clk is the next + * expiring jiffie. So in case of: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 0 + * + * we have to look at all levels @index 0. With + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 2 + * + * LVL0 has the next expiring bucket @index 2. The upper + * levels have the next expiring bucket @index 1. + * + * In case that the propagation wraps the next level the same + * rules apply: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 F 2 + * + * So after looking at LVL0 we get: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 + * 0 0 0 1 0 + * + * So no propagation from LVL1 to LVL2 because that happened + * with the add already, but then we need to propagate further + * from LVL2 to LVL3. + * + * So the simple check whether the lower bits of the current + * level are 0 or not is sufficient for all cases. + */ + adj = clk & LVL_CLK_MASK ? 1 : 0; + clk >>= LVL_CLK_SHIFT; + clk += adj; } - return expires; + spin_unlock(&base->lock); + return next; } /* @@ -1335,7 +1450,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct timer_base *base = this_cpu_ptr(&timer_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; @@ -1346,17 +1461,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (cpu_is_offline(smp_processor_id())) return expires; - spin_lock(&base->lock); - if (base->active_timers) { - if (time_before_eq(base->next_timer, base->clk)) - base->next_timer = __next_timer_interrupt(base); - nextevt = base->next_timer; - if (time_before_eq(nextevt, basej)) - expires = basem; - else - expires = basem + (nextevt - basej) * TICK_NSEC; - } - spin_unlock(&base->lock); + nextevt = __next_timer_interrupt(base); + if (time_before_eq(nextevt, basej)) + expires = basem; + else + expires = basem + (nextevt - basej) * TICK_NSEC; return cmp_next_hrtimer_event(basem, expires); } @@ -1387,10 +1496,11 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct timer_base *base = this_cpu_ptr(&timer_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - if (time_after_eq(jiffies, base->clk)) - __run_timers(base); + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } /* @@ -1541,7 +1651,6 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); @@ -1552,35 +1661,29 @@ static void migrate_timers(int cpu) { struct timer_base *old_base; struct timer_base *new_base; - int i; + int b, i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu_ptr(&timer_bases, cpu); - new_base = get_cpu_ptr(&timer_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. - */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - BUG_ON(old_base->running_timer); - - for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); - for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); - } - old_base->active_timers = 0; - old_base->all_timers = 0; + for (b = 0; b < NR_BASES; b++) { + old_base = per_cpu_ptr(&timer_bases[b], cpu); + new_base = get_cpu_ptr(&timer_bases[b]); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i); - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_ptr(&timer_bases); + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_ptr(&timer_bases); + } } static int timer_cpu_notify(struct notifier_block *self, @@ -1608,13 +1711,15 @@ static inline void timer_register_cpu_notifier(void) { } static void __init init_timer_cpu(int cpu) { - struct timer_base *base = per_cpu_ptr(&timer_bases, cpu); - - base->cpu = cpu; - spin_lock_init(&base->lock); + struct timer_base *base; + int i; - base->clk = jiffies; - base->next_timer = base->clk; + for (i = 0; i < NR_BASES; i++) { + base = per_cpu_ptr(&timer_bases[i], cpu); + base->cpu = cpu; + spin_lock_init(&base->lock); + base->clk = jiffies; + } } static void __init init_timer_cpus(void) -- cgit v1.3-8-gc7d7 From 53bf837b78d155b8e1110b3c25b4d0d6391b8ff3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:31 +0000 Subject: timers: Remove set_timer_slack() leftovers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now have implicit batching in the timer wheel. The slack API is no longer used, so remove it. Signed-off-by: Thomas Gleixner Cc: Alan Stern Cc: Andrew F. Davis Cc: Arjan van de Ven Cc: Chris Mason Cc: David S. Miller Cc: David Woodhouse Cc: Dmitry Eremin-Solenikov Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Greg Kroah-Hartman Cc: Jaehoon Chung Cc: Jens Axboe Cc: John Stultz Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Mathias Nyman Cc: Pali Rohár Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sebastian Reichel Cc: Ulf Hansson Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: netdev@vger.kernel.org Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.189813118@linutronix.de Signed-off-by: Ingo Molnar --- block/genhd.c | 5 ----- drivers/mmc/host/jz4740_mmc.c | 2 -- drivers/power/bq27xxx_battery.c | 5 +---- drivers/usb/host/ohci-hcd.c | 1 - drivers/usb/host/xhci.c | 2 -- include/linux/timer.h | 4 ---- kernel/time/timer.c | 19 ------------------- lib/random32.c | 1 - 8 files changed, 1 insertion(+), 38 deletions(-) (limited to 'kernel') diff --git a/block/genhd.c b/block/genhd.c index 9f42526b4d62..f06d7f3b075b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1523,12 +1523,7 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) if (--ev->block) goto out_unlock; - /* - * Not exactly a latency critical operation, set poll timer - * slack to 25% and kick event check. - */ intv = disk_events_poll_jiffies(disk); - set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) queue_delayed_work(system_freezable_power_efficient_wq, &ev->dwork, 0); diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index 03ddf0ecf402..684087db170b 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -1068,8 +1068,6 @@ static int jz4740_mmc_probe(struct platform_device* pdev) jz4740_mmc_clock_disable(host); setup_timer(&host->timeout_timer, jz4740_mmc_timeout, (unsigned long)host); - /* It is not important when it times out, it just needs to timeout. */ - set_timer_slack(&host->timeout_timer, HZ); host->use_dma = true; if (host->use_dma && jz4740_mmc_acquire_dma_channels(host) != 0) diff --git a/drivers/power/bq27xxx_battery.c b/drivers/power/bq27xxx_battery.c index 45f6ebf88df6..e90b3f307e0f 100644 --- a/drivers/power/bq27xxx_battery.c +++ b/drivers/power/bq27xxx_battery.c @@ -735,11 +735,8 @@ static void bq27xxx_battery_poll(struct work_struct *work) bq27xxx_battery_update(di); - if (poll_interval > 0) { - /* The timer does not have to be accurate. */ - set_timer_slack(&di->work.timer, poll_interval * HZ / 4); + if (poll_interval > 0) schedule_delayed_work(&di->work, poll_interval * HZ); - } } /* diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 0449235d4f22..1700908b84ef 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -500,7 +500,6 @@ static int ohci_init (struct ohci_hcd *ohci) setup_timer(&ohci->io_watchdog, io_watchdog_func, (unsigned long) ohci); - set_timer_slack(&ohci->io_watchdog, msecs_to_jiffies(20)); ohci->hcca = dma_alloc_coherent (hcd->self.controller, sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index f2f9518c53ab..a986fe763d87 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -490,8 +490,6 @@ static void compliance_mode_recovery_timer_init(struct xhci_hcd *xhci) xhci->comp_mode_recovery_timer.expires = jiffies + msecs_to_jiffies(COMP_MODE_RCVRY_MSECS); - set_timer_slack(&xhci->comp_mode_recovery_timer, - msecs_to_jiffies(COMP_MODE_RCVRY_MSECS)); add_timer(&xhci->comp_mode_recovery_timer); xhci_dbg_trace(xhci, trace_xhci_dbg_quirks, "Compliance mode recovery timer initialized"); diff --git a/include/linux/timer.h b/include/linux/timer.h index 5869ab9848fe..4419506b564e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -19,7 +19,6 @@ struct timer_list { void (*function)(unsigned long); unsigned long data; u32 flags; - int slack; #ifdef CONFIG_TIMER_STATS int start_pid; @@ -73,7 +72,6 @@ struct timer_list { .expires = (_expires), \ .data = (_data), \ .flags = (_flags), \ - .slack = -1, \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ } @@ -193,8 +191,6 @@ extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); -extern void set_timer_slack(struct timer_list *time, int slack_hz); - /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 86e95b72665d..a83e23d0bc25 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -447,24 +447,6 @@ unsigned long round_jiffies_up_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); -/** - * set_timer_slack - set the allowed slack for a timer - * @timer: the timer to be modified - * @slack_hz: the amount of time (in jiffies) allowed for rounding - * - * Set the amount of time, in jiffies, that a certain timer has - * in terms of slack. By setting this value, the timer subsystem - * will schedule the actual timer somewhere between - * the time mod_timer() asks for, and that time plus the slack. - * - * By setting the slack to -1, a percentage of the delay is used - * instead. - */ -void set_timer_slack(struct timer_list *timer, int slack_hz) -{ - timer->slack = slack_hz; -} -EXPORT_SYMBOL_GPL(set_timer_slack); static inline unsigned int timer_get_idx(struct timer_list *timer) { @@ -775,7 +757,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, { timer->entry.pprev = NULL; timer->flags = flags | raw_smp_processor_id(); - timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; timer->start_pid = -1; diff --git a/lib/random32.c b/lib/random32.c index 510d1ce7d4d2..69ed593aab07 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -233,7 +233,6 @@ static void __prandom_timer(unsigned long dontcare) static void __init __prandom_start_seed_timer(void) { - set_timer_slack(&seed_timer, HZ); seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); add_timer(&seed_timer); } -- cgit v1.3-8-gc7d7 From 73420fea80c6c376d91a69defe64013baa0d7e95 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:33 +0000 Subject: timers: Move __run_timers() function Move __run_timers() below __next_timer_interrupt() and next_pending_bucket() in preparation for __run_timers() NOHZ optimization. No functional change. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.271872665@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a83e23d0bc25..c16c48de01c5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1277,32 +1277,6 @@ static int collect_expired_timers(struct timer_base *base, return levels; } -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - */ -static inline void __run_timers(struct timer_base *base) -{ - struct hlist_head heads[LVL_DEPTH]; - int levels; - - if (!time_after_eq(jiffies, base->clk)) - return; - - spin_lock_irq(&base->lock); - - while (time_after_eq(jiffies, base->clk)) { - - levels = collect_expired_timers(base, heads); - base->clk++; - - while (levels--) - expire_timers(base, heads + levels); - } - base->running_timer = NULL; - spin_unlock_irq(&base->lock); -} - #ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. Search from @offset + @clk upwards @@ -1472,6 +1446,32 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + */ +static inline void __run_timers(struct timer_base *base) +{ + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (!time_after_eq(jiffies, base->clk)) + return; + + spin_lock_irq(&base->lock); + + while (time_after_eq(jiffies, base->clk)) { + + levels = collect_expired_timers(base, heads); + base->clk++; + + while (levels--) + expire_timers(base, heads + levels); + } + base->running_timer = NULL; + spin_unlock_irq(&base->lock); +} + /* * This function runs timers and the timer-tq in bottom half context. */ -- cgit v1.3-8-gc7d7 From 236968383cf5cd48835ff0d8a265e299e220d140 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:34 +0000 Subject: timers: Optimize collect_expired_timers() for NOHZ After a NOHZ idle sleep the timer wheel must be forwarded to current jiffies. There might be expired timers so the current code loops and checks the expired buckets for timers. This can take quite some time for long NOHZ idle periods. The pending bitmask in the timer base allows us to do a quick search for the next expiring timer and therefore a fast forward of the base time which prevents pointless long lasting loops. For a 3 seconds idle sleep this reduces the catchup time from ~1ms to 5us. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.351296290@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c16c48de01c5..658051c97a3c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1252,8 +1252,8 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int __collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { unsigned long clk = base->clk; struct hlist_head *vec; @@ -1279,9 +1279,9 @@ static int collect_expired_timers(struct timer_base *base, #ifdef CONFIG_NO_HZ_COMMON /* - * Find the next pending bucket of a level. Search from @offset + @clk upwards - * and if nothing there, search from start of the level (@offset) up to - * @offset + clk. + * Find the next pending bucket of a level. Search from level start (@offset) + * + @clk upwards and if nothing there, search from start of the level + * (@offset) up to @offset + clk. */ static int next_pending_bucket(struct timer_base *base, unsigned offset, unsigned clk) @@ -1298,14 +1298,14 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, } /* - * Search the first expiring timer in the various clock levels. + * Search the first expiring timer in the various clock levels. Caller must + * hold base->lock. */ static unsigned long __next_timer_interrupt(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; - spin_lock(&base->lock); next = base->clk + NEXT_TIMER_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { @@ -1358,7 +1358,6 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk >>= LVL_CLK_SHIFT; clk += adj; } - spin_unlock(&base->lock); return next; } @@ -1416,7 +1415,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (cpu_is_offline(smp_processor_id())) return expires; + spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); + spin_unlock(&base->lock); + if (time_before_eq(nextevt, basej)) expires = basem; else @@ -1424,6 +1426,37 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return cmp_next_hrtimer_event(basem, expires); } + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + /* + * NOHZ optimization. After a long idle sleep we need to forward the + * base to current jiffies. Avoid a loop by searching the bitfield for + * the next expiring timer. + */ + if ((long)(jiffies - base->clk) > 2) { + unsigned long next = __next_timer_interrupt(base); + + /* + * If the next timer is ahead of time forward to current + * jiffies, otherwise forward to the next expiry time. + */ + if (time_after(next, jiffies)) { + /* The call site will increment clock! */ + base->clk = jiffies - 1; + return 0; + } + base->clk = next; + } + return __collect_expired_timers(base, heads); +} +#else +static inline int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + return __collect_expired_timers(base, heads); +} #endif /* -- cgit v1.3-8-gc7d7 From ff00673292bd42a3688b33de47252a6a3c3f424c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:35 +0000 Subject: timers/nohz: Remove pointless tick_nohz_kick_tick() function This was a failed attempt to optimize the timer expiry in idle, which was disabled and never revisited. Remove the cruft. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.431073782@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..69abc7bfe80f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1092,35 +1092,6 @@ static void tick_nohz_switch_to_nohz(void) tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } -/* - * When NOHZ is enabled and the tick is stopped, we need to kick the - * tick timer from irq_enter() so that the jiffies update is kept - * alive during long running softirqs. That's ugly as hell, but - * correctness is key even if we need to fix the offending softirq in - * the first place. - * - * Note, this is different to tick_nohz_restart. We just kick the - * timer and do not touch the other magic bits which need to be done - * when idle is left. - */ -static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) -{ -#if 0 - /* Switch back to 2.6.27 behaviour */ - ktime_t delta; - - /* - * Do not touch the tick device, when the next expiry is either - * already reached or less/equal than the tick period. - */ - delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); - if (delta.tv64 <= tick_period.tv64) - return; - - tick_nohz_restart(ts, now); -#endif -} - static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -1131,10 +1102,8 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { + if (ts->tick_stopped) tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(ts, now); - } } #else -- cgit v1.3-8-gc7d7 From a683f390b93f4d1292f849fc48d28e322046120f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:36 +0000 Subject: timers: Forward the wheel clock whenever possible The wheel clock is stale when a CPU goes into a long idle sleep. This has the side effect that timers which are queued end up in the outer wheel levels. That results in coarser granularity. To solve this, we keep track of the idle state and forward the wheel clock whenever possible. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.512039360@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 1 + kernel/time/tick-sched.c | 12 +++++ kernel/time/timer.c | 128 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 120 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 966a5a6fdd0a..f738251000fe 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { } DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +void timer_clear_idle(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69abc7bfe80f..5d81f9aa30d2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { tick.tv64 = 0; + + /* + * Tell the timer code that the base is not idle, i.e. undo + * the effect of get_next_timer_interrupt(): + */ + timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. @@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) tick_do_update_jiffies64(now); cpu_load_update_nohz_stop(); + /* + * Clear the timer idle flag, so we avoid IPIs on remote queueing and + * the clock forward checks in the enqueue path: + */ + timer_clear_idle(); + calc_load_exit_idle(); touch_softlockup_watchdog_sched(); /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 658051c97a3c..9339d71ee998 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -196,9 +196,11 @@ struct timer_base { spinlock_t lock; struct timer_list *running_timer; unsigned long clk; + unsigned long next_expiry; unsigned int cpu; bool migration_enabled; bool nohz_active; + bool is_idle; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -519,24 +521,37 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer { __internal_add_timer(base, timer); + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. We are - * protected against the other CPU fiddling with the timer by - * holding the timer base lock. This also makes sure that a - * CPU on the way to stop its tick can not evaluate the timer - * wheel. - * - * Spare the IPI for deferrable timers on idle targets though. - * The next busy ticks will take care of it. Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. + * TODO: This wants some optimizing similar to the code below, but we + * will do that when we switch from push to pull for deferrable timers. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { - if (!(timer->flags & TIMER_DEFERRABLE) || - tick_nohz_full_cpu(base->cpu)) + if (timer->flags & TIMER_DEFERRABLE) { + if (tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); + return; } + + /* + * We might have to IPI the remote CPU if the base is idle and the + * timer is not deferrable. If the other CPU is on the way to idle + * then it can't set base->is_idle as we hold the base lock: + */ + if (!base->is_idle) + return; + + /* Check whether this is the new first expiring timer: */ + if (time_after_eq(timer->expires, base->next_expiry)) + return; + + /* + * Set the next expiry time and kick the CPU so it can reevaluate the + * wheel: + */ + base->next_expiry = timer->expires; + wake_up_nohz_cpu(base->cpu); } #ifdef CONFIG_TIMER_STATS @@ -844,10 +859,11 @@ static inline struct timer_base *get_timer_base(u32 tflags) return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } -static inline struct timer_base *get_target_base(struct timer_base *base, - unsigned tflags) +#ifdef CONFIG_NO_HZ_COMMON +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) { -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) return get_timer_this_cpu_base(tflags); return get_timer_cpu_base(tflags, get_nohz_timer_target()); @@ -856,6 +872,43 @@ static inline struct timer_base *get_target_base(struct timer_base *base, #endif } +static inline void forward_timer_base(struct timer_base *base) +{ + /* + * We only forward the base when it's idle and we have a delta between + * base clock and jiffies. + */ + if (!base->is_idle || (long) (jiffies - base->clk) < 2) + return; + + /* + * If the next expiry value is > jiffies, then we fast forward to + * jiffies otherwise we forward to the next expiry value. + */ + if (time_after(base->next_expiry, jiffies)) + base->clk = jiffies; + else + base->clk = base->next_expiry; +} +#else +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ + return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) { } +#endif + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ + struct timer_base *target = __get_target_base(base, tflags); + + forward_timer_base(target); + return target; +} + /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself @@ -1417,16 +1470,49 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); - spin_unlock(&base->lock); + base->next_expiry = nextevt; + /* + * We have a fresh next event. Check whether we can forward the base: + */ + if (time_after(nextevt, jiffies)) + base->clk = jiffies; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; - if (time_before_eq(nextevt, basej)) + if (time_before_eq(nextevt, basej)) { expires = basem; - else + base->is_idle = false; + } else { expires = basem + (nextevt - basej) * TICK_NSEC; + /* + * If we expect to sleep more than a tick, mark the base idle: + */ + if ((expires - basem) > TICK_NSEC) + base->is_idle = true; + } + spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); } +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * We do this unlocked. The worst outcome is a remote enqueue sending + * a pointless IPI, but taking the lock would just make the window for + * sending the IPI a few instructions smaller for the cost of taking + * the lock in the exit from idle path. + */ + base->is_idle = false; +} + static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { @@ -1440,7 +1526,7 @@ static int collect_expired_timers(struct timer_base *base, /* * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time. + * jiffies, otherwise forward to the next expiry time: */ if (time_after(next, jiffies)) { /* The call site will increment clock! */ -- cgit v1.3-8-gc7d7 From 4e85876a9d2a977b4a07389da8c07edf76d10825 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Jul 2016 09:50:37 +0000 Subject: timers: Only wake softirq if necessary With the wheel forwading in place and with the HZ=1000 4ms folding we can avoid running the softirq at all. Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.607650550@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9339d71ee998..8d830f1f6a6a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1608,7 +1608,18 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + hrtimer_run_queues(); + /* Raise the softirq only if required. */ + if (time_before(jiffies, base->clk)) { + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + /* CPU is awake, so check the deferrable base. */ + base++; + if (time_before(jiffies, base->clk)) + return; + } raise_softirq(TIMER_SOFTIRQ); } -- cgit v1.3-8-gc7d7 From ffdf047728f8f93df896b58049c7513856027141 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:39 +0000 Subject: timers: Split out index calculation For further optimizations we need to seperate index calculation from queueing. No functional change. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.691159619@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d830f1f6a6a..8d7c23e55c85 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -471,12 +471,9 @@ static inline unsigned calc_index(unsigned expires, unsigned lvl) return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) +static int calc_wheel_index(unsigned long expires, unsigned long clk) { - unsigned long expires = timer->expires; - unsigned long delta = expires - base->clk; - struct hlist_head *vec; + unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { @@ -496,7 +493,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { idx = calc_index(expires, 7); } else if ((long) delta < 0) { - idx = base->clk & LVL_MASK; + idx = clk & LVL_MASK; } else { /* * Force expire obscene large timeouts to expire at the @@ -507,20 +504,33 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) idx = calc_index(expires, LVL_DEPTH - 1); } - /* - * Enqueue the timer into the array bucket, mark it pending in - * the bitmap and store the index in the timer flags. - */ - vec = base->vectors + idx; - hlist_add_head(&timer->entry, vec); + return idx; +} + +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx) +{ + hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); } -static void internal_add_timer(struct timer_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk); + enqueue_timer(base, timer, idx); +} +static void +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +{ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) return; @@ -551,7 +561,14 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer * wheel: */ base->next_expiry = timer->expires; - wake_up_nohz_cpu(base->cpu); + wake_up_nohz_cpu(base->cpu); +} + +static void +internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + trigger_dyntick_cpu(base, timer); } #ifdef CONFIG_TIMER_STATS -- cgit v1.3-8-gc7d7 From f00c0afdfa625165a609513bc74164d56752ec3e Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 4 Jul 2016 09:50:40 +0000 Subject: timers: Implement optimization for same expiry time in mod_timer() The existing optimization for same expiry time in mod_timer() checks whether the timer expiry time is the same as the new requested expiry time. In the old timer wheel implementation this does not take the slack batching into account, neither does the new implementation evaluate whether the new expiry time will requeue the timer to the same bucket. To optimize that, we can calculate the resulting bucket and check if the new expiry time is different from the current expiry time. This calculation happens outside the base lock held region. If the resulting bucket is the same we can avoid taking the base lock and requeueing the timer. If the timer needs to be requeued then we have to check under the base lock whether the base time has changed between the lockless calculation and taking the lock. If it has changed we need to recalculate under the lock. This optimization takes effect for timers which are enqueued into the less granular wheel levels (1 and above). With a simple test case the functionality has been verified: Before After Match: 5.5% 86.6% Requeue: 94.5% 13.4% Recalc: <0.01% In the non optimized case the timer is requeued in 94.5% of the cases. With the index optimization in place the requeue rate drops to 13.4%. The case where the lockless index calculation has to be redone is less than 0.01%. With a real world test case (networking) we observed the following changes: Before After Match: 97.8% 99.7% Requeue: 2.2% 0.3% Recalc: <0.001% That means two percent fewer lock/requeue/unlock operations done in one of the hot path use cases of timers. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven Cc: Chris Mason Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: George Spelvin Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160704094342.778527749@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 51 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d7c23e55c85..8f29abeb8c4d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -960,28 +960,36 @@ static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct timer_base *base, *new_base; - unsigned long flags; + unsigned int idx = UINT_MAX; + unsigned long clk = 0, flags; int ret = 0; /* - * TODO: Calculate the array bucket of the timer right here w/o - * holding the base lock. This allows to check not only - * timer->expires == expires below, but also whether the timer - * ends up in the same bucket. If we really need to requeue - * the timer then we check whether base->clk have - * advanced between here and locking the timer base. If - * jiffies advanced we have to recalc the array bucket with the - * lock held. - */ - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: + * This is a common optimization triggered by the networking code - if + * the timer is re-modified to have the same timeout or ends up in the + * same array bucket then just return: */ if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* + * Take the current timer_jiffies of base, but without holding + * the lock! + */ + base = get_timer_base(timer->flags); + clk = base->clk; + + idx = calc_wheel_index(expires, clk); + + /* + * Retrieve and compare the array index of the pending + * timer. If it matches set the expiry to the new value so a + * subsequent call will exit in the expires check above. + */ + if (idx == timer_get_idx(timer)) { + timer->expires = expires; + return 1; + } } timer_stats_timer_set_start_info(timer); @@ -1018,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } timer->expires = expires; - internal_add_timer(base, timer); + /* + * If 'idx' was calculated above and the base time did not advance + * between calculating 'idx' and taking the lock, only enqueue_timer() + * and trigger_dyntick_cpu() is required. Otherwise we need to + * (re)calculate the wheel index via internal_add_timer(). + */ + if (idx != UINT_MAX && clk == base->clk) { + enqueue_timer(base, timer, idx); + trigger_dyntick_cpu(base, timer); + } else { + internal_add_timer(base, timer); + } out_unlock: spin_unlock_irqrestore(&base->lock, flags); -- cgit v1.3-8-gc7d7 From 606274c5abd8e245add01bc7145a8cbb92b69ba8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Jul 2016 22:38:36 -0700 Subject: bpf: introduce bpf_get_current_task() helper over time there were multiple requests to access different data structures and fields of task_struct current, so finally add the helper to access 'current' as-is. Tracing bpf programs will do the rest of walking the pointers via bpf_probe_read(). Note that current can be null and bpf program has to deal it with, but even dumb passing null into bpf_probe_read() is still safe. Suggested-by: Brendan Gregg Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ kernel/trace/bpf_trace.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c14ca1cd6297..262a7e883b19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -357,6 +357,13 @@ enum bpf_func_id { */ BPF_FUNC_get_hash_recalc, + /** + * u64 bpf_get_current_task(void) + * Returns current task_struct + * Return: current + */ + BPF_FUNC_get_current_task, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19c5b4a5c3eb..094c716154ed 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -312,6 +312,17 @@ const struct bpf_func_proto *bpf_get_event_output_proto(void) return &bpf_event_output_proto; } +static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return (long) current; +} + +static const struct bpf_func_proto bpf_get_current_task_proto = { + .func = bpf_get_current_task, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -329,6 +340,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: -- cgit v1.3-8-gc7d7 From 9acacc2ac525ef1397af63b15cef7bb77a823c06 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:18 +0800 Subject: sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums These two types have similar function, no need to separate them. Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/436748885270d64363c7dc67167507d486c2057a.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0938..74241eb5f3ff 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,15 +25,13 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; -enum cpuacct_usage_index { - CPUACCT_USAGE_USER, /* ... user mode */ - CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ - - CPUACCT_USAGE_NRUSAGE, +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", }; struct cpuacct_usage { - u64 usages[CPUACCT_USAGE_NRUSAGE]; + u64 usages[CPUACCT_STAT_NSTATS]; }; /* track cpu usage of a group of tasks and its child groups */ @@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) } static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; /* - * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * We allow index == CPUACCT_STAT_NSTATS here to read * the sum of suages. */ - BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + BUG_ON(index > CPUACCT_STAT_NSTATS); #ifndef CONFIG_64BIT /* @@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - if (index == CPUACCT_USAGE_NRUSAGE) { + if (index == CPUACCT_STAT_NSTATS) { int i = 0; data = 0; - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) data += cpuusage->usages[i]; } else { data = cpuusage->usages[index]; @@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) cpuusage->usages[i] = val; #ifndef CONFIG_64BIT @@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* return total cpu usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; @@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 cpuusage_user_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_USER); + return __cpuusage_read(css, CPUACCT_STAT_USER); } static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); + return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); } static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); + return __cpuusage_read(css, CPUACCT_STAT_NSTATS); } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, @@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, } static int __cpuacct_percpu_seq_show(struct seq_file *m, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; @@ -229,24 +227,19 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); } static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); } static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } -static const char * const cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); @@ -316,11 +309,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_USAGE_SYSTEM; + int index = CPUACCT_STAT_SYSTEM; struct pt_regs *regs = task_pt_regs(tsk); if (regs && user_mode(regs)) - index = CPUACCT_USAGE_USER; + index = CPUACCT_STAT_USER; rcu_read_lock(); -- cgit v1.3-8-gc7d7 From 8e546bfafb3121ed25c73a0c02311ec58459344a Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:19 +0800 Subject: sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show() In cpuacct_stats_show() we currently we have copies of similar code, for each cpustat(system/user) variant. Use a loop instead to consolidate the code. This will also work better if we extend the CPUACCT_STAT_NSTATS type. Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b0597d4224655e9f333f1a6224ed9654c7d7d36a.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 74241eb5f3ff..677cd1ab33b7 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -243,27 +243,26 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); + s64 val[CPUACCT_STAT_NSTATS]; int cpu; - s64 val = 0; + int stat; + memset(val, 0, sizeof(val)); for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val = 0; - for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { + seq_printf(sf, "%s %lld\n", + cpuacct_stat_desc[stat], + cputime64_to_clock_t(val[stat])); + } return 0; } -- cgit v1.3-8-gc7d7 From 277a13e4f0d661678a7084bf97ed96a99c7dac21 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:20 +0800 Subject: sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats together In current code, we can get cpuacct data from several files, but each file has various limitations. For example: - We can get CPU usage in user and kernel mode via cpuacct.stat, but we can't get detailed data about each CPU. - We can get each CPU's kernel mode usage in cpuacct.usage_percpu_sys, but we can't get user mode usage data at the same time. This patch introduces cpuacct.usage_all, to show all detailed CPU accounting data together: # cat cpuacct.usage_all cpu user system 0 3809760299 5807968992 1 3250329855 454612211 .. Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/7744460969edd7caaf0e903592ee52353ed9bdd6.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 677cd1ab33b7..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -240,6 +240,42 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } +static int cpuacct_all_seq_show(struct seq_file *m, void *V) +{ + struct cpuacct *ca = css_ca(seq_css(m)); + int index; + int cpu; + + seq_puts(m, "cpu"); + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %s", cpuacct_stat_desc[index]); + seq_puts(m, "\n"); + + for_each_possible_cpu(cpu) { + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + + seq_printf(m, "%d", cpu); + + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit + * platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); +#endif + + seq_printf(m, " %llu", cpuusage->usages[index]); + +#ifndef CONFIG_64BIT + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#endif + } + seq_puts(m, "\n"); + } + return 0; +} + static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); @@ -293,6 +329,10 @@ static struct cftype files[] = { .name = "usage_percpu_sys", .seq_show = cpuacct_percpu_sys_seq_show, }, + { + .name = "usage_all", + .seq_show = cpuacct_all_seq_show, + }, { .name = "stat", .seq_show = cpuacct_stats_show, -- cgit v1.3-8-gc7d7 From 2f88e41a22ccfa95291c4df573f8ed4c6a71f29b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Jul 2016 02:40:56 +0200 Subject: PM / hibernate: Add missing braces in hibernate_setup() Make hibernate_setup() follow the coding style more closely by adding some missing braces to the if () statement in it. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 126e24caa82e..b00f270d328e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1119,11 +1119,11 @@ static int __init resume_offset_setup(char *str) static int __init hibernate_setup(char *str) { - if (!strncmp(str, "noresume", 8)) + if (!strncmp(str, "noresume", 8)) { noresume = 1; - else if (!strncmp(str, "nocompress", 10)) + } else if (!strncmp(str, "nocompress", 10)) { nocompress = 1; - else if (!strncmp(str, "no", 2)) { + } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; } -- cgit v1.3-8-gc7d7 From efd5a85242e996275ebf3df71013beabd723bda3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Jul 2016 23:42:46 +0200 Subject: PM / hibernate: Clean up function headers in snapshot.c The formatting of some function headers in kernel/power/snapshot.c is not consistent with the general kernel coding style and with the formatting of some other function headers in the same file. Make all of them follow the same formatting convention. No functional changes. Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 93 ++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 94b6fe6c9ae3..1fe0ddb6fd0d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -186,8 +186,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free) __free_page(page); } -static inline void -free_list_of_pages(struct linked_page *list, int clear_page_nosave) +static inline void free_list_of_pages(struct linked_page *list, + int clear_page_nosave) { while (list) { struct linked_page *lp = list->next; @@ -219,8 +219,8 @@ struct chain_allocator { int safe_needed; /* if set, only "safe" pages are allocated */ }; -static void -chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) +static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask, + int safe_needed) { ca->chain = NULL; ca->used_space = LINKED_PAGE_DATA_SIZE; @@ -452,10 +452,11 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, * This function also allocated and builds the radix tree for the * zone. */ -static struct mem_zone_bm_rtree * -create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, - struct chain_allocator *ca, - unsigned long start, unsigned long end) +static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, + int safe_needed, + struct chain_allocator *ca, + unsigned long start, + unsigned long end) { struct mem_zone_bm_rtree *zone; unsigned int i, nr_blocks; @@ -595,8 +596,8 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) /** * memory_bm_create - allocate memory for a memory bitmap */ -static int -memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) +static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, + int safe_needed) { struct chain_allocator ca; struct list_head mem_extents; @@ -894,9 +895,8 @@ static void memory_bm_recycle(struct memory_bitmap *bm) * initialization code) */ -void __init -__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, - int use_kmalloc) +void __init __register_nosave_region(unsigned long start_pfn, + unsigned long end_pfn, int use_kmalloc) { struct nosave_region *region; @@ -1277,8 +1277,7 @@ static void safe_copy_page(void *dst, struct page *s_page) #ifdef CONFIG_HIGHMEM -static inline struct page * -page_is_saveable(struct zone *zone, unsigned long pfn) +static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); @@ -1321,8 +1320,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } #endif /* CONFIG_HIGHMEM */ -static void -copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) +static void copy_data_pages(struct memory_bitmap *copy_bm, + struct memory_bitmap *orig_bm) { struct zone *zone; unsigned long pfn; @@ -1485,8 +1484,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) } static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) + unsigned long highmem, + unsigned long total) { unsigned long alloc = __fraction(nr_pages, highmem, total); @@ -1499,8 +1498,8 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) } static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) + unsigned long highmem, + unsigned long total) { return 0; } @@ -1780,8 +1779,7 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) return nr_highmem; } #else -static unsigned int -count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; } #endif /* CONFIG_HIGHMEM */ /** @@ -1823,8 +1821,8 @@ static inline int get_highmem_buffer(int safe_needed) * highmem pages is lesser than that, allocate them all. */ -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int nr_highmem) { unsigned int to_alloc = count_free_highmem_pages(); @@ -1843,8 +1841,8 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) #else static inline int get_highmem_buffer(int safe_needed) { return 0; } -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int n) { return 0; } #endif /* CONFIG_HIGHMEM */ /** @@ -1859,9 +1857,9 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } * copy_data_pages() works. */ -static int -swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, - unsigned int nr_pages, unsigned int nr_highmem) +static int swsusp_alloc(struct memory_bitmap *orig_bm, + struct memory_bitmap *copy_bm, + unsigned int nr_pages, unsigned int nr_highmem) { if (nr_highmem > 0) { if (get_highmem_buffer(PG_ANY)) @@ -1978,8 +1976,7 @@ static int init_header(struct swsusp_info *info) * are stored in the array @buf[] (1 page at a time) */ -static inline void -pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -2110,8 +2107,7 @@ static int check_header(struct swsusp_info *info) * load header - check the image header and copy data from it */ -static int -load_header(struct swsusp_info *info) +static int load_header(struct swsusp_info *info) { int error; @@ -2204,8 +2200,8 @@ static unsigned int safe_highmem_pages; static struct memory_bitmap *safe_highmem_bm; -static int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +static int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { unsigned int to_alloc; @@ -2259,8 +2255,8 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) static struct page *last_highmem_page; -static void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) { struct highmem_pbe *pbe; void *kaddr; @@ -2333,17 +2329,13 @@ static inline void free_highmem_data(void) free_image_page(buffer, PG_UNSAFE_CLEAR); } #else -static unsigned int -count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } -static inline int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ - return 0; -} +static inline int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { return 0; } -static inline void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static inline void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) { return ERR_PTR(-EINVAL); } @@ -2369,8 +2361,7 @@ static inline void free_highmem_data(void) {} #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) -static int -prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) { unsigned int nr_pages, nr_highmem; struct linked_page *lp; @@ -2593,8 +2584,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle) #ifdef CONFIG_HIGHMEM /* Assumes that @buf is ready and points to a "safe" page */ -static inline void -swap_two_pages_data(struct page *p1, struct page *p2, void *buf) +static inline void swap_two_pages_data(struct page *p1, struct page *p2, + void *buf) { void *kaddr1, *kaddr2; -- cgit v1.3-8-gc7d7 From ef96f639ea663474c4e1c57bd64e118ffbb92be4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Jul 2016 23:43:46 +0200 Subject: PM / hibernate: Clean up comments in snapshot.c Many comments in kernel/power/snapshot.c do not follow the general comment formatting rules. They look odd, some of them are outdated too, some are hard to parse and generally difficult to understand. Clean them up to make them easier to comprehend. No functional changes. Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 636 +++++++++++++++++++++++++----------------------- 1 file changed, 330 insertions(+), 306 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1fe0ddb6fd0d..bd927d9efeb7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -67,7 +67,8 @@ void __init hibernate_image_size_init(void) image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; } -/* List of PBEs needed for restoring the pages that were allocated before +/* + * List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written * directly to their "original" page frames. @@ -93,16 +94,6 @@ static struct linked_page *safe_pages_list; /* Pointer to an auxiliary buffer (1 page) */ static void *buffer; -/** - * @safe_needed - on resume, for storing the PBE list and the image, - * we can only use memory pages that do not conflict with the pages - * used before suspend. The unsafe pages have PageNosaveFree set - * and we count them using unsafe_pages. - * - * Each allocated image page is marked as PageNosave and PageNosaveFree - * so that swsusp_free() can release it. - */ - #define PG_ANY 0 #define PG_SAFE 1 #define PG_UNSAFE_CLEAR 1 @@ -110,6 +101,19 @@ static void *buffer; static unsigned int allocated_unsafe_pages; +/** + * get_image_page - Allocate a page for a hibernation image. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages that were not used before hibernation (restore only) + * + * During image restoration, for storing the PBE list and the image data, we can + * only use memory pages that do not conflict with the pages used before + * hibernation. The "unsafe" pages have PageNosaveFree set and we count them + * using allocated_unsafe_pages. + * + * Each allocated image page is marked as PageNosave and PageNosaveFree so that + * swsusp_free() can release it. + */ static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; @@ -167,10 +171,13 @@ static void recycle_safe_page(void *page_address) } /** - * free_image_page - free page represented by @addr, allocated with - * get_image_page (page flags set by it must be cleared) + * free_image_page - Free a page allocated for hibernation image. + * @addr: Address of the page to free. + * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page. + * + * The page to free should have been allocated by get_image_page() (page flags + * set by it are affected). */ - static inline void free_image_page(void *addr, int clear_nosave_free) { struct page *page; @@ -197,24 +204,22 @@ static inline void free_list_of_pages(struct linked_page *list, } } -/** - * struct chain_allocator is used for allocating small objects out of - * a linked list of pages called 'the chain'. - * - * The chain grows each time when there is no room for a new object in - * the current page. The allocated objects cannot be freed individually. - * It is only possible to free them all at once, by freeing the entire - * chain. - * - * NOTE: The chain allocator may be inefficient if the allocated objects - * are not much smaller than PAGE_SIZE. - */ - +/* + * struct chain_allocator is used for allocating small objects out of + * a linked list of pages called 'the chain'. + * + * The chain grows each time when there is no room for a new object in + * the current page. The allocated objects cannot be freed individually. + * It is only possible to free them all at once, by freeing the entire + * chain. + * + * NOTE: The chain allocator may be inefficient if the allocated objects + * are not much smaller than PAGE_SIZE. + */ struct chain_allocator { struct linked_page *chain; /* the chain */ unsigned int used_space; /* total size of objects allocated out - * of the current page - */ + of the current page */ gfp_t gfp_mask; /* mask for allocating pages */ int safe_needed; /* if set, only "safe" pages are allocated */ }; @@ -250,44 +255,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) } /** - * Data types related to memory bitmaps. + * Data types related to memory bitmaps. * - * Memory bitmap is a structure consiting of many linked lists of - * objects. The main list's elements are of type struct zone_bitmap - * and each of them corresonds to one zone. For each zone bitmap - * object there is a list of objects of type struct bm_block that - * represent each blocks of bitmap in which information is stored. + * Memory bitmap is a structure consiting of many linked lists of + * objects. The main list's elements are of type struct zone_bitmap + * and each of them corresonds to one zone. For each zone bitmap + * object there is a list of objects of type struct bm_block that + * represent each blocks of bitmap in which information is stored. * - * struct memory_bitmap contains a pointer to the main list of zone - * bitmap objects, a struct bm_position used for browsing the bitmap, - * and a pointer to the list of pages used for allocating all of the - * zone bitmap objects and bitmap block objects. + * struct memory_bitmap contains a pointer to the main list of zone + * bitmap objects, a struct bm_position used for browsing the bitmap, + * and a pointer to the list of pages used for allocating all of the + * zone bitmap objects and bitmap block objects. * - * NOTE: It has to be possible to lay out the bitmap in memory - * using only allocations of order 0. Additionally, the bitmap is - * designed to work with arbitrary number of zones (this is over the - * top for now, but let's avoid making unnecessary assumptions ;-). + * NOTE: It has to be possible to lay out the bitmap in memory + * using only allocations of order 0. Additionally, the bitmap is + * designed to work with arbitrary number of zones (this is over the + * top for now, but let's avoid making unnecessary assumptions ;-). * - * struct zone_bitmap contains a pointer to a list of bitmap block - * objects and a pointer to the bitmap block object that has been - * most recently used for setting bits. Additionally, it contains the - * pfns that correspond to the start and end of the represented zone. + * struct zone_bitmap contains a pointer to a list of bitmap block + * objects and a pointer to the bitmap block object that has been + * most recently used for setting bits. Additionally, it contains the + * PFNs that correspond to the start and end of the represented zone. * - * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bitmap) - * It also contains the pfns that correspond to the start and end of - * the represented memory area. + * struct bm_block contains a pointer to the memory page in which + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. * - * The memory bitmap is organized as a radix tree to guarantee fast random - * access to the bits. There is one radix tree for each zone (as returned - * from create_mem_extents). + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. There is one radix tree for each zone (as returned + * from create_mem_extents). * - * One radix tree is represented by one struct mem_zone_bm_rtree. There are - * two linked lists for the nodes of the tree, one for the inner nodes and - * one for the leave nodes. The linked leave nodes are used for fast linear - * access of the memory bitmap. + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap. * - * The struct rtree_node represents one node of the radix tree. + * The struct rtree_node represents one node of the radix tree. */ #define BM_END_OF_MAP (~0UL) @@ -333,9 +338,8 @@ struct bm_position { struct memory_bitmap { struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone - * bitmap objects and bitmap block - * objects - */ + bitmap objects and bitmap block + objects */ struct bm_position cur; /* most recently used bit position */ }; @@ -349,12 +353,12 @@ struct memory_bitmap { #endif #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) -/* - * alloc_rtree_node - Allocate a new node and add it to the radix tree. +/** + * alloc_rtree_node - Allocate a new node and add it to the radix tree. * - * This function is used to allocate inner nodes as well as the - * leave nodes of the radix tree. It also adds the node to the - * corresponding linked list passed in by the *list parameter. + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. */ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, struct chain_allocator *ca, @@ -375,12 +379,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, return node; } -/* - * add_rtree_block - Add a new leave node to the radix tree +/** + * add_rtree_block - Add a new leave node to the radix tree. * - * The leave nodes need to be allocated in order to keep the leaves - * linked list in order. This is guaranteed by the zone->blocks - * counter. + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. */ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, int safe_needed, struct chain_allocator *ca) @@ -445,12 +449,12 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, int clear_nosave_free); -/* - * create_zone_bm_rtree - create a radix tree for one zone +/** + * create_zone_bm_rtree - Create a radix tree for one zone. * - * Allocated the mem_zone_bm_rtree structure and initializes it. - * This function also allocated and builds the radix tree for the - * zone. + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. */ static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, @@ -483,12 +487,12 @@ static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, return zone; } -/* - * free_zone_bm_rtree - Free the memory of the radix tree +/** + * free_zone_bm_rtree - Free the memory of the radix tree. * - * Free all node pages of the radix tree. The mem_zone_bm_rtree - * structure itself is not freed here nor are the rtree_node - * structs. + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. */ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, int clear_nosave_free) @@ -521,8 +525,8 @@ struct mem_extent { }; /** - * free_mem_extents - free a list of memory extents - * @list - list of extents to empty + * free_mem_extents - Free a list of memory extents. + * @list: List of extents to free. */ static void free_mem_extents(struct list_head *list) { @@ -535,10 +539,11 @@ static void free_mem_extents(struct list_head *list) } /** - * create_mem_extents - create a list of memory extents representing - * contiguous ranges of PFNs - * @list - list to put the extents into - * @gfp_mask - mask to use for memory allocations + * create_mem_extents - Create a list of memory extents. + * @list: List to put the extents into. + * @gfp_mask: Mask to use for memory allocations. + * + * The extents represent contiguous ranges of PFNs. */ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { @@ -594,8 +599,8 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) } /** - * memory_bm_create - allocate memory for a memory bitmap - */ + * memory_bm_create - Allocate memory for a memory bitmap. + */ static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { @@ -636,8 +641,9 @@ static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, } /** - * memory_bm_free - free memory occupied by the memory bitmap @bm - */ + * memory_bm_free - Free memory occupied by the memory bitmap. + * @bm: Memory bitmap. + */ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { struct mem_zone_bm_rtree *zone; @@ -651,14 +657,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) } /** - * memory_bm_find_bit - Find the bit for pfn in the memory - * bitmap + * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap. * - * Find the bit in the bitmap @bm that corresponds to given pfn. - * The cur.zone, cur.block and cur.node_pfn member of @bm are - * updated. - * It walks the radix tree to find the page which contains the bit for - * pfn and returns the bit position in **addr and *bit_nr. + * Find the bit in memory bitmap @bm that corresponds to the given PFN. + * The cur.zone, cur.block and cur.node_pfn members of @bm are updated. + * + * Walk the radix tree to find the page containing the bit that represents @pfn + * and return the position of the bit in @addr and @bit_nr. */ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) @@ -687,10 +692,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, zone_found: /* - * We have a zone. Now walk the radix tree to find the leave - * node for our pfn. + * We have found the zone. Now walk the radix tree to find the leaf node + * for our PFN. */ - node = bm->cur.node; if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; @@ -783,14 +787,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) } /* - * rtree_next_node - Jumps to the next leave node + * rtree_next_node - Jump to the next leaf node. * - * Sets the position to the beginning of the next node in the - * memory bitmap. This is either the next node in the current - * zone's radix tree or the first node in the radix tree of the - * next zone. + * Set the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. * - * Returns true if there is a next node, false otherwise. + * Return true if there is a next node, false otherwise. */ static bool rtree_next_node(struct memory_bitmap *bm) { @@ -819,14 +823,15 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /** - * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm + * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * @bm: Memory bitmap. * - * Starting from the last returned position this function searches - * for the next set bit in the memory bitmap and returns its - * number. If no more bit is set BM_END_OF_MAP is returned. + * Starting from the last returned position this function searches for the next + * set bit in @bm and returns the PFN represented by it. If no more bits are + * set, BM_END_OF_MAP is returned. * - * It is required to run memory_bm_position_reset() before the - * first call to this function. + * It is required to run memory_bm_position_reset() before the first call to + * this function for the given memory bitmap. */ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { @@ -848,11 +853,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) return BM_END_OF_MAP; } -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. +/* + * This structure represents a range of page frames the contents of which + * should not be saved during hibernation. */ - struct nosave_region { struct list_head list; unsigned long start_pfn; @@ -890,11 +894,11 @@ static void memory_bm_recycle(struct memory_bitmap *bm) } /** - * register_nosave_region - register a range of page frames the contents - * of which should not be saved during the suspend (to be used in the early - * initialization code) + * register_nosave_region - Register a region of unsaveable memory. + * + * Register a range of page frames the contents of which should not be saved + * during hibernation (to be used in the early initialization code). */ - void __init __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, int use_kmalloc) { @@ -913,7 +917,7 @@ void __init __register_nosave_region(unsigned long start_pfn, } } if (use_kmalloc) { - /* during init, this shouldn't fail */ + /* During init, this shouldn't fail */ region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); BUG_ON(!region); } else @@ -979,10 +983,12 @@ static void swsusp_unset_page_forbidden(struct page *page) } /** - * mark_nosave_pages - set bits corresponding to the page frames the - * contents of which should not be saved in a given bitmap. + * mark_nosave_pages - Mark pages that should not be saved. + * @bm: Memory bitmap. + * + * Set the bits in @bm that correspond to the page frames the contents of which + * should not be saved. */ - static void mark_nosave_pages(struct memory_bitmap *bm) { struct nosave_region *region; @@ -1012,13 +1018,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } /** - * create_basic_memory_bitmaps - create bitmaps needed for marking page - * frames that should not be saved and free page frames. The pointers - * forbidden_pages_map and free_pages_map are only modified if everything - * goes well, because we don't want the bits to be used before both bitmaps - * are set up. + * create_basic_memory_bitmaps - Create bitmaps to hold basic page information. + * + * Create bitmaps needed for marking page frames that should not be saved and + * free page frames. The forbidden_pages_map and free_pages_map pointers are + * only modified if everything goes well, because we don't want the bits to be + * touched before both bitmaps are set up. */ - int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; @@ -1063,12 +1069,12 @@ int create_basic_memory_bitmaps(void) } /** - * free_basic_memory_bitmaps - free memory bitmaps allocated by - * create_basic_memory_bitmaps(). The auxiliary pointers are necessary - * so that the bitmaps themselves are not referred to while they are being - * freed. + * free_basic_memory_bitmaps - Free memory bitmaps holding basic information. + * + * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The + * auxiliary pointers are necessary so that the bitmaps themselves are not + * referred to while they are being freed. */ - void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; @@ -1089,11 +1095,13 @@ void free_basic_memory_bitmaps(void) } /** - * snapshot_additional_pages - estimate the number of additional pages - * be needed for setting up the suspend image data structures for given - * zone (usually the returned value is greater than the exact number) + * snapshot_additional_pages - Estimate the number of extra pages needed. + * @zone: Memory zone to carry out the computation for. + * + * Estimate the number of additional pages needed for setting up a hibernation + * image data structures for @zone (usually, the returned value is greater than + * the exact number). */ - unsigned int snapshot_additional_pages(struct zone *zone) { unsigned int rtree, nodes; @@ -1111,10 +1119,10 @@ unsigned int snapshot_additional_pages(struct zone *zone) #ifdef CONFIG_HIGHMEM /** - * count_free_highmem_pages - compute the total number of free highmem - * pages, system-wide. + * count_free_highmem_pages - Compute the total number of free highmem pages. + * + * The returned number is system-wide. */ - static unsigned int count_free_highmem_pages(void) { struct zone *zone; @@ -1128,11 +1136,12 @@ static unsigned int count_free_highmem_pages(void) } /** - * saveable_highmem_page - Determine whether a highmem page should be - * included in the suspend image. + * saveable_highmem_page - Check if a highmem page is saveable. + * + * Determine whether a highmem page should be included in a hibernation image. * - * We should save the page if it isn't Nosave or NosaveFree, or Reserved, - * and it isn't a part of a free chunk of pages. + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't part of a free chunk of pages. */ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { @@ -1158,10 +1167,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) } /** - * count_highmem_pages - compute the total number of saveable highmem - * pages. + * count_highmem_pages - Compute the total number of saveable highmem pages. */ - static unsigned int count_highmem_pages(void) { struct zone *zone; @@ -1189,12 +1196,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p) #endif /* CONFIG_HIGHMEM */ /** - * saveable_page - Determine whether a non-highmem page should be included - * in the suspend image. + * saveable_page - Check if the given page is saveable. + * + * Determine whether a non-highmem page should be included in a hibernation + * image. * - * We should save the page if it isn't Nosave, and is not in the range - * of pages statically defined as 'unsaveable', and it isn't a part of - * a free chunk of pages. + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't part of + * a free chunk of pages. */ static struct page *saveable_page(struct zone *zone, unsigned long pfn) { @@ -1223,10 +1232,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) } /** - * count_data_pages - compute the total number of saveable non-highmem - * pages. + * count_data_pages - Compute the total number of saveable non-highmem pages. */ - static unsigned int count_data_pages(void) { struct zone *zone; @@ -1246,7 +1253,8 @@ static unsigned int count_data_pages(void) return n; } -/* This is needed, because copy_page and memcpy are not usable for copying +/* + * This is needed, because copy_page and memcpy are not usable for copying * task structs. */ static inline void do_copy_page(long *dst, long *src) @@ -1257,12 +1265,12 @@ static inline void do_copy_page(long *dst, long *src) *dst++ = *src++; } - /** - * safe_copy_page - check if the page we are going to copy is marked as - * present in the kernel page tables (this always is the case if - * CONFIG_DEBUG_PAGEALLOC is not set and in that case - * kernel_page_present() always returns 'true'). + * safe_copy_page - Copy a page in a safe way. + * + * Check if the page we are going to copy is marked as present in the kernel + * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set + * and in that case kernel_page_present() always returns 'true'). */ static void safe_copy_page(void *dst, struct page *s_page) { @@ -1275,7 +1283,6 @@ static void safe_copy_page(void *dst, struct page *s_page) } } - #ifdef CONFIG_HIGHMEM static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn) { @@ -1298,7 +1305,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) kunmap_atomic(src); } else { if (PageHighMem(d_page)) { - /* Page pointed to by src may contain some kernel + /* + * The page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); @@ -1370,12 +1378,11 @@ static struct memory_bitmap orig_bm; static struct memory_bitmap copy_bm; /** - * swsusp_free - free pages allocated for the suspend. + * swsusp_free - Free pages allocated for hibernation image. * - * Suspend pages are alocated before the atomic copy is made, so we - * need to release them after the resume. + * Image pages are alocated before snapshot creation, so they need to be + * released after resume. */ - void swsusp_free(void) { unsigned long fb_pfn, fr_pfn; @@ -1424,7 +1431,7 @@ out: #define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) /** - * preallocate_image_pages - Allocate a number of pages for hibernation image + * preallocate_image_pages - Allocate a number of pages for hibernation image. * @nr_pages: Number of page frames to allocate. * @mask: GFP flags to use for the allocation. * @@ -1474,7 +1481,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages) } /** - * __fraction - Compute (an approximation of) x * (multiplier / base) + * __fraction - Compute (an approximation of) x * (multiplier / base). */ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) { @@ -1506,7 +1513,7 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, #endif /* CONFIG_HIGHMEM */ /** - * free_unnecessary_pages - Release preallocated pages not needed for the image + * free_unnecessary_pages - Release preallocated pages not needed for the image. */ static unsigned long free_unnecessary_pages(void) { @@ -1560,7 +1567,7 @@ static unsigned long free_unnecessary_pages(void) } /** - * minimum_image_size - Estimate the minimum acceptable size of an image + * minimum_image_size - Estimate the minimum acceptable size of an image. * @saveable: Number of saveable pages in the system. * * We want to avoid attempting to free too much memory too hard, so estimate the @@ -1590,7 +1597,7 @@ static unsigned long minimum_image_size(unsigned long saveable) } /** - * hibernate_preallocate_memory - Preallocate memory for hibernation image + * hibernate_preallocate_memory - Preallocate memory for hibernation image. * * To create a hibernation image it is necessary to make a copy of every page * frame in use. We also need a number of page frames to be free during @@ -1763,10 +1770,11 @@ int hibernate_preallocate_memory(void) #ifdef CONFIG_HIGHMEM /** - * count_pages_for_highmem - compute the number of non-highmem pages - * that will be necessary for creating copies of highmem pages. - */ - + * count_pages_for_highmem - Count non-highmem pages needed for copying highmem. + * + * Compute the number of non-highmem pages that will be necessary for creating + * copies of highmem pages. + */ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; @@ -1783,10 +1791,8 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; #endif /* CONFIG_HIGHMEM */ /** - * enough_free_mem - Make sure we have enough free memory for the - * snapshot image. + * enough_free_mem - Check if there is enough free memory for the image. */ - static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; @@ -1805,10 +1811,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) #ifdef CONFIG_HIGHMEM /** - * get_highmem_buffer - if there are some highmem pages in the suspend - * image, we may need the buffer to copy them and/or load their data. + * get_highmem_buffer - Allocate a buffer for highmem pages. + * + * If there are some highmem pages in the hibernation image, we may need a + * buffer to copy them and/or load their data. */ - static inline int get_highmem_buffer(int safe_needed) { buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); @@ -1816,11 +1823,11 @@ static inline int get_highmem_buffer(int safe_needed) } /** - * alloc_highmem_image_pages - allocate some highmem pages for the image. - * Try to allocate as many pages as needed, but if the number of free - * highmem pages is lesser than that, allocate them all. + * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * + * Try to allocate as many pages as needed, but if the number of free highmem + * pages is less than that, allocate them all. */ - static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) { @@ -1846,17 +1853,16 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, #endif /* CONFIG_HIGHMEM */ /** - * swsusp_alloc - allocate memory for the suspend image + * swsusp_alloc - Allocate memory for hibernation image. * - * We first try to allocate as many highmem pages as there are - * saveable highmem pages in the system. If that fails, we allocate - * non-highmem pages for the copies of the remaining highmem ones. + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system. If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones. * - * In this approach it is likely that the copies of highmem pages will - * also be located in the high memory, because of the way in which - * copy_data_pages() works. + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works. */ - static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) @@ -1909,7 +1915,8 @@ asmlinkage __visible int swsusp_save(void) return -ENOMEM; } - /* During allocating of suspend pagedir, new cold pages may appear. + /* + * During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(NULL); @@ -1972,10 +1979,13 @@ static int init_header(struct swsusp_info *info) } /** - * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm - * are stored in the array @buf[] (1 page at a time) + * pack_pfns - Prepare PFNs for saving. + * @bm: Memory bitmap. + * @buf: Memory buffer to store the PFNs in. + * + * PFNs corresponding to set bits in @bm are stored in the area of memory + * pointed to by @buf (1 page at a time). */ - static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1990,22 +2000,21 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) } /** - * snapshot_read_next - used for reading the system memory snapshot. + * snapshot_read_next - Get the address to read the next image page from. + * @handle: Snapshot handle to be used for the reading. * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. * - * On success the function returns a positive number. Then, the caller - * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. + * On success, the function returns a positive number. Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro. * - * The function returns 0 to indicate the end of data stream condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. + * The function returns 0 to indicate the end of the data stream condition, + * and negative numbers are returned on errors. If that happens, the structure + * pointed to by @handle is not updated and should not be used any more. */ - int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) @@ -2034,7 +2043,8 @@ int snapshot_read_next(struct snapshot_handle *handle) page = pfn_to_page(memory_bm_next_pfn(©_bm)); if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, + /* + * Highmem pages are copied to the buffer, * because we can't return with a kmapped * highmem page (we may not be called again). */ @@ -2066,11 +2076,11 @@ static void duplicate_memory_bitmap(struct memory_bitmap *dst, } /** - * mark_unsafe_pages - mark the pages that cannot be used for storing - * the image during resume, because they conflict with the pages that - * had been used before suspend + * mark_unsafe_pages - Mark pages that were used before hibernation. + * + * Mark the pages that cannot be used for storing the image during restoration, + * because they conflict with the pages that had been used before hibernation. */ - static void mark_unsafe_pages(struct memory_bitmap *bm) { unsigned long pfn; @@ -2104,9 +2114,8 @@ static int check_header(struct swsusp_info *info) } /** - * load header - check the image header and copy data from it + * load header - Check the image header and copy the data from it. */ - static int load_header(struct swsusp_info *info) { int error; @@ -2121,8 +2130,12 @@ static int load_header(struct swsusp_info *info) } /** - * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set - * the corresponding bit in the memory bitmap @bm + * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. + * @bm: Memory bitmap. + * @buf: Area of memory containing the PFNs. + * + * For each element of the array pointed to by @buf (1 page at a time), set the + * corresponding bit in @bm. */ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { @@ -2145,7 +2158,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) } #ifdef CONFIG_HIGHMEM -/* struct highmem_pbe is used for creating the list of highmem pages that +/* + * struct highmem_pbe is used for creating the list of highmem pages that * should be restored atomically during the resume from disk, because the page * frames they have occupied before the suspend are in use. */ @@ -2155,7 +2169,8 @@ struct highmem_pbe { struct highmem_pbe *next; }; -/* List of highmem PBEs needed for restoring the highmem pages that were +/* + * List of highmem PBEs needed for restoring the highmem pages that were * allocated before the suspend and included in the suspend image, but have * also been allocated by the "resume" kernel, so their contents cannot be * written directly to their "original" page frames. @@ -2163,11 +2178,11 @@ struct highmem_pbe { static struct highmem_pbe *highmem_pblist; /** - * count_highmem_image_pages - compute the number of highmem pages in the - * suspend image. The bits in the memory bitmap @bm that correspond to the - * image pages are assumed to be set. + * count_highmem_image_pages - Compute the number of highmem pages in the image. + * @bm: Memory bitmap. + * + * The bits in @bm that correspond to image pages are assumed to be set. */ - static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { unsigned long pfn; @@ -2184,22 +2199,23 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) return cnt; } -/** - * prepare_highmem_image - try to allocate as many highmem pages as - * there are highmem image pages (@nr_highmem_p points to the variable - * containing the number of highmem image pages). The pages that are - * "safe" (ie. will not be overwritten when the suspend image is - * restored) have the corresponding bits set in @bm (it must be - * unitialized). - * - * NOTE: This function should not be called if there are no highmem - * image pages. - */ - static unsigned int safe_highmem_pages; static struct memory_bitmap *safe_highmem_bm; +/** + * prepare_highmem_image - Allocate memory for loading highmem data from image. + * @bm: Pointer to an uninitialized memory bitmap structure. + * @nr_highmem_p: Pointer to the number of highmem image pages. + * + * Try to allocate as many highmem pages as there are highmem image pages + * (@nr_highmem_p points to the variable containing the number of highmem image + * pages). The pages that are "safe" (ie. will not be overwritten when the + * hibernation image is restored entirely) have the corresponding bits set in + * @bm (it must be unitialized). + * + * NOTE: This function should not be called if there are no highmem image pages. + */ static int prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) { @@ -2236,25 +2252,26 @@ static int prepare_highmem_image(struct memory_bitmap *bm, return 0; } +static struct page *last_highmem_page; + /** - * get_highmem_page_buffer - for given highmem image page find the buffer - * that suspend_write_next() should set for its caller to write to. + * get_highmem_page_buffer - Prepare a buffer to store a highmem image page. + * + * For a given highmem image page get a buffer that suspend_write_next() should + * return to its caller to write to. * - * If the page is to be saved to its "original" page frame or a copy of - * the page is to be made in the highmem, @buffer is returned. Otherwise, - * the copy of the page is to be made in normal memory, so the address of - * the copy is returned. + * If the page is to be saved to its "original" page frame or a copy of + * the page is to be made in the highmem, @buffer is returned. Otherwise, + * the copy of the page is to be made in normal memory, so the address of + * the copy is returned. * - * If @buffer is returned, the caller of suspend_write_next() will write - * the page's contents to @buffer, so they will have to be copied to the - * right location on the next call to suspend_write_next() and it is done - * with the help of copy_last_highmem_page(). For this purpose, if - * @buffer is returned, @last_highmem page is set to the page to which - * the data will have to be copied from @buffer. + * If @buffer is returned, the caller of suspend_write_next() will write + * the page's contents to @buffer, so they will have to be copied to the + * right location on the next call to suspend_write_next() and it is done + * with the help of copy_last_highmem_page(). For this purpose, if + * @buffer is returned, @last_highmem_page is set to the page to which + * the data will have to be copied from @buffer. */ - -static struct page *last_highmem_page; - static void *get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) { @@ -2262,13 +2279,15 @@ static void *get_highmem_page_buffer(struct page *page, void *kaddr; if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { - /* We have allocated the "original" page frame and we can + /* + * We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ last_highmem_page = page; return buffer; } - /* The "original" page frame has not been allocated and we have to + /* + * The "original" page frame has not been allocated and we have to * use a "safe" page frame to store the loaded page. */ pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); @@ -2298,11 +2317,12 @@ static void *get_highmem_page_buffer(struct page *page, } /** - * copy_last_highmem_page - copy the contents of a highmem image from - * @buffer, where the caller of snapshot_write_next() has place them, - * to the right location represented by @last_highmem_page . + * copy_last_highmem_page - Copy most the most recent highmem image page. + * + * Copy the contents of a highmem image from @buffer, where the caller of + * snapshot_write_next() has stored them, to the right location represented by + * @last_highmem_page . */ - static void copy_last_highmem_page(void) { if (last_highmem_page) { @@ -2345,22 +2365,23 @@ static inline int last_highmem_page_copied(void) { return 1; } static inline void free_highmem_data(void) {} #endif /* CONFIG_HIGHMEM */ +#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) + /** - * prepare_image - use the memory bitmap @bm to mark the pages that will - * be overwritten in the process of restoring the system memory state - * from the suspend image ("unsafe" pages) and allocate memory for the - * image. + * prepare_image - Make room for loading hibernation image. + * @new_bm: Unitialized memory bitmap structure. + * @bm: Memory bitmap with unsafe pages marked. + * + * Use @bm to mark the pages that will be overwritten in the process of + * restoring the system memory state from the suspend image ("unsafe" pages) + * and allocate memory for the image. * - * The idea is to allocate a new memory bitmap first and then allocate - * as many pages as needed for the image data, but not to assign these - * pages to specific tasks initially. Instead, we just mark them as - * allocated and create a lists of "safe" pages that will be used - * later. On systems with high memory a list of "safe" highmem pages is - * also created. + * The idea is to allocate a new memory bitmap first and then allocate + * as many pages as needed for image data, but without specifying what those + * pages will be used for just yet. Instead, we mark them all as allocated and + * create a lists of "safe" pages to be used later. On systems with high + * memory a list of "safe" highmem pages is created too. */ - -#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) - static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) { unsigned int nr_pages, nr_highmem; @@ -2385,7 +2406,8 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) if (error) goto Free; } - /* Reserve some safe pages for potential later use. + /* + * Reserve some safe pages for potential later use. * * NOTE: This way we make sure there will be enough safe pages for the * chain_alloc() in get_buffer(). It is a bit wasteful, but @@ -2431,10 +2453,11 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } /** - * get_buffer - compute the address that snapshot_write_next() should - * set for its caller to write to. + * get_buffer - Get the address to store the next image data page. + * + * Get the address that snapshot_write_next() should return to its caller to + * write to. */ - static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; @@ -2449,12 +2472,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) return get_highmem_page_buffer(page, ca); if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) - /* We have allocated the "original" page frame and we can + /* + * We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ return page_address(page); - /* The "original" page frame has not been allocated and we have to + /* + * The "original" page frame has not been allocated and we have to * use a "safe" page frame to store the loaded page. */ pbe = chain_alloc(ca, sizeof(struct pbe)); @@ -2471,22 +2496,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) } /** - * snapshot_write_next - used for writing the system memory snapshot. + * snapshot_write_next - Get the address to store the next image page. + * @handle: Snapshot handle structure to guide the writing. * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. * - * On success the function returns a positive number. Then, the caller - * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. + * On success, the function returns a positive number. Then, the caller + * is allowed to write up to the returned number of bytes to the memory + * location computed by the data_of() macro. * - * The function returns 0 to indicate the "end of file" condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. + * The function returns 0 to indicate the "end of file" condition. Negative + * numbers are returned on errors, in which cases the structure pointed to by + * @handle is not updated and should not be used any more. */ - int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; @@ -2556,13 +2580,13 @@ int snapshot_write_next(struct snapshot_handle *handle) } /** - * snapshot_write_finalize - must be called after the last call to - * snapshot_write_next() in case the last page in the image happens - * to be a highmem page and its contents should be stored in the - * highmem. Additionally, it releases the memory that will not be - * used any more. + * snapshot_write_finalize - Complete the loading of a hibernation image. + * + * Must be called after the last call to snapshot_write_next() in case the last + * page in the image happens to be a highmem page and its contents should be + * stored in highmem. Additionally, it recycles bitmap memory that's not + * necessary any more. */ - void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); @@ -2599,15 +2623,15 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2, } /** - * restore_highmem - for each highmem page that was allocated before - * the suspend and included in the suspend image, and also has been - * allocated by the "resume" kernel swap its current (ie. "before - * resume") contents with the previous (ie. "before suspend") one. + * restore_highmem - Put highmem image pages into their original locations. + * + * For each highmem page that was in use before hibernation and is included in + * the image, and also has been allocated by the "restore" kernel, swap its + * current contents with the previous (ie. "before hibernation") ones. * - * If the resume eventually fails, we can call this function once - * again and restore the "before resume" highmem state. + * If the restore eventually fails, we can call this function once again and + * restore the highmem state as seen by the restore kernel. */ - int restore_highmem(void) { struct highmem_pbe *pbe = highmem_pblist; -- cgit v1.3-8-gc7d7 From d5f32af3100165cbd625855bd155b3aa9bd87ebf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Jul 2016 23:44:31 +0200 Subject: PM / hibernate: Add missing braces in __register_nosave_region() One branch of an if/else statement in __register_nosave_region() is formatted against the kernel coding style which causes the code to look slightly odd. To fix that, add missing braces to it. No functional changes. Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bd927d9efeb7..d64d5d0efa79 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -920,9 +920,10 @@ void __init __register_nosave_region(unsigned long start_pfn, /* During init, this shouldn't fail */ region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); BUG_ON(!region); - } else + } else { /* This allocation cannot fail */ region = memblock_virt_alloc(sizeof(struct nosave_region), 0); + } region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); -- cgit v1.3-8-gc7d7 From 4c0b6c10fbaf0c82efe2a7ba6c236c633d4f2ed7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 10 Jul 2016 02:12:10 +0200 Subject: PM / hibernate: Image data protection during restoration Make it possible to protect all pages holding image data during hibernate image restoration by setting them read-only (so as to catch attempts to write to those pages after image data have been stored in them). This adds overhead to image restoration code (it may cause large page mappings to be split as a result of page flags changes) and the errors it protects against should never happen in theory, so the feature is only active after passing hibernate=protect_image to the command line of the restore kernel. Also it only is built if CONFIG_DEBUG_RODATA is set. Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 3 +++ kernel/power/hibernate.c | 3 +++ kernel/power/power.h | 7 +++++++ kernel/power/snapshot.c | 42 +++++++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82b42c958d1c..07960c24642d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3594,6 +3594,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. present during boot. nocompress Don't compress/decompress hibernation images. no Disable hibernation and resume. + protect_image Turn on image protection during restoration + (that will set all pages holding image data + during restoration read-only). retain_initrd [RAM] Keep initrd memory after extraction diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b00f270d328e..51441d87f0b6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1126,6 +1126,9 @@ static int __init hibernate_setup(char *str) } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; + } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) + && !strncmp(str, "protect_image", 13)) { + enable_restore_image_protection(); } return 1; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 51f02ecaf125..064963e89194 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -59,6 +59,13 @@ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); +#ifdef CONFIG_DEBUG_RODATA +/* kernel/power/snapshot.c */ +extern void enable_restore_image_protection(void); +#else +static inline void enable_restore_image_protection(void) {} +#endif /* CONFIG_DEBUG_RODATA */ + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d64d5d0efa79..d90df926b59f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -38,6 +38,43 @@ #include "power.h" +#ifdef CONFIG_DEBUG_RODATA +static bool hibernate_restore_protection; +static bool hibernate_restore_protection_active; + +void enable_restore_image_protection(void) +{ + hibernate_restore_protection = true; +} + +static inline void hibernate_restore_protection_begin(void) +{ + hibernate_restore_protection_active = hibernate_restore_protection; +} + +static inline void hibernate_restore_protection_end(void) +{ + hibernate_restore_protection_active = false; +} + +static inline void hibernate_restore_protect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + set_memory_ro((unsigned long)page_address, 1); +} + +static inline void hibernate_restore_unprotect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + set_memory_rw((unsigned long)page_address, 1); +} +#else +static inline void hibernate_restore_protection_begin(void) {} +static inline void hibernate_restore_protection_end(void) {} +static inline void hibernate_restore_protect_page(void *page_address) {} +static inline void hibernate_restore_unprotect_page(void *page_address) {} +#endif /* CONFIG_DEBUG_RODATA */ + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -1414,6 +1451,7 @@ loop: memory_bm_clear_current(forbidden_pages_map); memory_bm_clear_current(free_pages_map); + hibernate_restore_unprotect_page(page_address(page)); __free_page(page); goto loop; } @@ -1425,6 +1463,7 @@ out: buffer = NULL; alloc_normal = 0; alloc_highmem = 0; + hibernate_restore_protection_end(); } /* Helper functions used for the shrinking of memory. */ @@ -2548,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { error = unpack_orig_pfns(buffer, ©_bm); if (error) @@ -2570,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle) copy_last_highmem_page(); /* Restore page key for data page (s390 only). */ page_key_write(handle->buffer); + hibernate_restore_protect_page(handle->buffer); handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); @@ -2594,6 +2635,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) /* Restore page key for data page (s390 only). */ page_key_write(handle->buffer); page_key_free(); + hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_recycle(&orig_bm); -- cgit v1.3-8-gc7d7 From 748c7201e622d1c24abb4f85072d2e74d12f295f Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 3 Jun 2016 17:10:18 -0300 Subject: sched/core: Panic on scheduling while atomic bugs if kernel.panic_on_warn is set Currently, a schedule while atomic error prints the stack trace to the kernel log and the system continue running. Although it is possible to collect the kernel log messages and analyze it, often more information are needed. Furthermore, keep the system running is not always the best choice. For example, when the preempt count underflows the system will not stop to complain about scheduling while atomic, so the kernel log can wrap around overwriting the first stack trace, tuning the analysis even more challenging. This patch uses the kernel.panic_on_warn sysctl to help out on these more complex situations. When kernel.panic_on_warn is set to 1, the kernel will panic() in the schedule while atomic detection. The default value of the sysctl is 0, maintaining the current behavior. Signed-off-by: Daniel Bristot de Oliveira Reviewed-by: Luis Claudio R. Goncalves Cc: Christian Borntraeger Cc: Linus Torvalds Cc: Luis Claudio R. Goncalves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e8f7b80f353aa22c63bd8557208163989af8493d.1464983675.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28da50a5bc76..4e9617a7e7d9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3168,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif + if (panic_on_warn) + panic("scheduling while atomic\n"); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } -- cgit v1.3-8-gc7d7 From 44530d588e142a96cf0cd345a7cb8911c4f88720 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 10 Jul 2016 20:58:36 +0200 Subject: Revert "perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86" This reverts commit 2c95afc1e83d93fac3be6923465e1753c2c53b0a. Stephane reported the following regression: > Since Andi added: > > commit 2c95afc1e83d93fac3be6923465e1753c2c53b0a > Author: Andi Kleen > Date: Thu Jun 9 06:14:38 2016 -0700 > > perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86 > > $ perf stat -e ref-cycles ls > .... > > fails systematically because the ref-cycles is now used by the > watchdog and given this is a system-wide pinned event, it monopolizes > the fixed counter 2 which is the only counter able to measure this event. Since the next merge window is near, fix the regression for now by reverting the commit. Reported-by: Stephane Eranian Cc: Andi Kleen Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Vince Weaver Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 8 -------- include/linux/nmi.h | 1 - kernel/watchdog.c | 7 ------- 3 files changed, 16 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 016f4263fad4..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,16 +18,8 @@ #include #include #include -#include #ifdef CONFIG_HARDLOCKUP_DETECTOR -int hw_nmi_get_event(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return PERF_COUNT_HW_REF_CPU_CYCLES; - return PERF_COUNT_HW_CPU_CYCLES; -} - u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 79858af27209..4630eeae18e0 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -66,7 +66,6 @@ static inline bool trigger_allbutself_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh); -int hw_nmi_get_event(void); extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8dd30fcd91be..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -315,12 +315,6 @@ static int is_softlockup(unsigned long touch_ts) #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* Can be overriden by architecture */ -__weak int hw_nmi_get_event(void) -{ - return PERF_COUNT_HW_CPU_CYCLES; -} - static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -610,7 +604,6 @@ static int watchdog_nmi_enable(unsigned int cpu) wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - wd_attr->config = hw_nmi_get_event(); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -- cgit v1.3-8-gc7d7 From a1b7b1a57b9919a0abb6c93fca04ac9cf840c992 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Sun, 3 Jul 2016 03:24:08 +0300 Subject: irqdomain: Fix irq_domain_alloc_irqs_recursive() error handling If an irq_domain is auto-recursive and irq_domain_alloc_irqs_recursive() for its parent has returned an error, then do return and avoid calling irq_domain_free_irqs_recursive() uselessly, because: - if domain->ops->alloc() had failed for an auto-recursive irq_domain, then irq_domain_free_irqs_recursive() had already been called; - if domain->ops->alloc() had failed for a not auto-recursive irq_domain, then there is nothing to free at all. Signed-off-by: Alexander Popov Acked-by: Marc Zyngier Link: http://lkml.kernel.org/r/1467505448-2850-1-git-send-email-alex.popov@linux.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index a82853739694..4752b43662e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1192,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, if (recursive) ret = irq_domain_alloc_irqs_recursive(parent, irq_base, nr_irqs, arg); - if (ret >= 0) - ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); + if (ret < 0) + return ret; + + ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); if (ret < 0 && recursive) irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); -- cgit v1.3-8-gc7d7 From a536a6e13ecd0d6eb0ffc36c5d56555896617282 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 11 Jul 2016 12:51:01 -0400 Subject: bpf: make inode code explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config BPF_SYSCALL init/Kconfig: bool "Enable bpf() system call" ...meaning that it currently is not being built as a module by anyone. Lets remove the couple traces of modular infrastructure use, so that when reading the driver there is no doubt it is builtin-only. Note that MODULE_ALIAS is a no-op for non-modular code. We replace module.h with init.h since the file does use __init. Cc: Alexei Starovoitov Cc: netdev@vger.kernel.org Signed-off-by: Paul Gortmaker Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 318858edb1cd..5967b870a895 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -11,7 +11,7 @@ * version 2 as published by the Free Software Foundation. */ -#include +#include #include #include #include @@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = { .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("bpf"); - static int __init bpf_init(void) { int ret; -- cgit v1.3-8-gc7d7 From 6a4e24518c8a10f78f44da219835239cb5aca90d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2016 17:16:03 +0000 Subject: cpu/hotplug: Handle early registration gracefully We switched the hotplug machinery to smpboot threads. Early registration of hotplug callbacks, i.e. from do_pre_smp_initcalls(), happens before the threads are initialized. Instead of moving the thread init, we simply handle it in the hotplug code itself and invoke the function directly. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160713153332.896450738@linutronix.de Signed-off-by: Ingo Molnar --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 7b61887f7ccd..fe71ce4e60f1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, if (!cpu_online(cpu)) return 0; + /* + * If we are up and running, use the hotplug thread. For early calls + * we invoke the thread function directly. + */ + if (!st->thread) + return cpuhp_invoke_callback(cpu, state, cb); + st->cb_state = state; st->cb = cb; /* -- cgit v1.3-8-gc7d7 From 00e16c3d68fce504e880f59c9bdf23b2a4759d6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2016 17:16:09 +0000 Subject: perf/core: Convert to hotplug state machine Actually a nice symmetric startup/teardown pair which fits properly into the state machine concept. In the long run we should be able to invoke the startup callback for the boot CPU via the state machine and get rid of the init function which invokes it on the boot CPU. Note: This comes actually before the perf hardware callbacks. In the notifier model the hardware callbacks have a higher priority than the core callback. But that's solely for CPU offline so that hardware migration of events happens before the core is notified about the outgoing CPU. With the symetric state array model we have the following ordering: UP: core -> hardware DOWN: hardware -> core Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Reviewed-by: Sebastian Siewior Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160713153333.587514098@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/cpuhotplug.h | 2 ++ include/linux/perf_event.h | 9 ++++++++ kernel/cpu.c | 11 +++++++++ kernel/events/core.c | 56 +++++++++------------------------------------- 4 files changed, 32 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d769ec941fd3..067082e3fd41 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -4,6 +4,7 @@ enum cpuhp_state { CPUHP_OFFLINE, CPUHP_CREATE_THREADS, + CPUHP_PERF_PREPARE, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, @@ -22,6 +23,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, + CPUHP_AP_PERF_ONLINE, CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1a827cecd62f..9abeb6948e70 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1354,4 +1354,13 @@ _name##_show(struct device *dev, \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) +/* Performance counter hotplug functions */ +#ifdef CONFIG_PERF_EVENTS +int perf_event_init_cpu(unsigned int cpu); +int perf_event_exit_cpu(unsigned int cpu); +#else +#define perf_event_init_cpu NULL +#define perf_event_exit_cpu NULL +#endif + #endif /* _LINUX_PERF_EVENT_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index fe71ce4e60f1..3705d9043c08 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1180,6 +1180,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + [CPUHP_PERF_PREPARE] = { + .name = "perf prepare", + .startup = perf_event_init_cpu, + .teardown = perf_event_exit_cpu, + }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers * are converted to states. @@ -1257,6 +1262,12 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = smpboot_unpark_threads, .teardown = NULL, }, + [CPUHP_AP_PERF_ONLINE] = { + .name = "perf online", + .startup = perf_event_init_cpu, + .teardown = perf_event_exit_cpu, + }, + /* * Online/down_prepare notifiers. Will be removed once the notifiers * are converted to states. diff --git a/kernel/events/core.c b/kernel/events/core.c index 43d43a2d5811..f3ef1c29a7c9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10255,7 +10255,7 @@ static void __init perf_event_init_all_cpus(void) } } -static void perf_event_init_cpu(int cpu) +int perf_event_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10268,6 +10268,7 @@ static void perf_event_init_cpu(int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); + return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10299,14 +10300,17 @@ static void perf_event_exit_cpu_context(int cpu) } srcu_read_unlock(&pmus_srcu, idx); } +#else + +static void perf_event_exit_cpu_context(int cpu) { } + +#endif -static void perf_event_exit_cpu(int cpu) +int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); + return 0; } -#else -static inline void perf_event_exit_cpu(int cpu) { } -#endif static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) @@ -10328,46 +10332,6 @@ static struct notifier_block perf_reboot_notifier = { .priority = INT_MIN, }; -static int -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - /* - * This must be done before the CPU comes alive, because the - * moment we can run tasks we can encounter (software) events. - * - * Specifically, someone can have inherited events on kthreadd - * or a pre-existing worker thread that gets re-bound. - */ - perf_event_init_cpu(cpu); - break; - - case CPU_DOWN_PREPARE: - /* - * This must be done before the CPU dies because after that an - * active event might want to IPI the CPU and that'll not work - * so great for dead CPUs. - * - * XXX smp_call_function_single() return -ENXIO without a warn - * so we could possibly deal with this. - * - * This is safe against new events arriving because - * sys_perf_event_open() serializes against hotplug using - * get_online_cpus(). - */ - perf_event_exit_cpu(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - void __init perf_event_init(void) { int ret; @@ -10380,7 +10344,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_cpu_clock, NULL, -1); perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); - perf_cpu_notifier(perf_cpu_notify); + perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); -- cgit v1.3-8-gc7d7 From 7ee681b25284782ecf380bf5ccf55f13c52fd0ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2016 17:16:29 +0000 Subject: workqueue: Convert to state machine callbacks Get rid of the prio ordering of the separate notifiers and use a proper state callback pair. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Reviewed-by: Sebastian Andrzej Siewior Acked-by: Tejun Heo Cc: Andrew Morton Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Nicolas Iooss Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Rusty Russell Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160713153335.197083890@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/cpu.h | 9 ---- include/linux/cpuhotplug.h | 2 + include/linux/workqueue.h | 6 +++ kernel/cpu.c | 10 +++++ kernel/workqueue.c | 108 ++++++++++++++++++--------------------------- 5 files changed, 61 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ca2dd865a34e..797d9c8e9a1b 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -55,15 +55,6 @@ extern ssize_t arch_cpu_release(const char *, size_t); #endif struct notifier_block; -/* - * CPU notifier priorities. - */ -enum { - /* bring up workqueues before normal notifiers and down after */ - CPU_PRI_WORKQUEUE_UP = 5, - CPU_PRI_WORKQUEUE_DOWN = -5, -}; - #define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ #define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ #define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index acfeda137df8..60557a9e783d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -12,6 +12,7 @@ enum cpuhp_state { CPUHP_PERF_BFIN, CPUHP_PERF_POWER, CPUHP_PERF_SUPERH, + CPUHP_WORKQUEUE_PREP, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, @@ -49,6 +50,7 @@ enum cpuhp_state { CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, + CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ca73c503b92a..26cc1df280d6 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -625,4 +625,10 @@ void wq_watchdog_touch(int cpu); static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ +#ifdef CONFIG_SMP +int workqueue_prepare_cpu(unsigned int cpu); +int workqueue_online_cpu(unsigned int cpu); +int workqueue_offline_cpu(unsigned int cpu); +#endif + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 3705d9043c08..af53f820fec9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1185,6 +1185,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = perf_event_init_cpu, .teardown = perf_event_exit_cpu, }, + [CPUHP_WORKQUEUE_PREP] = { + .name = "workqueue prepare", + .startup = workqueue_prepare_cpu, + .teardown = NULL, + }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers * are converted to states. @@ -1267,6 +1272,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = perf_event_init_cpu, .teardown = perf_event_exit_cpu, }, + [CPUHP_AP_WORKQUEUE_ONLINE] = { + .name = "workqueue online", + .startup = workqueue_online_cpu, + .teardown = workqueue_offline_cpu, + }, /* * Online/down_prepare notifiers. Will be removed once the notifiers diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1c0e996b5ae..c9dd5fbdbf33 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4611,84 +4611,65 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) pool->attrs->cpumask) < 0); } -/* - * Workqueues should be brought up before normal priority CPU notifiers. - * This will be registered high priority CPU notifier. - */ -static int workqueue_cpu_up_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +int workqueue_prepare_cpu(unsigned int cpu) +{ + struct worker_pool *pool; + + for_each_cpu_worker_pool(pool, cpu) { + if (pool->nr_workers) + continue; + if (!create_worker(pool)) + return -ENOMEM; + } + return 0; +} + +int workqueue_online_cpu(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct worker_pool *pool; struct workqueue_struct *wq; int pi; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - for_each_cpu_worker_pool(pool, cpu) { - if (pool->nr_workers) - continue; - if (!create_worker(pool)) - return NOTIFY_BAD; - } - break; - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - mutex_lock(&wq_pool_mutex); + mutex_lock(&wq_pool_mutex); - for_each_pool(pool, pi) { - mutex_lock(&pool->attach_mutex); + for_each_pool(pool, pi) { + mutex_lock(&pool->attach_mutex); - if (pool->cpu == cpu) - rebind_workers(pool); - else if (pool->cpu < 0) - restore_unbound_workers_cpumask(pool, cpu); + if (pool->cpu == cpu) + rebind_workers(pool); + else if (pool->cpu < 0) + restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); - } + mutex_unlock(&pool->attach_mutex); + } - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); - mutex_unlock(&wq_pool_mutex); - break; - } - return NOTIFY_OK; + mutex_unlock(&wq_pool_mutex); + return 0; } -/* - * Workqueues should be brought down after normal priority CPU notifiers. - * This will be registered as low priority CPU notifier. - */ -static int workqueue_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +int workqueue_offline_cpu(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct work_struct unbind_work; struct workqueue_struct *wq; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); - - /* update NUMA affinity of unbound workqueues */ - mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); - mutex_unlock(&wq_pool_mutex); - - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); - break; - } - return NOTIFY_OK; + /* unbinding per-cpu workers should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); + queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ + flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); + return 0; } #ifdef CONFIG_SMP @@ -5490,9 +5471,6 @@ static int __init init_workqueues(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); - hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); - wq_numa_init(); /* initialize CPU pools */ -- cgit v1.3-8-gc7d7 From 57430218317e5b280a80582a139b26029c25de6c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 13 Jul 2016 16:50:01 +0200 Subject: sched/cputime: Count actually elapsed irq & softirq time Currently, if there was any irq or softirq time during 'ticks' jiffies, the entire period will be accounted as irq or softirq time. This is inaccurate if only a subset of the time was actually spent handling irqs, and could conceivably mis-count all of the ticks during a period as irq time, when there was some irq and some softirq time. This can actually happen when irqtime_account_process_tick is called from account_idle_ticks, which can pass a larger number of ticks down all at once. Fix this by changing irqtime_account_hi_update(), irqtime_account_si_update(), and steal_account_process_ticks() to work with cputime_t time units, and return the amount of time spent in each mode. Rename steal_account_process_ticks() to steal_account_process_time(), to reflect that time is now accounted in cputime_t, instead of ticks. Additionally, have irqtime_account_process_tick() take into account how much time was spent in each of steal, irq, and softirq time. The latter could help improve the accuracy of cputime accounting when returning from idle on a NO_HZ_IDLE CPU. Properly accounting how much time was spent in hardirq and softirq time will also allow the NO_HZ_FULL code to re-use these same functions for hardirq and softirq accounting. Signed-off-by: Rik van Riel [ Make nsecs_to_cputime64() actually return cputime64_t. ] Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/asm-generic/cputime_nsecs.h | 2 + kernel/sched/cputime.c | 124 ++++++++++++++++++++++-------------- 2 files changed, 79 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 0f1c6f315cdc..a84e28e0c634 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -50,6 +50,8 @@ typedef u64 __nocast cputime64_t; (__force u64)(__ct) #define nsecs_to_cputime(__nsecs) \ (__force cputime_t)(__nsecs) +#define nsecs_to_cputime64(__nsecs) \ + (__force cputime64_t)(__nsecs) /* diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3d60e5d76fdb..db82ae12cf01 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -79,40 +79,50 @@ void irqtime_account_irq(struct task_struct *curr) } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static int irqtime_account_hi_update(void) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t irq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; + irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - + cpustat[CPUTIME_IRQ]; + irq_cputime = min(irq_cputime, maxtime); + cpustat[CPUTIME_IRQ] += irq_cputime; local_irq_restore(flags); - return ret; + return irq_cputime; } -static int irqtime_account_si_update(void) +static cputime_t irqtime_account_si_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t softirq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; + softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - + cpustat[CPUTIME_SOFTIRQ]; + softirq_cputime = min(softirq_cputime, maxtime); + cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; local_irq_restore(flags); - return ret; + return softirq_cputime; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime (0) +static cputime_t irqtime_account_hi_update(cputime_t dummy) +{ + return 0; +} + +static cputime_t irqtime_account_si_update(cputime_t dummy) +{ + return 0; +} + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ static inline void task_group_account_field(struct task_struct *p, int index, @@ -257,31 +267,44 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies) +static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { + cputime_t steal_cputime; u64 steal; - unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - /* - * steal is in nsecs but our caller is expecting steal - * time in jiffies. Lets cast the result to jiffies - * granularity and account the rest on the next rounds. - */ - steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies); - this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); + steal_cputime = min(nsecs_to_cputime(steal), maxtime); + account_steal_time(steal_cputime); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); - account_steal_time(jiffies_to_cputime(steal_jiffies)); - return steal_jiffies; + return steal_cputime; } #endif return 0; } +/* + * Account how much elapsed time was spent in steal, irq, or softirq time. + */ +static inline cputime_t account_other_time(cputime_t max) +{ + cputime_t accounted; + + accounted = steal_account_process_time(max); + + if (accounted < max) + accounted += irqtime_account_hi_update(max - accounted); + + if (accounted < max) + accounted += irqtime_account_si_update(max - accounted); + + return accounted; +} + /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live * tasks (sum on group iteration) belonging to @tsk's group. @@ -342,21 +365,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { - cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); - u64 cputime = (__force u64) cputime_one_jiffy; - u64 *cpustat = kcpustat_this_cpu->cpustat; + u64 cputime = (__force u64) cputime_one_jiffy * ticks; + cputime_t scaled, other; - if (steal_account_process_tick(ULONG_MAX)) + /* + * When returning from idle, many ticks can get accounted at + * once, including some ticks of steal, irq, and softirq time. + * Subtract those ticks from the amount of time accounted to + * idle, or potentially user or system time. Due to rounding, + * other time can exceed ticks occasionally. + */ + other = account_other_time(cputime); + if (other >= cputime) return; + cputime -= other; + scaled = cputime_to_scaled(cputime); - cputime *= ticks; - scaled *= ticks; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += cputime; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += cputime; - } else if (this_cpu_ksoftirqd() == p) { + if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. @@ -466,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t cputime, scaled, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -477,16 +502,21 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick(ULONG_MAX)) + cputime = cputime_one_jiffy; + steal = steal_account_process_time(cputime); + + if (steal >= cputime) return; + cputime -= steal; + scaled = cputime_to_scaled(cputime); + if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); else - account_idle_time(cputime_one_jiffy); + account_idle_time(cputime); } /* @@ -681,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta_jiffies, steal_jiffies; + cputime_t delta, steal; - delta_jiffies = now - tsk->vtime_snap; - steal_jiffies = steal_account_process_tick(delta_jiffies); + delta = jiffies_to_cputime(now - tsk->vtime_snap); + steal = steal_account_process_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta_jiffies - steal_jiffies); + return delta - steal; } static void __vtime_account_system(struct task_struct *tsk) -- cgit v1.3-8-gc7d7 From b58c35840521bb02b150e1d0d34ca9197f8b7145 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 13 Jul 2016 16:50:02 +0200 Subject: sched/cputime: Replace VTIME_GEN irq time code with IRQ_TIME_ACCOUNTING code The CONFIG_VIRT_CPU_ACCOUNTING_GEN irq time tracking code does not appear to currently work right. On CPUs without nohz_full=, only tick based irq time sampling is done, which breaks down when dealing with a nohz_idle CPU. On firewalls and similar systems, no ticks may happen on a CPU for a while, and the irq time spent may never get accounted properly. This can cause issues with capacity planning and power saving, which use the CPU statistics as inputs in decision making. Remove the VTIME_GEN vtime irq time code, and replace it with the IRQ_TIME_ACCOUNTING code, when selected as a config option by the user. Signed-off-by: Rik van Riel Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 32 ++++++++++++++------------------ init/Kconfig | 6 +++--- kernel/sched/cputime.c | 16 +++------------- 3 files changed, 20 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index fa2196990f84..d1977d84ebdf 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -14,6 +14,18 @@ struct task_struct; */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline bool vtime_accounting_cpu_enabled(void) { return true; } + +#ifdef __ARCH_HAS_VTIME_ACCOUNT +extern void vtime_account_irq_enter(struct task_struct *tsk); +#else +extern void vtime_common_account_irq_enter(struct task_struct *tsk); +static inline void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (vtime_accounting_cpu_enabled()) + vtime_common_account_irq_enter(tsk); +} +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -64,17 +76,6 @@ extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); extern void vtime_account_user(struct task_struct *tsk); -#ifdef __ARCH_HAS_VTIME_ACCOUNT -extern void vtime_account_irq_enter(struct task_struct *tsk); -#else -extern void vtime_common_account_irq_enter(struct task_struct *tsk); -static inline void vtime_account_irq_enter(struct task_struct *tsk) -{ - if (vtime_accounting_cpu_enabled()) - vtime_common_account_irq_enter(tsk); -} -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ - #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ static inline void vtime_task_switch(struct task_struct *prev) { } @@ -85,13 +86,8 @@ static inline void vtime_account_irq_enter(struct task_struct *tsk) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_gen_account_irq_exit(struct task_struct *tsk); - -static inline void vtime_account_irq_exit(struct task_struct *tsk) -{ - if (vtime_accounting_cpu_enabled()) - vtime_gen_account_irq_exit(tsk); -} +static inline void vtime_account_irq_enter(struct task_struct *tsk) { } +static inline void vtime_account_irq_exit(struct task_struct *tsk) { } extern void vtime_user_enter(struct task_struct *tsk); diff --git a/init/Kconfig b/init/Kconfig index c02d89777713..787dd76acf29 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -375,9 +375,11 @@ config VIRT_CPU_ACCOUNTING_GEN If unsure, say N. +endchoice + config IRQ_TIME_ACCOUNTING bool "Fine granularity task level IRQ time accounting" - depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL + depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE help Select this option to enable fine granularity task irq time accounting. This is done by reading a timestamp on each @@ -386,8 +388,6 @@ config IRQ_TIME_ACCOUNTING If in doubt, say N here. -endchoice - config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index db82ae12cf01..ca7e33cb0967 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -711,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - cputime_t delta, steal; + cputime_t delta, other; delta = jiffies_to_cputime(now - tsk->vtime_snap); - steal = steal_account_process_time(delta); + other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return delta - steal; + return delta - other; } static void __vtime_account_system(struct task_struct *tsk) @@ -738,16 +738,6 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_gen_account_irq_exit(struct task_struct *tsk) -{ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - __vtime_account_system(tsk); - if (context_tracking_in_user()) - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); -} - void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; -- cgit v1.3-8-gc7d7 From 0cfdf9a198b0d4f5ad6c87d894db7830b796b2cc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Jul 2016 16:50:03 +0200 Subject: sched/cputime: Clean up the old vtime gen irqtime accounting completely Vtime generic irqtime accounting has been removed but there are a few remnants to clean up: * The vtime_accounting_cpu_enabled() check in irq entry was only used by CONFIG_VIRT_CPU_ACCOUNTING_GEN. We can safely remove it. * Without the vtime_accounting_cpu_enabled(), we no longer need to have a vtime_common_account_irq_enter() indirect function. * Move vtime_account_irq_enter() implementation under CONFIG_VIRT_CPU_ACCOUNTING_NATIVE which is the last user. * The vtime_account_user() call was only used on irq entry for CONFIG_VIRT_CPU_ACCOUNTING_GEN. We can remove that too. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 11 ----------- kernel/sched/cputime.c | 33 ++++++++++----------------------- 2 files changed, 10 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index d1977d84ebdf..65aef5e9d04e 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -14,18 +14,7 @@ struct task_struct; */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline bool vtime_accounting_cpu_enabled(void) { return true; } - -#ifdef __ARCH_HAS_VTIME_ACCOUNT extern void vtime_account_irq_enter(struct task_struct *tsk); -#else -extern void vtime_common_account_irq_enter(struct task_struct *tsk); -static inline void vtime_account_irq_enter(struct task_struct *tsk) -{ - if (vtime_accounting_cpu_enabled()) - vtime_common_account_irq_enter(tsk); -} -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ - #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ca7e33cb0967..16a873c203b1 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -431,6 +431,10 @@ void vtime_common_task_switch(struct task_struct *prev) } #endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement @@ -440,33 +444,16 @@ void vtime_common_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_common_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { - if (!in_interrupt()) { - /* - * If we interrupted user, context_tracking_in_user() - * is 1 because the context tracking don't hook - * on irq entry/exit. This way we know if - * we need to flush user time on kernel entry. - */ - if (context_tracking_in_user()) { - vtime_account_user(tsk); - return; - } - - if (is_idle_task(tsk)) { - vtime_account_idle(tsk); - return; - } - } - vtime_account_system(tsk); + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); + else + vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; -- cgit v1.3-8-gc7d7 From 553bf6bbfd8a540c70aee28eb50e24caff456a03 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 13 Jul 2016 16:50:05 +0200 Subject: sched/cputime: Drop local_irq_save/restore from irqtime_account_irq() Paolo pointed out that irqs are already blocked when irqtime_account_irq() is called. That means there is no reason to call local_irq_save/restore() again. Suggested-by: Paolo Bonzini Signed-off-by: Rik van Riel Signed-off-by: Frederic Weisbecker Reviewed-by: Paolo Bonzini Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1468421405-20056-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 16a873c203b1..ea0f6f31a244 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); */ void irqtime_account_irq(struct task_struct *curr) { - unsigned long flags; s64 delta; int cpu; if (!sched_clock_irqtime) return; - local_irq_save(flags); - cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -75,7 +72,6 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -- cgit v1.3-8-gc7d7 From 0b7a0fdb29715e38641beb39db4d01695b22b5aa Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 14 Jul 2016 10:59:19 -0400 Subject: audit: fix whitespace in CWD record Fix the whitespace in the CWD record Signed-off-by: Steve Grubb [PM: fixed subject line] Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ec4c552876a7..aa3feec4df14 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1430,7 +1430,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, " cwd=", &context->pwd); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } -- cgit v1.3-8-gc7d7 From 27590dc17b34aedc4f3e14bd107ee59b9db9b0a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Jul 2016 10:41:04 +0200 Subject: hrtimer: Convert to hotplug state machine Split out the clockevents callbacks instead of piggybacking them on hrtimers. This gets rid of a POST_DEAD user. See commit: 54e88fad223c ("sched: Make sure timers have migrated before killing the migration_thread") We just move the callback state to the proper place in the state machine. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Rusty Russell Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160713153337.485419196@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/cpuhotplug.h | 1 + include/linux/hrtimer.h | 7 +++++++ kernel/cpu.c | 5 +++++ kernel/time/hrtimer.c | 40 +++++----------------------------------- 4 files changed, 18 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 070cc7f28862..04ecc55009ed 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -15,6 +15,7 @@ enum cpuhp_state { CPUHP_X86_HPET_DEAD, CPUHP_X86_APB_DEAD, CPUHP_WORKQUEUE_PREP, + CPUHP_HRTIMERS_PREPARE, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c98c6539e2c2..5e00f80b1535 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -494,4 +494,11 @@ extern void __init hrtimers_init(void); /* Show pending timers: */ extern void sysrq_timer_list_show(void); +int hrtimers_prepare_cpu(unsigned int cpu); +#ifdef CONFIG_HOTPLUG_CPU +int hrtimers_dead_cpu(unsigned int cpu); +#else +#define hrtimers_dead_cpu NULL +#endif + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index af53f820fec9..85500e7fe238 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1190,6 +1190,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = workqueue_prepare_cpu, .teardown = NULL, }, + [CPUHP_HRTIMERS_PREPARE] = { + .name = "hrtimers prepare", + .startup = hrtimers_prepare_cpu, + .teardown = hrtimers_dead_cpu, + }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers * are converted to states. diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d13c9aebf7a3..9ba7c820fc23 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, /* * Functions related to boot-time initialization: */ -static void init_hrtimers_cpu(int cpu) +int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; @@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu) cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); + return 0; } #ifdef CONFIG_HOTPLUG_CPU @@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -static void migrate_hrtimers(int scpu) +int hrtimers_dead_cpu(unsigned int scpu) { struct hrtimer_cpu_base *old_base, *new_base; int i; @@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); local_irq_enable(); + return 0; } #endif /* CONFIG_HOTPLUG_CPU */ -static int hrtimer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int scpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_hrtimers_cpu(scpu); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_hrtimers(scpu); - break; -#endif - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block hrtimers_nb = { - .notifier_call = hrtimer_cpu_notify, -}; - void __init hrtimers_init(void) { - hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&hrtimers_nb); + hrtimers_prepare_cpu(smp_processor_id()); } /** -- cgit v1.3-8-gc7d7 From 24f73b99716a9cd8cbb345c41ced6b3b5ed94006 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 13 Jul 2016 17:16:59 +0000 Subject: timers/core: Convert to hotplug state machine When tearing down, call timers_dead_cpu() before notify_dead(). There is a hidden dependency between: - timers - block multiqueue - rcutree If timers_dead_cpu() comes later than blk_mq_queue_reinit_notify() that latter function causes a RCU stall. Signed-off-by: Richard Cochran Signed-off-by: Anna-Maria Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: John Stultz Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Thomas Gleixner Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160713153337.566790058@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/cpuhotplug.h | 1 + include/linux/timer.h | 6 ++++++ kernel/cpu.c | 5 +++++ kernel/time/timer.c | 25 ++----------------------- 4 files changed, 14 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 04ecc55009ed..15d46d2a1d16 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -16,6 +16,7 @@ enum cpuhp_state { CPUHP_X86_APB_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_HRTIMERS_PREPARE, + CPUHP_TIMERS_DEAD, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, diff --git a/include/linux/timer.h b/include/linux/timer.h index 4419506b564e..51d601f192d4 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -273,4 +273,10 @@ unsigned long __round_jiffies_up_relative(unsigned long j, int cpu); unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); +#ifdef CONFIG_HOTPLUG_CPU +int timers_dead_cpu(unsigned int cpu); +#else +#define timers_dead_cpu NULL +#endif + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 85500e7fe238..e1017d92d308 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1195,6 +1195,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = hrtimers_prepare_cpu, .teardown = hrtimers_dead_cpu, }, + [CPUHP_TIMERS_DEAD] = { + .name = "timers dead", + .startup = NULL, + .teardown = timers_dead_cpu, + }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers * are converted to states. diff --git a/kernel/time/timer.c b/kernel/time/timer.c index cb9ab401e2d9..555670a5143c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1804,7 +1804,7 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } -static void migrate_timers(int cpu) +int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; struct timer_base *new_base; @@ -1831,29 +1831,9 @@ static void migrate_timers(int cpu) spin_unlock_irq(&new_base->lock); put_cpu_ptr(&timer_bases); } + return 0; } -static int timer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_timers((long)hcpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static inline void timer_register_cpu_notifier(void) -{ - cpu_notifier(timer_cpu_notify, 0); -} -#else -static inline void timer_register_cpu_notifier(void) { } #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) @@ -1881,7 +1861,6 @@ void __init init_timers(void) { init_timer_cpus(); init_timer_stats(); - timer_register_cpu_notifier(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.3-8-gc7d7 From e722d8daafb974b9ad1bbaf42f384a5ea5929f5f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 13 Jul 2016 17:16:59 +0000 Subject: profile: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. A lot of code is removed because the for-loop is used and create_hash_tables() is removed since its purpose is covered by the startup / teardown hooks. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Anna-Maria Gleixner Cc: Andrew Morton Cc: Arnd Bergmann Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160713153337.649867675@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/cpuhotplug.h | 1 + kernel/profile.c | 181 ++++++++++++++++----------------------------- 2 files changed, 66 insertions(+), 116 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 15d46d2a1d16..ace5ad0fc3ec 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -16,6 +16,7 @@ enum cpuhp_state { CPUHP_X86_APB_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_HRTIMERS_PREPARE, + CPUHP_PROFILE_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, diff --git a/kernel/profile.c b/kernel/profile.c index c2199e9901c9..2dbccf2d806c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -328,68 +328,57 @@ out: put_cpu(); } -static int profile_cpu_callback(struct notifier_block *info, - unsigned long action, void *__cpu) +static int profile_dead_cpu(unsigned int cpu) { - int node, cpu = (unsigned long)__cpu; struct page *page; + int i; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - node = cpu_to_mem(cpu); - per_cpu(cpu_profile_flip, cpu) = 0; - if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - return notifier_from_errno(-ENOMEM); - per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); - } - if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - goto out_free; - per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); - } - break; -out_free: - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - return notifier_from_errno(-ENOMEM); - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (prof_cpu_mask != NULL) - cpumask_set_cpu(cpu, prof_cpu_mask); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - if (prof_cpu_mask != NULL) - cpumask_clear_cpu(cpu, prof_cpu_mask); - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; + if (prof_cpu_mask != NULL) + cpumask_clear_cpu(cpu, prof_cpu_mask); + + for (i = 0; i < 2; i++) { + if (per_cpu(cpu_profile_hits, cpu)[i]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); + per_cpu(cpu_profile_hits, cpu)[i] = NULL; __free_page(page); } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); + } + return 0; +} + +static int profile_prepare_cpu(unsigned int cpu) +{ + int i, node = cpu_to_mem(cpu); + struct page *page; + + per_cpu(cpu_profile_flip, cpu) = 0; + + for (i = 0; i < 2; i++) { + if (per_cpu(cpu_profile_hits, cpu)[i]) + continue; + + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) { + profile_dead_cpu(cpu); + return -ENOMEM; } - break; + per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); + } - return NOTIFY_OK; + return 0; +} + +static int profile_online_cpu(unsigned int cpu) +{ + if (prof_cpu_mask != NULL) + cpumask_set_cpu(cpu, prof_cpu_mask); + + return 0; } + #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) -#define profile_cpu_callback NULL static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { @@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = { .llseek = default_llseek, }; -#ifdef CONFIG_SMP -static void profile_nop(void *unused) -{ -} - -static int create_hash_tables(void) +int __ref create_proc_profile(void) { - int cpu; - - for_each_online_cpu(cpu) { - int node = cpu_to_mem(cpu); - struct page *page; - - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[1] - = (struct profile_hit *)page_address(page); - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[0] - = (struct profile_hit *)page_address(page); - } - return 0; -out_cleanup: - prof_on = 0; - smp_mb(); - on_each_cpu(profile_nop, NULL, 1); - for_each_online_cpu(cpu) { - struct page *page; - - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; - __free_page(page); - } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - } - } - return -1; -} -#else -#define create_hash_tables() ({ 0; }) + struct proc_dir_entry *entry; +#ifdef CONFIG_SMP + enum cpuhp_state online_state; #endif -int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ -{ - struct proc_dir_entry *entry; int err = 0; if (!prof_on) return 0; - - cpu_notifier_register_begin(); - - if (create_hash_tables()) { - err = -ENOMEM; - goto out; - } - +#ifdef CONFIG_SMP + err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", + profile_prepare_cpu, profile_dead_cpu); + if (err) + return err; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", + profile_online_cpu, NULL); + if (err < 0) + goto err_state_prep; + online_state = err; + err = 0; +#endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) - goto out; + goto err_state_onl; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - __hotcpu_notifier(profile_cpu_callback, 0); -out: - cpu_notifier_register_done(); + return err; +err_state_onl: +#ifdef CONFIG_SMP + cpuhp_remove_state(online_state); +err_state_prep: + cpuhp_remove_state(CPUHP_PROFILE_PREPARE); +#endif return err; } subsys_initcall(create_proc_profile); -- cgit v1.3-8-gc7d7 From 31487f8328f20fdb302430b020a5d6e8446c1971 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 13 Jul 2016 17:17:01 +0000 Subject: smp/cfd: Convert core to hotplug state machine Install the callbacks via the state machine. They are installed at runtime so smpcfd_prepare_cpu() needs to be invoked by the boot-CPU. Signed-off-by: Richard Weinberger [ Added the dropped CPU dying case back in. ] Signed-off-by: Richard Cochran Signed-off-by: Anna-Maria Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Thomas Gleixner Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160713153337.818376366@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/cpuhotplug.h | 2 ++ include/linux/smp.h | 5 +++ kernel/cpu.c | 9 ++++++ kernel/smp.c | 79 +++++++++++++++++++--------------------------- 4 files changed, 48 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 78170827a776..b5cf01ace71b 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -18,6 +18,7 @@ enum cpuhp_state { CPUHP_HRTIMERS_PREPARE, CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, + CPUHP_SMPCFD_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, @@ -57,6 +58,7 @@ enum cpuhp_state { CPUHP_AP_ARM_CORESIGHT4_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_LEDTRIG_STARTING, + CPUHP_AP_SMPCFD_DYING, CPUHP_AP_X86_TBOOT_DYING, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, diff --git a/include/linux/smp.h b/include/linux/smp.h index c4414074bd88..eccae4690f41 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -196,4 +196,9 @@ extern void arch_enable_nonboot_cpus_end(void); void smp_setup_processor_id(void); +/* SMP core functions */ +int smpcfd_prepare_cpu(unsigned int cpu); +int smpcfd_dead_cpu(unsigned int cpu); +int smpcfd_dying_cpu(unsigned int cpu); + #endif /* __LINUX_SMP_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index e1017d92d308..008e2fd40cb1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1195,6 +1195,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = hrtimers_prepare_cpu, .teardown = hrtimers_dead_cpu, }, + [CPUHP_SMPCFD_PREPARE] = { + .name = "SMPCFD prepare", + .startup = smpcfd_prepare_cpu, + .teardown = smpcfd_dead_cpu, + }, [CPUHP_TIMERS_DEAD] = { .name = "timers dead", .startup = NULL, @@ -1218,6 +1223,10 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + [CPUHP_AP_SMPCFD_DYING] = { + .startup = NULL, + .teardown = smpcfd_dying_cpu, + }, /* * Handled on controll processor until the plugged processor manages * this itself. diff --git a/kernel/smp.c b/kernel/smp.c index 74165443c240..7180491c9678 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static void flush_smp_call_function_queue(bool warn_cpu_offline); -static int -hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +int smpcfd_prepare_cpu(unsigned int cpu) { - long cpu = (long)hcpu; struct call_function_data *cfd = &per_cpu(cfd_data, cpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, - cpu_to_node(cpu))) - return notifier_from_errno(-ENOMEM); - cfd->csd = alloc_percpu(struct call_single_data); - if (!cfd->csd) { - free_cpumask_var(cfd->cpumask); - return notifier_from_errno(-ENOMEM); - } - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - /* Fall-through to the CPU_DEAD[_FROZEN] case. */ - - case CPU_DEAD: - case CPU_DEAD_FROZEN: + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + cpu_to_node(cpu))) + return -ENOMEM; + cfd->csd = alloc_percpu(struct call_single_data); + if (!cfd->csd) { free_cpumask_var(cfd->cpumask); - free_percpu(cfd->csd); - break; + return -ENOMEM; + } - case CPU_DYING: - case CPU_DYING_FROZEN: - /* - * The IPIs for the smp-call-function callbacks queued by other - * CPUs might arrive late, either due to hardware latencies or - * because this CPU disabled interrupts (inside stop-machine) - * before the IPIs were sent. So flush out any pending callbacks - * explicitly (without waiting for the IPIs to arrive), to - * ensure that the outgoing CPU doesn't go offline with work - * still pending. - */ - flush_smp_call_function_queue(false); - break; -#endif - }; + return 0; +} + +int smpcfd_dead_cpu(unsigned int cpu) +{ + struct call_function_data *cfd = &per_cpu(cfd_data, cpu); - return NOTIFY_OK; + free_cpumask_var(cfd->cpumask); + free_percpu(cfd->csd); + return 0; } -static struct notifier_block hotplug_cfd_notifier = { - .notifier_call = hotplug_cfd, -}; +int smpcfd_dying_cpu(unsigned int cpu) +{ + /* + * The IPIs for the smp-call-function callbacks queued by other + * CPUs might arrive late, either due to hardware latencies or + * because this CPU disabled interrupts (inside stop-machine) + * before the IPIs were sent. So flush out any pending callbacks + * explicitly (without waiting for the IPIs to arrive), to + * ensure that the outgoing CPU doesn't go offline with work + * still pending. + */ + flush_smp_call_function_queue(false); + return 0; +} void __init call_function_init(void) { - void *cpu = (void *)(long)smp_processor_id(); int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); - hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); - register_cpu_notifier(&hotplug_cfd_notifier); + smpcfd_prepare_cpu(smp_processor_id()); } /* -- cgit v1.3-8-gc7d7 From 4df8374254ea9294dfe4b8c447a1b7eddc543dbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jul 2016 17:17:03 +0000 Subject: rcu: Convert rcutree to hotplug state machine Straight forward conversion to the state machine. Though the question arises whether this needs really all these state transitions to work. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20160713153337.982013161@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/cpuhotplug.h | 3 ++ include/linux/rcutiny.h | 7 +++ include/linux/rcutree.h | 7 +++ kernel/cpu.c | 14 ++++++ kernel/rcu/tree.c | 105 ++++++++++++++++++++++----------------------- 5 files changed, 83 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 544b5563720e..201a2e23bc49 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -20,11 +20,13 @@ enum cpuhp_state { CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_TIMERS_DEAD, + CPUHP_RCUTREE_PREP, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, CPUHP_AP_SCHED_STARTING, + CPUHP_AP_RCUTREE_DYING, CPUHP_AP_IRQ_GIC_STARTING, CPUHP_AP_IRQ_GICV3_STARTING, CPUHP_AP_IRQ_HIP04_STARTING, @@ -80,6 +82,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, + CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 93aea75029fb..ac81e4063b40 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -243,4 +243,11 @@ static inline void rcu_all_qs(void) barrier(); /* Avoid RCU read-side critical sections leaking across. */ } +/* RCUtree hotplug events */ +#define rcutree_prepare_cpu NULL +#define rcutree_online_cpu NULL +#define rcutree_offline_cpu NULL +#define rcutree_dead_cpu NULL +#define rcutree_dying_cpu NULL + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5043cb823fb2..63a4e4cf40a5 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -111,4 +111,11 @@ bool rcu_is_watching(void); void rcu_all_qs(void); +/* RCUtree hotplug events */ +int rcutree_prepare_cpu(unsigned int cpu); +int rcutree_online_cpu(unsigned int cpu); +int rcutree_offline_cpu(unsigned int cpu); +int rcutree_dead_cpu(unsigned int cpu); +int rcutree_dying_cpu(unsigned int cpu); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 008e2fd40cb1..f24f45915b54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1205,6 +1205,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = NULL, .teardown = timers_dead_cpu, }, + [CPUHP_RCUTREE_PREP] = { + .name = "RCU-tree prepare", + .startup = rcutree_prepare_cpu, + .teardown = rcutree_dead_cpu, + }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers * are converted to states. @@ -1263,6 +1268,10 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = sched_cpu_starting, .teardown = sched_cpu_dying, }, + [CPUHP_AP_RCUTREE_DYING] = { + .startup = NULL, + .teardown = rcutree_dying_cpu, + }, /* * Low level startup/teardown notifiers. Run with interrupts * disabled. Will be removed once the notifiers are converted to @@ -1296,6 +1305,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = workqueue_online_cpu, .teardown = workqueue_offline_cpu, }, + [CPUHP_AP_RCUTREE_ONLINE] = { + .name = "RCU-tree online", + .startup = rcutree_online_cpu, + .teardown = rcutree_offline_cpu, + }, /* * Online/down_prepare notifiers. Will be removed once the notifiers diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f433959e9322..5d80925e7fc8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1073,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching); * offline to continue to use RCU for one jiffy after marking itself * offline in the cpu_online_mask. This leniency is necessary given the * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the CPU_DYING - * notifiers. + * the fact that a CPU enters the scheduler after completing the teardown + * of the CPU. * - * This is also why RCU internally marks CPUs online during the - * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. + * This is also why RCU internally marks CPUs online during in the + * preparation phase and offline after the CPU has been taken down. * * Disable checking if in an NMI handler because we cannot safely report * errors from NMI handlers anyway. @@ -3806,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void rcu_prepare_cpu(int cpu) +int rcutree_prepare_cpu(unsigned int cpu) { struct rcu_state *rsp; for_each_rcu_flavor(rsp) rcu_init_percpu_data(cpu, rsp); + + rcu_prepare_kthreads(cpu); + rcu_spawn_all_nocb_kthreads(cpu); + + return 0; +} + +static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); + + rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); +} + +int rcutree_online_cpu(unsigned int cpu) +{ + sync_sched_exp_online_cleanup(cpu); + rcutree_affinity_setting(cpu, -1); + return 0; +} + +int rcutree_offline_cpu(unsigned int cpu) +{ + rcutree_affinity_setting(cpu, cpu); + return 0; +} + + +int rcutree_dying_cpu(unsigned int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); + return 0; +} + +int rcutree_dead_cpu(unsigned int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rcu_cleanup_dead_cpu(cpu, rsp); + do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); + } + return 0; } #ifdef CONFIG_HOTPLUG_CPU @@ -3851,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu) } #endif -/* - * Handle CPU online/offline notification events. - */ -int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - struct rcu_state *rsp; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_prepare_cpu(cpu); - rcu_prepare_kthreads(cpu); - rcu_spawn_all_nocb_kthreads(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - sync_sched_exp_online_cleanup(cpu); - rcu_boost_kthread_setaffinity(rnp, -1); - break; - case CPU_DOWN_PREPARE: - rcu_boost_kthread_setaffinity(rnp, cpu); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - for_each_rcu_flavor(rsp) - rcu_cleanup_dying_cpu(rsp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - for_each_rcu_flavor(rsp) { - rcu_cleanup_dead_cpu(cpu, rsp); - do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); - } - break; - default: - break; - } - return NOTIFY_OK; -} - static int rcu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -4208,10 +4208,9 @@ void __init rcu_init(void) * this is called early in boot, before either interrupts * or the scheduler are operational. */ - cpu_notifier(rcu_cpu_notify, 0); pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + rcutree_prepare_cpu(cpu); } #include "tree_exp.h" -- cgit v1.3-8-gc7d7 From 7bd8830875bfa380c68f390efbad893293749324 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Jul 2016 06:35:24 -0500 Subject: cgroupns: Fix the locking in copy_cgroup_ns If "clone(CLONE_NEWCGROUP...)" is called it results in a nice lockdep valid splat. In __cgroup_proc_write the lock ordering is: cgroup_mutex -- through cgroup_kn_lock_live cgroup_threadgroup_rwsem In copy_process the guts of clone the lock ordering is: cgroup_threadgroup_rwsem -- through threadgroup_change_begin cgroup_mutex -- through copy_namespaces -- copy_cgroup_ns lockdep reports some a different call chains for the first ordering of cgroup_mutex and cgroup_threadgroup_rwsem but it is harder to trace. This is most definitely deadlock potential under the right circumstances. Fix this by by skipping the cgroup_mutex and making the locking in copy_cgroup_ns mirror the locking in cgroup_post_fork which also runs during fork under the cgroup_threadgroup_rwsem. Cc: stable@vger.kernel.org Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Signed-off-by: "Eric W. Biederman" Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75c0ff00aca6..5f01e00cffc4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6309,14 +6309,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - mutex_lock(&cgroup_mutex); + /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); - cset = task_css_set(current); get_css_set(cset); - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { -- cgit v1.3-8-gc7d7 From eedd0f4cbf5f3b81e82649832091e1d9d53f0709 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Jul 2016 06:35:51 -0500 Subject: cgroupns: Close race between cgroup_post_fork and copy_cgroup_ns In most code paths involving cgroup migration cgroup_threadgroup_rwsem is taken. There are two exceptions: - remove_tasks_in_empty_cpuset calls cgroup_transfer_tasks - vhost_attach_cgroups_work calls cgroup_attach_task_all With cgroup_threadgroup_rwsem held it is guaranteed that cgroup_post_fork and copy_cgroup_ns will reference the same css_set from the process calling fork. Without such an interlock there process after fork could reference one css_set from it's new cgroup namespace and another css_set from task->cgroups, which semantically is nonsensical. Cc: stable@vger.kernel.org Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Signed-off-by: "Eric W. Biederman" Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5f01e00cffc4..e75efa819911 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2962,6 +2962,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -2976,6 +2977,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return retval; @@ -4343,6 +4345,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) @@ -4371,6 +4375,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return ret; } -- cgit v1.3-8-gc7d7 From 726a4994b05ff5b6f83d64b5b43c3251217366ce Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Jul 2016 06:36:44 -0500 Subject: cgroupns: Only allow creation of hierarchies in the initial cgroup namespace Unprivileged users can't use hierarchies if they create them as they do not have privilieges to the root directory. Which means the only thing a hiearchy created by an unprivileged user is good for is expanding the number of cgroup links in every css_set, which is a DOS attack. We could allow hierarchies to be created in namespaces in the initial user namespace. Unfortunately there is only a single namespace for the names of heirarchies, so that is likely to create more confusion than not. So do the simple thing and restrict hiearchy creation to the initial cgroup namespace. Cc: stable@vger.kernel.org Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Signed-off-by: "Eric W. Biederman" Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e75efa819911..e0be49fc382f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2215,12 +2215,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } - /* - * We know this subsystem has not yet been bound. Users in a non-init - * user namespace may only mount hierarchies with no bound subsystems, - * i.e. 'none,name=user1' - */ - if (!opts.none && !capable(CAP_SYS_ADMIN)) { + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { ret = -EPERM; goto out_unlock; } -- cgit v1.3-8-gc7d7 From 406f992e4a372dafbe3c2cff7efbb2002a5c8ebd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Jul 2016 03:55:23 +0200 Subject: x86 / hibernate: Use hlt_play_dead() when resuming from hibernation On Intel hardware, native_play_dead() uses mwait_play_dead() by default and only falls back to the other methods if that fails. That also happens during resume from hibernation, when the restore (boot) kernel runs disable_nonboot_cpus() to take all of the CPUs except for the boot one offline. However, that is problematic, because the address passed to __monitor() in mwait_play_dead() is likely to be written to in the last phase of hibernate image restoration and that causes the "dead" CPU to start executing instructions again. Unfortunately, the page containing the address in that CPU's instruction pointer may not be valid any more at that point. First, that page may have been overwritten with image kernel memory contents already, so the instructions the CPU attempts to execute may simply be invalid. Second, the page tables previously used by that CPU may have been overwritten by image kernel memory contents, so the address in its instruction pointer is impossible to resolve then. A report from Varun Koyyalagunta and investigation carried out by Chen Yu show that the latter sometimes happens in practice. To prevent it from happening, temporarily change the smp_ops.play_dead pointer during resume from hibernation so that it points to a special "play dead" routine which uses hlt_play_dead() and avoids the inadvertent "revivals" of "dead" CPUs this way. A slightly unpleasant consequence of this change is that if the system is hibernated with one or more CPUs offline, it will generally draw more power after resume than it did before hibernation, because the physical state entered by CPUs via hlt_play_dead() is higher-power than the mwait_play_dead() one in the majority of cases. It is possible to work around this, but it is unclear how much of a problem that's going to be in practice, so the workaround will be implemented later if it turns out to be necessary. Link: https://bugzilla.kernel.org/show_bug.cgi?id=106371 Reported-by: Varun Koyyalagunta Original-by: Chen Yu Tested-by: Chen Yu Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/smpboot.c | 2 +- arch/x86/power/cpu.c | 30 ++++++++++++++++++++++++++++++ kernel/power/hibernate.c | 7 ++++++- kernel/power/power.h | 2 ++ 5 files changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 66b057306f40..7427ca895a27 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -135,6 +135,7 @@ int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); int common_cpu_die(unsigned int cpu); void native_cpu_die(unsigned int cpu); +void hlt_play_dead(void); void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fafe8b923cac..8264dfad9cf8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1622,7 +1622,7 @@ static inline void mwait_play_dead(void) } } -static inline void hlt_play_dead(void) +void hlt_play_dead(void) { if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index d5f64996394a..b12c26e2e309 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -266,6 +267,35 @@ void notrace restore_processor_state(void) EXPORT_SYMBOL(restore_processor_state); #endif +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU) +static void resume_play_dead(void) +{ + play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); + hlt_play_dead(); +} + +int hibernate_resume_nonboot_cpu_disable(void) +{ + void (*play_dead)(void) = smp_ops.play_dead; + int ret; + + /* + * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop + * during hibernate image restoration, because it is likely that the + * monitored address will be actually written to at that time and then + * the "dead" CPU will attempt to execute instructions again, but the + * address in its instruction pointer may not be possible to resolve + * any more at that point (the page tables used by it previously may + * have been overwritten by hibernate image data). + */ + smp_ops.play_dead = resume_play_dead; + ret = disable_nonboot_cpus(); + smp_ops.play_dead = play_dead; + return ret; +} +#endif + /* * When bsp_check() is called in hibernate and suspend, cpu hotplug * is disabled already. So it's unnessary to handle race condition between diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 51441d87f0b6..5f3523e18e46 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -409,6 +409,11 @@ int hibernation_snapshot(int platform_mode) goto Close; } +int __weak hibernate_resume_nonboot_cpu_disable(void) +{ + return disable_nonboot_cpus(); +} + /** * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -433,7 +438,7 @@ static int resume_target_kernel(bool platform_mode) if (error) goto Cleanup; - error = disable_nonboot_cpus(); + error = hibernate_resume_nonboot_cpu_disable(); if (error) goto Enable_cpus; diff --git a/kernel/power/power.h b/kernel/power/power.h index 064963e89194..242d8b827dd5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) } #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ +extern int hibernate_resume_nonboot_cpu_disable(void); + /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] -- cgit v1.3-8-gc7d7 From 7e3f977edd0bd9ea6104156feba95bb5ae9bdd38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:03 +0200 Subject: perf, events: add non-linear data support for raw records This patch adds support for non-linear data on raw records. It extends raw records to have one or multiple fragments that will be written linearly into the ring slot, where each fragment can optionally have a custom callback handler to walk and extract complex, possibly non-linear data. If a callback handler is provided for a fragment, then the new __output_custom() will be used instead of __output_copy() for the perf_output_sample() part. perf_prepare_sample() does all the size calculation only once, so perf_output_sample() doesn't need to redo the same work anymore, meaning real_size and padding will be cached in the raw record. The raw record becomes 32 bytes in size without holes; to not increase it further and to avoid doing unnecessary recalculations in fast-path, we can reuse next pointer of the last fragment, idea here is borrowed from ZERO_OR_NULL_PTR(), which should keep the perf_output_sample() path for PERF_SAMPLE_RAW minimal. This facility is needed for BPF's event output helper as a first user that will, in a follow-up, add an additional perf_raw_frag to its perf_raw_record in order to be able to more efficiently dump skb context after a linear head meta data related to it. skbs can be non-linear and thus need a custom output function to dump buffers. Currently, the skb data needs to be copied twice; with the help of __output_custom() this work only needs to be done once. Future users could be things like XDP/BPF programs that work on different context though and would thus also have a different callback function. The few users of raw records are adapted to initialize their frag data from the raw record itself, no change in behavior for them. The code is based upon a PoC diff provided by Peter Zijlstra [1]. [1] http://thread.gmane.org/gmane.linux.network/421294 Suggested-by: Peter Zijlstra Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/s390/kernel/perf_cpum_sf.c | 9 ++++-- arch/x86/events/amd/ibs.c | 8 +++-- include/linux/perf_event.h | 20 ++++++++++++- kernel/events/core.c | 66 ++++++++++++++++++++++++++++------------- kernel/events/internal.h | 16 +++++++--- kernel/trace/bpf_trace.c | 6 ++-- 6 files changed, 93 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index a8e832166417..92619cce57ed 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -979,12 +979,15 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) struct pt_regs regs; struct perf_sf_sde_regs *sde_regs; struct perf_sample_data data; - struct perf_raw_record raw; + struct perf_raw_record raw = { + .frag = { + .size = sfr->size, + .data = sfr, + }, + }; /* Setup perf sample */ perf_sample_data_init(&data, 0, event->hw.last_period); - raw.size = sfr->size; - raw.data = sfr; data.raw = &raw; /* Setup pt_regs to look like an CPU-measurement external interrupt diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index feb90f6730e8..72dea2f40fc4 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -655,8 +655,12 @@ fail: } if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.size = sizeof(u32) + ibs_data.size; - raw.data = ibs_data.data; + raw = (struct perf_raw_record){ + .frag = { + .size = sizeof(u32) + ibs_data.size, + .data = ibs_data.data, + }, + }; data.raw = &raw; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1a827cecd62f..e79e6c6fed89 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -69,9 +69,22 @@ struct perf_callchain_entry_ctx { bool contexts_maxed; }; +typedef unsigned long (*perf_copy_f)(void *dst, const void *src, + unsigned long len); + +struct perf_raw_frag { + union { + struct perf_raw_frag *next; + unsigned long pad; + }; + perf_copy_f copy; + void *data; + u32 size; +} __packed; + struct perf_raw_record { + struct perf_raw_frag frag; u32 size; - void *data; }; /* @@ -1283,6 +1296,11 @@ extern void perf_restore_debug_store(void); static inline void perf_restore_debug_store(void) { } #endif +static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) +{ + return frag->pad < sizeof(u64); +} + #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 9c51ec3f0f44..b1891b6b5c1f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5553,16 +5553,26 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), - sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + __output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, + frag->size); + } + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + if (frag->pad) + __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; @@ -5687,14 +5697,28 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); + struct perf_raw_record *raw = data->raw; + int size; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + u32 sum = 0; + + do { + sum += frag->size; + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + + size = round_up(sum + sizeof(u32), sizeof(u64)); + raw->size = size - sizeof(u32); + frag->pad = raw->size - sum; + } else { + size = sizeof(u64); + } - header->size += round_up(size, sizeof(u64)); + header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -7331,7 +7355,7 @@ static struct pmu perf_swevent = { static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { - void *record = data->raw->data; + void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -7387,8 +7411,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_event *event; struct perf_raw_record raw = { - .size = entry_size, - .data = record, + .frag = { + .size = entry_size, + .data = record, + }, }; perf_sample_data_init(&data, 0, 0); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..2417eb5512cd 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -123,10 +123,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) return rb->aux_nr_pages << PAGE_SHIFT; } -#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned long \ -func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned long len) \ +#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \ { \ unsigned long size, written; \ \ @@ -152,6 +149,17 @@ func_name(struct perf_output_handle *handle, \ return len; \ } +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned long \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned long len) \ +__DEFINE_OUTPUT_COPY_BODY(memcpy_func) + +static inline unsigned long +__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, + const void *buf, unsigned long len) +__DEFINE_OUTPUT_COPY_BODY(copy_func) + static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 094c716154ed..35ab1b2b041b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -245,8 +245,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) struct bpf_event_entry *ee; struct perf_event *event; struct perf_raw_record raw = { - .size = size, - .data = data, + .frag = { + .size = size, + .data = data, + }, }; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) -- cgit v1.3-8-gc7d7 From 8e7a3920ac277dd4e690c0e70c9750176e3acb83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:04 +0200 Subject: bpf, perf: split bpf_perf_event_output Split the bpf_perf_event_output() helper as a preparation into two parts. The new bpf_perf_event_output() will prepare the raw record itself and test for unknown flags from BPF trace context, where the __bpf_perf_event_output() does the core work. The latter will be reused later on from bpf_event_output() directly. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 35ab1b2b041b..c35883a9bc11 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -233,26 +233,17 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +static __always_inline u64 +__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + u64 flags, struct perf_raw_record *raw) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct bpf_event_entry *ee; struct perf_event *event; - struct perf_raw_record raw = { - .frag = { - .size = size, - .data = data, - }, - }; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) @@ -271,11 +262,29 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = &raw; + sample_data.raw = raw; perf_event_output(event, &sample_data, regs); return 0; } +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + void *data = (void *)(long) r4; + struct perf_raw_record raw = { + .frag = { + .size = size, + .data = data, + }, + }; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return __bpf_perf_event_output(regs, map, flags, &raw); +} + static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, -- cgit v1.3-8-gc7d7 From 555c8a8623a3a87b3c990ba30b7fd2e5914e41d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:05 +0200 Subject: bpf: avoid stack copy and use skb ctx for event output This work addresses a couple of issues bpf_skb_event_output() helper currently has: i) We need two copies instead of just a single one for the skb data when it should be part of a sample. The data can be non-linear and thus needs to be extracted via bpf_skb_load_bytes() helper first, and then copied once again into the ring buffer slot. ii) Since bpf_skb_load_bytes() currently needs to be used first, the helper needs to see a constant size on the passed stack buffer to make sure BPF verifier can do sanity checks on it during verification time. Thus, just passing skb->len (or any other non-constant value) wouldn't work, but changing bpf_skb_load_bytes() is also not the proper solution, since the two copies are generally still needed. iii) bpf_skb_load_bytes() is just for rather small buffers like headers, since they need to sit on the limited BPF stack anyway. Instead of working around in bpf_skb_load_bytes(), this work improves the bpf_skb_event_output() helper to address all 3 at once. We can make use of the passed in skb context that we have in the helper anyway, and use some of the reserved flag bits as a length argument. The helper will use the new __output_custom() facility from perf side with bpf_skb_copy() as callback helper to walk and extract the data. It will pass the data for setup to bpf_event_output(), which generates and pushes the raw record with an additional frag part. The linear data used in the first frag of the record serves as programmatically defined meta data passed along with the appended sample. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 ++++++- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/core.c | 6 ++++-- kernel/trace/bpf_trace.c | 33 +++++++++++++++------------------ net/core/filter.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 69 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b3336b4f5d04..c13e92b00bf5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,7 +209,12 @@ u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); -const struct bpf_func_proto *bpf_get_event_output_proto(void); + +typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, + unsigned long len); + +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 262a7e883b19..c4d922439d20 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -401,6 +401,8 @@ enum bpf_func_id { /* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +/* BPF_FUNC_perf_event_output for sk_buff input context. */ +#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d638062f66d6..03fd23d4d587 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1054,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) return NULL; } -const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +u64 __weak +bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - return NULL; + return -ENOTSUPP; } /* Always built-in helper functions. */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c35883a9bc11..ebfbb7dd7033 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -298,29 +298,26 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + struct perf_raw_frag frag = { + .copy = ctx_copy, + .size = ctx_size, + .data = ctx, + }; + struct perf_raw_record raw = { + .frag = { + .next = ctx_size ? &frag : NULL, + .size = meta_size, + .data = meta, + }, + }; perf_fetch_caller_regs(regs); - return bpf_perf_event_output((long)regs, r2, flags, r4, size); -} - -static const struct bpf_func_proto bpf_event_output_proto = { - .func = bpf_event_output, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, -}; - -const struct bpf_func_proto *bpf_get_event_output_proto(void) -{ - return &bpf_event_output_proto; + return __bpf_perf_event_output(regs, map, flags, &raw); } static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) diff --git a/net/core/filter.c b/net/core/filter.c index 10c4a2f9e8bb..22e3992c8b48 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2025,6 +2025,47 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, + unsigned long len) +{ + void *ptr = skb_header_pointer(skb, 0, len, dst_buff); + + if (unlikely(!ptr)) + return len; + if (ptr != dst_buff) + memcpy(dst_buff, ptr, len); + + return 0; +} + +static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, + u64 meta_size) +{ + struct sk_buff *skb = (struct sk_buff *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + void *meta = (void *)(long) r4; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(skb_size > skb->len)) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, + bpf_skb_copy); +} + +static const struct bpf_func_proto bpf_skb_event_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; @@ -2357,7 +2398,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: - return bpf_get_event_output_proto(); + return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; #ifdef CONFIG_SOCK_CGROUP_DATA -- cgit v1.3-8-gc7d7 From 858d68f10238fdd1ebdd0096f912f063e97c6766 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 16 Jul 2016 01:15:55 +0200 Subject: bpf: bpf_event_entry_gen's alloc needs to be in atomic context Should have been obvious, only called from bpf() syscall via map_update_elem() that calls bpf_fd_array_map_update_elem() under RCU read lock and thus this must also be in GFP_ATOMIC, of course. Fixes: 3b1efb196eee ("bpf, maps: flush own entries on perf map release") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index db1a743e3db2..633a650d7aeb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -430,7 +430,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, { struct bpf_event_entry *ee; - ee = kzalloc(sizeof(*ee), GFP_KERNEL); + ee = kzalloc(sizeof(*ee), GFP_ATOMIC); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; -- cgit v1.3-8-gc7d7 From 775be506266a860f141f6b848c92c316c602a94f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 17 Jun 2016 16:56:14 +0100 Subject: clockevents: Make clockevents_subsys static The clockevents_subsys struct is used for sysfs support and is not declared or used outside the file it is defined in. Fix the following warning by making it static: kernel/time/clockevents.c:648:17: warning: symbol 'clockevents_subsys' was not declared. Should it be static? Signed-off-by: Ben Dooks Cc: linux-kernel@lists.codethink.co.uk Link: http://lkml.kernel.org/r/1466178974-7105-1-git-send-email-ben.dooks@codethink.co.uk Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a9b76a40319e..2c5bc77c0bb0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu) #endif #ifdef CONFIG_SYSFS -struct bus_type clockevents_subsys = { +static struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; -- cgit v1.3-8-gc7d7 From eb0dc47ab6810c432e8193beccd9905ba0db8b22 Mon Sep 17 00:00:00 2001 From: Vincent Stehle Date: Mon, 18 Jul 2016 22:56:26 +0200 Subject: genirq: Fix missing irq allocation affinity hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new affinity hint argument of __irq_domain_alloc_irqs() is missing in irq_reserve_ipi(). Add it. This fixes the following compilation error: kernel/irq/ipi.c: In function ‘irq_reserve_ipi’: kernel/irq/ipi.c:85:9: error: too few arguments to function ‘__irq_domain_alloc_irqs’ virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, ^ Fixes: 06ee6d571f0e ("genirq: Add affinity hint to irq allocation") Signed-off-by: Vincent Stehlé Cc: linux-pci@vger.kernel.org Cc: Christoph Hellwig Signed-off-by: Thomas Gleixner --- kernel/irq/ipi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 4fd23510d5f2..1a9abc1c8ea0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -83,7 +83,7 @@ int irq_reserve_ipi(struct irq_domain *domain, } virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, - (void *) dest, true); + (void *) dest, true, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); -- cgit v1.3-8-gc7d7 From 1f3b0f8243cb934307f59bd4d8e43b868e61d4d9 Mon Sep 17 00:00:00 2001 From: Gaurav Jindal Date: Thu, 14 Jul 2016 12:04:20 +0000 Subject: tick/nohz: Optimize nohz idle enter tick_nohz_start_idle is called before checking whether the idle tick can be stopped. If the tick cannot be stopped, calling tick_nohz_start_idle() is pointless and just wasting CPU cycles. Only invoke tick_nohz_start_idle() when can_stop_idle_tick() returns true. A short one minute observation of the effect on ARM64 shows a reduction of calls by 1.5% thus optimizing the idle entry sequence. [tglx: Massaged changelog ] Co-developed-by: Sanjeev Yadav Signed-off-by: Gaurav Jindal Link: http://lkml.kernel.org/r/20160714120416.GB21099@gaurav.jindal@spreadtrum.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..204fdc86863d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -908,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); - if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; + now = tick_nohz_start_idle(ts); ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); -- cgit v1.3-8-gc7d7 From 55094f57535831883b60776de5eb78c6bfb3c16d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 19 Jul 2016 12:02:39 +0000 Subject: cgroup: remove duplicated include from cgroup.c Remove duplicated include. Signed-off-by: Wei Yongjun Signed-off-by: Tejun Heo --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dd26e1bb7222..861995c7fc3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -61,7 +61,6 @@ #include #include #include -#include #include /* -- cgit v1.3-8-gc7d7 From 183fc1537ec39be242dc8b619f71fc11b393d295 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 18 Jul 2016 15:50:58 -0700 Subject: kernel/trace/bpf_trace.c: work around gcc-4.4.4 anon union initialization bug kernel/trace/bpf_trace.c: In function 'bpf_event_output': kernel/trace/bpf_trace.c:312: error: unknown field 'next' specified in initializer kernel/trace/bpf_trace.c:312: warning: missing braces around initializer kernel/trace/bpf_trace.c:312: warning: (near initialization for 'raw.frag.') Fixes: 555c8a8623a3a87 ("bpf: avoid stack copy and use skb ctx for event output") Acked-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: David S. Miller Signed-off-by: Andrew Morton Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebfbb7dd7033..a12bbd32c0a6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -309,7 +309,9 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_raw_record raw = { .frag = { - .next = ctx_size ? &frag : NULL, + { + .next = ctx_size ? &frag : NULL, + }, .size = meta_size, .data = meta, }, -- cgit v1.3-8-gc7d7 From 59d3656d5bf504f771fc44fdbc7a9a8590795f22 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:46 -0700 Subject: bpf: add bpf_prog_add api for bulk prog refcnt A subsystem may need to store many copies of a bpf program, each deserving its own reference. Rather than requiring the caller to loop one by one (with possible mid-loop failure), add a bulk bpf_prog_add api. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c13e92b00bf5..75a5ae6bee07 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -224,6 +224,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i); struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 96d938a22050..228f962447a5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -670,14 +670,20 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&prog->aux->refcnt); + if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_sub(i, &prog->aux->refcnt); return ERR_PTR(-EBUSY); } return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_add); + +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return bpf_prog_add(prog, 1); +} static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { -- cgit v1.3-8-gc7d7 From 6a773a15a1e8874e5eccd2f29190c31085912c95 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:47 -0700 Subject: bpf: add XDP prog type for early driver filter Add a new bpf prog type that is intended to run in early stages of the packet rx path. Only minimal packet metadata will be available, hence a new context type, struct xdp_md, is exposed to userspace. So far only expose the packet start and end pointers, and only in read mode. An XDP program must return one of the well known enum values, all other return codes are reserved for future use. Unfortunately, this restriction is hard to enforce at verification time, so take the approach of warning at runtime when such programs are encountered. Out of bounds return codes should alias to XDP_ABORTED. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 18 +++++++++++ include/uapi/linux/bpf.h | 20 ++++++++++++ kernel/bpf/verifier.c | 1 + net/core/filter.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6fc31ef1da2d..15d816a8b755 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -368,6 +368,11 @@ struct bpf_skb_data_end { void *data_end; }; +struct xdp_buff { + void *data; + void *data_end; +}; + /* compute the linear packet data range [data, data_end) which * will be accessed by cls_bpf and act_bpf programs */ @@ -429,6 +434,18 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, return BPF_PROG_RUN(prog, skb); } +static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, + struct xdp_buff *xdp) +{ + u32 ret; + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, (void *)xdp); + rcu_read_unlock(); + + return ret; +} + static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), @@ -509,6 +526,7 @@ bool bpf_helper_changes_skb_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +void bpf_warn_invalid_xdp_action(u32 act); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c4d922439d20..a51786566c2f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -94,6 +94,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, }; #define BPF_PSEUDO_MAP_FD 1 @@ -439,4 +440,23 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will result + * in packet drop. + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e206c2181412..a8d67d097b0d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -713,6 +713,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, switch (env->prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: break; default: verbose("verifier is misconfigured\n"); diff --git a/net/core/filter.c b/net/core/filter.c index 22e3992c8b48..6c627bc4be6e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2410,6 +2410,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +xdp_func_proto(enum bpf_func_id func_id) +{ + return sk_filter_func_proto(func_id); +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2477,6 +2483,44 @@ static bool tc_cls_act_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool __is_valid_xdp_access(int off, int size, + enum bpf_access_type type) +{ + if (off < 0 || off >= sizeof(struct xdp_md)) + return false; + if (off % size != 0) + return false; + if (size != 4) + return false; + + return true; +} + +static bool xdp_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct xdp_md, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct xdp_md, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_xdp_access(off, size, type); +} + +void bpf_warn_invalid_xdp_action(u32 act) +{ + WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); + static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2628,6 +2672,29 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct xdp_md, data): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + dst_reg, src_reg, + offsetof(struct xdp_buff, data)); + break; + case offsetof(struct xdp_md, data_end): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + dst_reg, src_reg, + offsetof(struct xdp_buff, data_end)); + break; + } + + return insn - insn_buf; +} + static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -2640,6 +2707,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .convert_ctx_access = bpf_net_convert_ctx_access, }; +static const struct bpf_verifier_ops xdp_ops = { + .get_func_proto = xdp_func_proto, + .is_valid_access = xdp_is_valid_access, + .convert_ctx_access = xdp_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -2655,11 +2728,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_ACT, }; +static struct bpf_prog_type_list xdp_type __read_mostly = { + .ops = &xdp_ops, + .type = BPF_PROG_TYPE_XDP, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); + bpf_register_prog_type(&xdp_type); return 0; } -- cgit v1.3-8-gc7d7 From 4acf6c0b84c91243c705303cd9ff16421914150d Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:56 -0700 Subject: bpf: enable direct packet data write for xdp progs For forwarding to be effective, XDP programs should be allowed to rewrite packet data. This requires that the drivers supporting XDP must all map the packet memory as TODEVICE or BIDIRECTIONAL before invoking the program. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a8d67d097b0d..f72f23b8fdab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff +static bool may_write_pkt_data(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_XDP: + return true; + default: + return false; + } +} + static int check_packet_access(struct verifier_env *env, u32 regno, int off, int size) { @@ -806,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE) { + if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { verbose("cannot write into packet\n"); return -EACCES; } + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into packet\n", value_regno); + return -EACCES; + } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); -- cgit v1.3-8-gc7d7 From 43761473c254b45883a64441dd0bc85a42f3645c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 19 Jul 2016 17:42:57 -0400 Subject: audit: fix a double fetch in audit_log_single_execve_arg() There is a double fetch problem in audit_log_single_execve_arg() where we first check the execve(2) argumnets for any "bad" characters which would require hex encoding and then re-fetch the arguments for logging in the audit record[1]. Of course this leaves a window of opportunity for an unsavory application to munge with the data. This patch reworks things by only fetching the argument data once[2] into a buffer where it is scanned and logged into the audit records(s). In addition to fixing the double fetch, this patch improves on the original code in a few other ways: better handling of large arguments which require encoding, stricter record length checking, and some performance improvements (completely unverified, but we got rid of some strlen() calls, that's got to be a good thing). As part of the development of this patch, I've also created a basic regression test for the audit-testsuite, the test can be tracked on GitHub at the following link: * https://github.com/linux-audit/audit-testsuite/issues/25 [1] If you pay careful attention, there is actually a triple fetch problem due to a strnlen_user() call at the top of the function. [2] This is a tiny white lie, we do make a call to strnlen_user() prior to fetching the argument data. I don't like it, but due to the way the audit record is structured we really have no choice unless we copy the entire argument at once (which would require a rather wasteful allocation). The good news is that with this patch the kernel no longer relies on this strnlen_user() value for anything beyond recording it in the log, we also update it with a trustworthy value whenever possible. Reported-by: Pengfei Wang Cc: Signed-off-by: Paul Moore --- kernel/auditsc.c | 332 +++++++++++++++++++++++++++---------------------------- 1 file changed, 164 insertions(+), 168 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index aa3feec4df14..c65af21a12d6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include "audit.h" @@ -82,7 +83,8 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ @@ -992,184 +994,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) +static void audit_log_execve_info(struct audit_context *context, + struct audit_buffer **ab) { - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; + char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; + + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... at this point in time 96 is plenty */ + char abuf[96]; + + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; + + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { + audit_panic("out of memory for argv string"); + return; } + buf = buf_head; - /* walk the whole argument looking for non-ascii chars */ + audit_log_format(*ab, "argc=%d", context->execve.argc); + + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. - */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? */ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; + /* length of the buffer in the audit record? */ + len_abuf = (encode ? len_buf * 2 : len_buf + 2); } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} -static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab) -{ - int i, len; - size_t len_sent = 0; - const char __user *p; - char *buf; + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } - p = (const char __user *)current->mm->arg_start; + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + } else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. - */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { - audit_panic("out of memory for argv string"); - return; - } + /* NOTE: the caller handles the final audit_log_end() call */ - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic) -- cgit v1.3-8-gc7d7 From 5cbea46984d67f614c74c4401b54b9d681861e80 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 13 Jul 2016 13:25:26 -0700 Subject: cpufreq: schedutil: map raw required frequency to driver frequency The slow-path frequency transition path is relatively expensive as it requires waking up a thread to do work. Should support be added for remote CPU cpufreq updates that is also expensive since it requires an IPI. These activities should be avoided if they are not necessary. To that end, calculate the actual driver-supported frequency required by the new utilization value in schedutil by using the recently added cpufreq_driver_resolve_freq API. If it is the same as the previously requested driver frequency then there is no need to continue with the update assuming the cpu frequency limits have not changed. This will have additional benefits should the semantics of the rate limit be changed to apply solely to frequency transitions rather than to frequency calculations in schedutil. The last raw required frequency is cached. This allows the driver frequency lookup to be skipped in the event that the new raw required frequency matches the last one, assuming a frequency update has not been forced due to limits changing (indicated by a next_freq value of UINT_MAX, see sugov_should_update_freq). Signed-off-by: Steve Muckle Reviewed-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 758efd7f3abe..a84641b222c1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,6 +47,8 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + unsigned int cached_raw_freq; + /* The fields below are only needed when sharing a policy. */ unsigned long util; unsigned long max; @@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, /** * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @policy: cpufreq policy object to compute the new frequency for. + * @sg_cpu: schedutil cpu object to compute the new frequency for. * @util: Current CPU utilization. * @max: CPU capacity. * @@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, * next_freq = C * curr_freq * util_raw / max * * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations. */ -static unsigned int get_next_freq(struct cpufreq_policy *policy, - unsigned long util, unsigned long max) +static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, + unsigned long max) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - return (freq + (freq >> 2)) * util / max; + freq = (freq + (freq >> 2)) * util / max; + + if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + return sg_policy->next_freq; + sg_cpu->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); } static void sugov_update_single(struct update_util_data *hook, u64 time, @@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, return; next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : - get_next_freq(policy, util, max); + get_next_freq(sg_cpu, util, max); sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, unsigned long util, unsigned long max) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; @@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, } } - return get_next_freq(policy, util, max); + return get_next_freq(sg_cpu, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, @@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_policy, util, max); + next_f = sugov_next_freq_shared(sg_cpu, util, max); sugov_update_commit(sg_policy, time, next_f); } @@ -433,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->util = ULONG_MAX; sg_cpu->max = 0; sg_cpu->last_update = 0; + sg_cpu->cached_raw_freq = 0; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, sugov_update_shared); } else { -- cgit v1.3-8-gc7d7 From fe12c00d21bb4985fa8da282942250be21e7dd59 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 22 Jul 2016 10:30:47 +0800 Subject: PM / hibernate: Introduce test_resume mode for hibernation test_resume mode is to verify if the snapshot data written to swap device can be successfully restored to memory. It is useful to ease the debugging process on hibernation, since this mode can not only bypass the BIOSes/bootloader, but also the system re-initialization. To avoid the risk to break the filesystm on persistent storage, this patch resumes the image with tasks frozen. For example: echo test_resume > /sys/power/disk echo disk > /sys/power/state [ 187.306470] PM: Image saving progress: 70% [ 187.395298] PM: Image saving progress: 80% [ 187.476697] PM: Image saving progress: 90% [ 187.554641] PM: Image saving done. [ 187.558896] PM: Wrote 594600 kbytes in 0.90 seconds (660.66 MB/s) [ 187.566000] PM: S| [ 187.589742] PM: Basic memory bitmaps freed [ 187.594694] PM: Checking hibernation image [ 187.599865] PM: Image signature found, resuming [ 187.605209] PM: Loading hibernation image. [ 187.665753] PM: Basic memory bitmaps created [ 187.691397] PM: Using 3 thread(s) for decompression. [ 187.691397] PM: Loading and decompressing image data (148650 pages)... [ 187.889719] PM: Image loading progress: 0% [ 188.100452] PM: Image loading progress: 10% [ 188.244781] PM: Image loading progress: 20% [ 189.057305] PM: Image loading done. [ 189.068793] PM: Image successfully loaded Suggested-by: Rafael J. Wysocki Signed-off-by: Chen Yu Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 65 ++++++++++++++++++++++++++++++++---------------- kernel/power/swap.c | 6 +++++ 2 files changed, 50 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 5f3523e18e46..0ee1df0a0bd6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -52,6 +52,7 @@ enum { #ifdef CONFIG_SUSPEND HIBERNATION_SUSPEND, #endif + HIBERNATION_TEST_RESUME, /* keep last */ __HIBERNATION_AFTER_LAST }; @@ -647,12 +648,39 @@ static void power_down(void) cpu_relax(); } +static int load_image_and_restore(void) +{ + int error; + unsigned int flags; + + pr_debug("PM: Loading hibernation image.\n"); + + lock_device_hotplug(); + error = create_basic_memory_bitmaps(); + if (error) + goto Unlock; + + error = swsusp_read(&flags); + swsusp_close(FMODE_READ); + if (!error) + hibernation_restore(flags & SF_PLATFORM_MODE); + + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); + swsusp_free(); + free_basic_memory_bitmaps(); + Unlock: + unlock_device_hotplug(); + + return error; +} + /** * hibernate - Carry out system hibernation, including saving the image. */ int hibernate(void) { int error, nr_calls = 0; + bool snapshot_test = false; if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); @@ -704,8 +732,12 @@ int hibernate(void) pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); - if (!error) - power_down(); + if (!error) { + if (hibernation_mode == HIBERNATION_TEST_RESUME) + snapshot_test = true; + else + power_down(); + } in_suspend = 0; pm_restore_gfp_mask(); } else { @@ -716,6 +748,12 @@ int hibernate(void) free_basic_memory_bitmaps(); Thaw: unlock_device_hotplug(); + if (snapshot_test) { + pr_debug("PM: Checking hibernation image\n"); + error = swsusp_check(); + if (!error) + error = load_image_and_restore(); + } thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ @@ -748,7 +786,6 @@ int hibernate(void) static int software_resume(void) { int error, nr_calls = 0; - unsigned int flags; /* * If the user said "noresume".. bail out early. @@ -844,24 +881,7 @@ static int software_resume(void) error = freeze_processes(); if (error) goto Close_Finish; - - pr_debug("PM: Loading hibernation image.\n"); - - lock_device_hotplug(); - error = create_basic_memory_bitmaps(); - if (error) - goto Thaw; - - error = swsusp_read(&flags); - swsusp_close(FMODE_READ); - if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); - - printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); - swsusp_free(); - free_basic_memory_bitmaps(); - Thaw: - unlock_device_hotplug(); + error = load_image_and_restore(); thaw_processes(); Finish: __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); @@ -887,6 +907,7 @@ static const char * const hibernation_modes[] = { #ifdef CONFIG_SUSPEND [HIBERNATION_SUSPEND] = "suspend", #endif + [HIBERNATION_TEST_RESUME] = "test_resume", }; /* @@ -933,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif + case HIBERNATION_TEST_RESUME: break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -979,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif + case HIBERNATION_TEST_RESUME: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 160e1006640d..51cef8432154 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -348,6 +348,12 @@ static int swsusp_swap_check(void) if (res < 0) blkdev_put(hib_resume_bdev, FMODE_WRITE); + /* + * Update the resume device to the one actually used, + * so the test_resume mode can use it in case it is + * invoked from hibernate() to test the snapshot. + */ + swsusp_resume_device = hib_resume_bdev->bd_dev; return res; } -- cgit v1.3-8-gc7d7 From aa7145c16d6bf086538ad7eb20c807513bfa5efc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Jul 2016 01:19:42 +0200 Subject: bpf, events: fix offset in skb copy handler This patch fixes the __output_custom() routine we currently use with bpf_skb_copy(). I missed that when len is larger than the size of the current handle, we can issue multiple invocations of copy_func, and __output_custom() advances destination but also source buffer by the written amount of bytes. When we have __output_custom(), this is actually wrong since in that case the source buffer points to a non-linear object, in our case an skb, which the copy_func helper is supposed to walk. Therefore, since this is non-linear we thus need to pass the offset into the helper, so that copy_func can use it for extracting the data from the source object. Therefore, adjust the callback signatures properly and pass offset into the skb_header_pointer() invoked from bpf_skb_copy() callback. The __DEFINE_OUTPUT_COPY_BODY() is adjusted to accommodate for two things: i) to pass in whether we should advance source buffer or not; this is a compile-time constant condition, ii) to pass in the offset for __output_custom(), which we do with help of __VA_ARGS__, so everything can stay inlined as is currently. Both changes allow for adapting the __output_* fast-path helpers w/o extra overhead. Fixes: 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output") Fixes: 7e3f977edd0b ("perf, events: add non-linear data support for raw records") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 +- include/linux/perf_event.h | 2 +- kernel/events/internal.h | 15 ++++++++++----- net/core/filter.c | 4 ++-- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 36da0749205a..11134238417d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -211,7 +211,7 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f const struct bpf_func_proto *bpf_get_trace_printk_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, - unsigned long len); + unsigned long off, unsigned long len); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e79e6c6fed89..15e55b7ee096 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -70,7 +70,7 @@ struct perf_callchain_entry_ctx { }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, - unsigned long len); + unsigned long off, unsigned long len); struct perf_raw_frag { union { diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2417eb5512cd..486fd78eb8d5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -123,18 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) return rb->aux_nr_pages << PAGE_SHIFT; } -#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \ +#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ { \ unsigned long size, written; \ \ do { \ size = min(handle->size, len); \ - written = memcpy_func(handle->addr, buf, size); \ + written = memcpy_func(__VA_ARGS__); \ written = size - written; \ \ len -= written; \ handle->addr += written; \ - buf += written; \ + if (advance_buf) \ + buf += written; \ handle->size -= written; \ if (!handle->size) { \ struct ring_buffer *rb = handle->rb; \ @@ -153,12 +154,16 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) static inline unsigned long \ func_name(struct perf_output_handle *handle, \ const void *buf, unsigned long len) \ -__DEFINE_OUTPUT_COPY_BODY(memcpy_func) +__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) static inline unsigned long __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, const void *buf, unsigned long len) -__DEFINE_OUTPUT_COPY_BODY(copy_func) +{ + unsigned long orig_len = len; + __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, + orig_len - len, size) +} static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) diff --git a/net/core/filter.c b/net/core/filter.c index 0b521353008d..5708999f8a79 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2026,9 +2026,9 @@ bool bpf_helper_changes_skb_data(void *func) } static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, - unsigned long len) + unsigned long off, unsigned long len) { - void *ptr = skb_header_pointer(skb, 0, len, dst_buff); + void *ptr = skb_header_pointer(skb, off, len, dst_buff); if (unlikely(!ptr)) return len; -- cgit v1.3-8-gc7d7 From 96ae52279594470622ff0585621a13e96b700600 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Mon, 25 Jul 2016 05:54:46 -0700 Subject: bpf: Add bpf_probe_write_user BPF helper to be called in tracers This allows user memory to be written to during the course of a kprobe. It shouldn't be used to implement any kind of security mechanism because of TOC-TOU attacks, but rather to debug, divert, and manipulate execution of semi-cooperative processes. Although it uses probe_kernel_write, we limit the address space the probe can write into by checking the space with access_ok. We do this as opposed to calling copy_to_user directly, in order to avoid sleeping. In addition we ensure the threads's current fs / segment is USER_DS and the thread isn't exiting nor a kernel thread. Given this feature is meant for experiments, and it has a risk of crashing the system, and running programs, we print a warning on when a proglet that attempts to use this helper is installed, along with the pid and process name. Signed-off-by: Sargun Dhillon Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 ++++++++++ kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ samples/bpf/bpf_helpers.h | 2 ++ 3 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2b7076f5b5ad..da218fec6056 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -365,6 +365,16 @@ enum bpf_func_id { */ BPF_FUNC_get_current_task, + /** + * bpf_probe_write_user(void *dst, void *src, int len) + * safely attempt to write to a location + * @dst: destination address in userspace + * @src: source address on stack + * @len: number of bytes to copy + * Return: 0 on success or negative error + */ + BPF_FUNC_probe_write_user, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a12bbd32c0a6..b20438fdb029 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *unsafe_ptr = (void *) (long) r1; + void *src = (void *) (long) r2; + int size = (int) r3; + + /* + * Ensure we're in user context which is safe for the helper to + * run. This helper has no business in a kthread. + * + * access_ok() should prevent writing to non-user memory, but in + * some situations (nommu, temporary switch, etc) access_ok() does + * not provide enough validation, hence the check on KERNEL_DS. + */ + + if (unlikely(in_interrupt() || + current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + return -EPERM; + if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) + return -EPERM; + + return probe_kernel_write(unsafe_ptr, src, size); +} + +static const struct bpf_func_proto bpf_probe_write_user_proto = { + .func = bpf_probe_write_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *bpf_get_probe_write_proto(void) +{ + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", + current->comm, task_pid_nr(current)); + + return &bpf_probe_write_user_proto; +} + /* * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed @@ -362,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; + case BPF_FUNC_probe_write_user: + return bpf_get_probe_write_proto(); default: return NULL; } diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index 84e3fd919a06..217c8d507f2e 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -41,6 +41,8 @@ static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, (void *) BPF_FUNC_perf_event_output; static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = (void *) BPF_FUNC_get_stackid; +static int (*bpf_probe_write_user)(void *dst, void *src, int size) = + (void *) BPF_FUNC_probe_write_user; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.3-8-gc7d7 From 4949148ad433f6f11cf837978b2907092ec99f3a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:24:24 -0700 Subject: mm: charge/uncharge kmemcg from generic page allocator paths Currently, to charge a non-slab allocation to kmemcg one has to use alloc_kmem_pages helper with __GFP_ACCOUNT flag. A page allocated with this helper should finally be freed using free_kmem_pages, otherwise it won't be uncharged. This API suits its current users fine, but it turns out to be impossible to use along with page reference counting, i.e. when an allocation is supposed to be freed with put_page, as it is the case with pipe or unix socket buffers. To overcome this limitation, this patch moves charging/uncharging to generic page allocator paths, i.e. to __alloc_pages_nodemask and free_pages_prepare, and zaps alloc/free_kmem_pages helpers. This way, one can use any of the available page allocation functions to get the allocated page charged to kmemcg - it's enough to pass __GFP_ACCOUNT, just like in case of kmalloc and friends. A charged page will be automatically uncharged on free. To make it possible, we need to mark pages charged to kmemcg somehow. To avoid introducing a new page flag, we make use of page->_mapcount for marking such pages. Since pages charged to kmemcg are not supposed to be mapped to userspace, it should work just fine. There are other (ab)users of page->_mapcount - buddy and balloon pages - but we don't conflict with them. In case kmemcg is compiled out or not used at runtime, this patch introduces no overhead to generic page allocator paths. If kmemcg is used, it will be plus one gfp flags check on alloc and plus one page->_mapcount check on free, which shouldn't hurt performance, because the data accessed are hot. Link: http://lkml.kernel.org/r/a9736d856f895bcb465d9f257b54efe32eda6f99.1464079538.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Eric Dumazet Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +------ include/linux/page-flags.h | 7 +++++ kernel/fork.c | 6 ++--- mm/page_alloc.c | 66 +++++++++------------------------------------- mm/slab_common.c | 2 +- mm/slub.c | 6 ++--- mm/vmalloc.c | 6 ++--- 7 files changed, 31 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 570383a41853..c29e9d347bc6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -78,8 +78,7 @@ struct vm_area_struct; * __GFP_THISNODE forces the allocation to be satisified from the requested * node with no fallbacks or placement policy enforcements. * - * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant - * to kmem allocations). + * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg. */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) @@ -486,10 +485,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) -extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); -extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, - unsigned int order); - extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); @@ -513,9 +508,6 @@ extern void *__alloc_page_frag(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void __free_page_frag(void *addr); -extern void __free_kmem_pages(struct page *page, unsigned int order); -extern void free_kmem_pages(unsigned long addr, unsigned int order); - #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 96084ee74ee8..7c8e82ac2eb7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -641,6 +641,13 @@ PAGE_MAPCOUNT_OPS(Buddy, BUDDY) #define PAGE_BALLOON_MAPCOUNT_VALUE (-256) PAGE_MAPCOUNT_OPS(Balloon, BALLOON) +/* + * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on + * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. + */ +#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) +PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); diff --git a/kernel/fork.c b/kernel/fork.c index 4a7ec0c6c88c..de21f25e0d2c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -162,8 +162,8 @@ void __weak arch_release_thread_stack(unsigned long *stack) static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, - THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); if (page) memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, @@ -178,7 +178,7 @@ static inline void free_thread_stack(unsigned long *stack) memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -(1 << THREAD_SIZE_ORDER)); - __free_kmem_pages(page, THREAD_SIZE_ORDER); + __free_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de2491c42d4f..7023a31edc5c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -1018,6 +1019,10 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; + if (memcg_kmem_enabled() && PageKmemcg(page)) { + memcg_kmem_uncharge(page, order); + __ClearPageKmemcg(page); + } if (check_free) bad += free_pages_check(page); if (bad) @@ -3841,6 +3846,14 @@ no_zone: } out: + if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) { + if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) { + __free_pages(page, order); + page = NULL; + } else + __SetPageKmemcg(page); + } + if (kmemcheck_enabled && page) kmemcheck_pagealloc_alloc(page, order, gfp_mask); @@ -3996,59 +4009,6 @@ void __free_page_frag(void *addr) } EXPORT_SYMBOL(__free_page_frag); -/* - * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is - * equivalent to alloc_pages. - * - * It should be used when the caller would like to use kmalloc, but since the - * allocation is large, it has to fall back to the page allocator. - */ -struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages(gfp_mask, order); - if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && - page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages_node(nid, gfp_mask, order); - if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && - page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -/* - * __free_kmem_pages and free_kmem_pages will free pages allocated with - * alloc_kmem_pages. - */ -void __free_kmem_pages(struct page *page, unsigned int order) -{ - if (memcg_kmem_enabled()) - memcg_kmem_uncharge(page, order); - __free_pages(page, order); -} - -void free_kmem_pages(unsigned long addr, unsigned int order) -{ - if (addr != 0) { - VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_kmem_pages(virt_to_page((void *)addr), order); - } -} - static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { diff --git a/mm/slab_common.c b/mm/slab_common.c index da88c1588752..71f0b28a1bec 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1012,7 +1012,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) struct page *page; flags |= __GFP_COMP; - page = alloc_kmem_pages(flags, order); + page = alloc_pages(flags, order); ret = page ? page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); kasan_kmalloc_large(ret, size, flags); diff --git a/mm/slub.c b/mm/slub.c index c0cfa2722539..f9da8716b8b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2977,7 +2977,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(object); - __free_kmem_pages(page, compound_order(page)); + __free_pages(page, compound_order(page)); p[size] = NULL; /* mark object processed */ return size; } @@ -3693,7 +3693,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; - page = alloc_kmem_pages_node(node, flags, get_order(size)); + page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3774,7 +3774,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_kmem_pages(page, compound_order(page)); + __free_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e11475cdeb7a..91f44e78c516 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1501,7 +1501,7 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i]; BUG_ON(!page); - __free_kmem_pages(page, 0); + __free_pages(page, 0); } kvfree(area->pages); @@ -1629,9 +1629,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_kmem_pages(alloc_mask, order); + page = alloc_pages(alloc_mask, order); else - page = alloc_kmem_pages_node(node, alloc_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ -- cgit v1.3-8-gc7d7 From 1fe4d021acbc356723818a633fe0a10c59c2a4c1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 26 Jul 2016 15:26:58 -0700 Subject: cgroup: fix idr leak for the first cgroup root The valid cgroup hierarchy ID range includes 0, so we can't filter for positive numbers when freeing it, or it'll leak the first ID. No big deal, just disruptive when reading the code. The ID is freed during error handling and when the reference count hits zero, so the double-free test is not necessary; remove it. Link: http://lkml.kernel.org/r/20160617162359.GB19084@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Vladimir Davydov Cc: Tejun Heo Cc: Nikolay Borisov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75c0ff00aca6..3108150e47b1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1160,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } static void cgroup_free_root(struct cgroup_root *root) { if (root) { - /* hierarchy ID should already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - idr_destroy(&root->cgroup_idr); kfree(root); } -- cgit v1.3-8-gc7d7 From cb773df88a737d7d7e05ca7ca516414d3fcdcab8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 26 Jul 2016 15:27:01 -0700 Subject: cgroup: remove unnecessary 0 check from css_from_id() css_idr allocation starts at 1, so index 0 will never point to an item. css_from_id() currently filters that before asking idr_find(), but idr_find() would also just return NULL, so this is not needed. Link: http://lkml.kernel.org/r/20160617162427.GC19084@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Vladimir Davydov Cc: Tejun Heo Cc: Nikolay Borisov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3108150e47b1..fa943843a32f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6166,7 +6166,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return id > 0 ? idr_find(&ss->css_idr, id) : NULL; + return idr_find(&ss->css_idr, id); } /** -- cgit v1.3-8-gc7d7 From bf262dcec6383188a3324192c4a7e405b3b1ad23 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 12 Apr 2016 05:02:09 +0930 Subject: module: fix noreturn attribute for __module_put_and_exit() __module_put_and_exit() is makred noreturn in module.h declaration, but is lacking the attribute in the definition, which makes some tools (such as sparse) unhappy. Amend the definition with the attribute as well (and reformat the declaration so that it uses more common format). Signed-off-by: Jiri Kosina Signed-off-by: Rusty Russell --- include/linux/module.h | 4 ++-- kernel/module.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 3daf2b3a09d2..f777164c238b 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -575,8 +575,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); -extern void __module_put_and_exit(struct module *mod, long code) - __attribute__((noreturn)); +extern void __noreturn __module_put_and_exit(struct module *mod, + long code); #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD diff --git a/kernel/module.c b/kernel/module.c index 5f71aa63ed2a..5e876977844b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -336,7 +336,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. nfsd and lockd use this. */ -void __module_put_and_exit(struct module *mod, long code) +void __noreturn __module_put_and_exit(struct module *mod, long code) { module_put(mod); do_exit(code); -- cgit v1.3-8-gc7d7 From c75b590d60ffa3e31bcb9608b68006a8bab9e0ed Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 12 Apr 2016 05:03:09 +0930 Subject: module: fix redundant test. [linux-4.5-rc4/kernel/module.c:1692]: (style) Redundant condition: attr.test. '!attr.test || (attr.test && attr.test(mod))' is equivalent to '!attr.test || attr.test(mod)' This code was added like this ten years ago, in c988d2b284549 "modules: add version and srcversion to sysfs". Reported-by: David Binderman Cc: Matt Domsch Signed-off-by: Rusty Russell --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5e876977844b..9e04a4210a4a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1693,8 +1693,7 @@ static int module_add_modinfo_attrs(struct module *mod) temp_attr = mod->modinfo_attrs; for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { - if (!attr->test || - (attr->test && attr->test(mod))) { + if (!attr->test || attr->test(mod)) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); error = sysfs_create_file(&mod->mkobj.kobj, -- cgit v1.3-8-gc7d7 From 3205c36cf7d96024626f92d65f560035df1abcb2 Mon Sep 17 00:00:00 2001 From: Libor Pechacek Date: Wed, 13 Apr 2016 11:06:12 +0930 Subject: module: Issue warnings when tainting kernel While most of the locations where a kernel taint bit is set are accompanied with a warning message, there are two which set their bits silently. If the tainting module gets unloaded later on, it is almost impossible to tell what was the reason for setting the flag. Signed-off-by: Libor Pechacek Signed-off-by: Rusty Russell --- kernel/module.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9e04a4210a4a..0b4f3a85d4fc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2919,8 +2919,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return -ENOEXEC; } - if (!get_modinfo(info, "intree")) + if (!get_modinfo(info, "intree")) { + if (!test_taint(TAINT_OOT_MODULE)) + pr_warn("%s: loading out-of-tree module taints kernel.\n", + mod->name); add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); @@ -3089,6 +3093,8 @@ static int move_module(struct module *mod, struct load_info *info) static int check_module_license_and_versions(struct module *mod) { + int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); + /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -3107,6 +3113,9 @@ static int check_module_license_and_versions(struct module *mod) add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); + if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) + pr_warn("%s: module license taints kernel.\n", mod->name); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) -- cgit v1.3-8-gc7d7 From bca014caaa6130e57f69b5bf527967aa8ee70fdd Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 28 Apr 2016 09:24:01 +0930 Subject: module: Invalidate signatures on force-loaded modules Signing a module should only make it trusted by the specific kernel it was built for, not anything else. Loading a signed module meant for a kernel with a different ABI could have interesting effects. Therefore, treat all signatures as invalid when a module is force-loaded. Signed-off-by: Ben Hutchings Cc: stable@vger.kernel.org Signed-off-by: Rusty Russell --- kernel/module.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0b4f3a85d4fc..7f21ab238aa7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2686,13 +2686,18 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { int err = -ENOKEY; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const void *mod = info->hdr; - if (info->len > markerlen && + /* + * Require flags == 0, as a module with version information + * removed is no longer the module that was signed + */ + if (flags == 0 && + info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; @@ -2711,7 +2716,7 @@ static int module_sig_check(struct load_info *info) return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { return 0; } @@ -3506,7 +3511,7 @@ static int load_module(struct load_info *info, const char __user *uargs, long err; char *after_dashes; - err = module_sig_check(info); + err = module_sig_check(info, flags); if (err) goto free_copy; -- cgit v1.3-8-gc7d7 From ce4f06dcbb5d6d04d202f1b81ac72d5679dcdfc0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 26 Jul 2016 20:57:36 +0200 Subject: stop_machine: Touch_nmi_watchdog() after MULTI_STOP_PREPARE Suppose that stop_machine(fn) hangs because fn() hangs. In this case NMI hard-lockup can be triggered on another CPU which does nothing wrong and the trace from nmi_panic() won't help to investigate the problem. And this change "fixes" the problem we (seem to) hit in practice. - stop_two_cpus(0, 1) races with show_state_filter() running on CPU_0. - CPU_1 already spins in MULTI_STOP_PREPARE state, it detects the soft lockup and tries to report the problem. - show_state_filter() enables preemption, CPU_0 calls multi_cpu_stop() which goes to MULTI_STOP_DISABLE_IRQ state and disables interrupts. - CPU_1 spends more than 10 seconds trying to flush the log buffer to the slow serial console. - NMI interrupt on CPU_0 (which now waits for CPU_1) calls nmi_panic(). Reported-by: Wang Shu Signed-off-by: Oleg Nesterov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Dave Anderson Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/20160726185736.GB4088@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a467e6c28a3b..4a1ca5f6da7e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Structure to determine completion condition and record errors. May @@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data) break; } ack_state(msdata); + } else if (curstate > MULTI_STOP_PREPARE) { + /* + * At this stage all other CPUs we depend on must spin + * in the same loop. Any reason for hard-lockup should + * be detected and reported on their side. + */ + touch_nmi_watchdog(); } } while (curstate != MULTI_STOP_EXIT); -- cgit v1.3-8-gc7d7 From 4fae16dffb812f0e0d98a0b2b0856ca48ca63e6c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 27 Jul 2016 11:08:18 +0200 Subject: timers/core: Correct callback order during CPU hot plug On the tear-down path, the dead CPU callback for the timers was misplaced within the 'cpuhp_state' enumeration. There is a hidden dependency between the timers and block multiqueue. The timers callback must happen before the block multiqueue callback otherwise a RCU stall occurs. Move the timers callback to the proper place in the state machine. Reported-and-tested-by: Jon Hunter Reported-by: kbuild test robot Fixes: 24f73b99716a ("timers/core: Convert to hotplug state machine") Signed-off-by: Richard Cochran Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Rasmus Villemoes Cc: John Stultz Cc: rt@linutronix.de Cc: Oleg Nesterov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1469610498-25914-1-git-send-email-rcochran@linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/cpuhotplug.h | 2 +- kernel/cpu.c | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6d405db6fb12..242bf530edfc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -20,9 +20,9 @@ enum cpuhp_state { CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, - CPUHP_TIMERS_DEAD, CPUHP_RCUTREE_PREP, CPUHP_NOTIFY_PREPARE, + CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, diff --git a/kernel/cpu.c b/kernel/cpu.c index f24f45915b54..341bf80f80bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1200,11 +1200,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = smpcfd_prepare_cpu, .teardown = smpcfd_dead_cpu, }, - [CPUHP_TIMERS_DEAD] = { - .name = "timers dead", - .startup = NULL, - .teardown = timers_dead_cpu, - }, [CPUHP_RCUTREE_PREP] = { .name = "RCU-tree prepare", .startup = rcutree_prepare_cpu, @@ -1221,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* + * On the tear-down path, timers_dead_cpu() must be invoked + * before blk_mq_queue_reinit_notify() from notify_dead(), + * otherwise a RCU stall occurs. + */ + [CPUHP_TIMERS_DEAD] = { + .name = "timers dead", + .startup = NULL, + .teardown = timers_dead_cpu, + }, /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", -- cgit v1.3-8-gc7d7 From a34c80a7294e34ba213c285dff38b1137745f94b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 28 Jul 2016 15:45:16 -0700 Subject: freezer, oom: check TIF_MEMDIE on the correct task freezing_slow_path() is checking TIF_MEMDIE to skip OOM killed tasks. It is, however, checking the flag on the current task rather than the given one. This is really confusing because freezing() can be called also on !current tasks. It would end up working correctly for its main purpose because __refrigerator will be always called on the current task so the oom victim will never get frozen. But it could lead to surprising results when a task which is freezing a cgroup got oom killed because only part of the cgroup would get frozen. This is highly unlikely but worth fixing as the resulting code would be more clear anyway. Link: http://lkml.kernel.org/r/1467029719-17602-2-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: David Rientjes Cc: Miao Xie Cc: Miao Xie Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_thread_flag(TIF_MEMDIE)) + if (test_tsk_thread_flag(p, TIF_MEMDIE)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) -- cgit v1.3-8-gc7d7 From fec1e5f987bfc41f9f08cbd206e7302e6ac2ab0c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 28 Jul 2016 15:45:19 -0700 Subject: cpuset, mm: fix TIF_MEMDIE check in cpuset_change_task_nodemask Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has added TIF_MEMDIE and PF_EXITING check but it is checking the flag on the current task rather than the given one. This doesn't make much sense and it is actually wrong. If the current task which updates the nodemask of a cpuset got killed by the OOM killer then a part of the cpuset cgroup processes would have incompatible nodemask which is surprising to say the least. The comment suggests the intention was to skip oom victim or an exiting task so we should be checking the given task. But even then it would be layering violation because it is the memory allocator to interpret the TIF_MEMDIE meaning. Simply drop both checks. All tasks in the cpuset should simply follow the same mask. Link: http://lkml.kernel.org/r/1467029719-17602-3-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: David Rientjes Cc: Miao Xie Cc: Miao Xie Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 73e93e53884d..c7fd2778ed50 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return; - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return; - task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing -- cgit v1.3-8-gc7d7 From 599d0c954f91d0689c9bb421b5bc04ea02437a41 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 Jul 2016 15:45:31 -0700 Subject: mm, vmscan: move LRU lists to node This moves the LRU lists from the zone to the node and related data such as counters, tracing, congestion tracking and writeback tracking. Unfortunately, due to reclaim and compaction retry logic, it is necessary to account for the number of LRU pages on both zone and node logic. Most reclaim logic is based on the node counters but the retry logic uses the zone counters which do not distinguish inactive and active sizes. It would be possible to leave the LRU counters on a per-zone basis but it's a heavier calculation across multiple cache lines that is much more frequent than the retry checks. Other than the LRU counters, this is mostly a mechanical patch but note that it introduces a number of anomalies. For example, the scans are per-zone but using per-node counters. We also mark a node as congested when a zone is congested. This causes weird problems that are fixed later but is easier to review. In the event that there is excessive overhead on 32-bit systems due to the nodes being on LRU then there are two potential solutions 1. Long-term isolation of highmem pages when reclaim is lowmem When pages are skipped, they are immediately added back onto the LRU list. If lowmem reclaim persisted for long periods of time, the same highmem pages get continually scanned. The idea would be that lowmem keeps those pages on a separate list until a reclaim for highmem pages arrives that splices the highmem pages back onto the LRU. It potentially could be implemented similar to the UNEVICTABLE list. That would reduce the skip rate with the potential corner case is that highmem pages have to be scanned and reclaimed to free lowmem slab pages. 2. Linear scan lowmem pages if the initial LRU shrink fails This will break LRU ordering but may be preferable and faster during memory pressure than skipping LRU pages. Link: http://lkml.kernel.org/r/1467970510-21195-4-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/mm/pgtable.c | 8 +- drivers/base/node.c | 19 +-- drivers/staging/android/lowmemorykiller.c | 8 +- include/linux/backing-dev.h | 2 +- include/linux/memcontrol.h | 18 +-- include/linux/mm_inline.h | 21 ++- include/linux/mmzone.h | 68 +++++---- include/linux/swap.h | 1 + include/linux/vm_event_item.h | 10 +- include/linux/vmstat.h | 17 +++ include/trace/events/vmscan.h | 12 +- kernel/power/snapshot.c | 10 +- mm/backing-dev.c | 15 +- mm/compaction.c | 18 +-- mm/huge_memory.c | 2 +- mm/internal.h | 2 +- mm/khugepaged.c | 4 +- mm/memcontrol.c | 17 +-- mm/memory-failure.c | 4 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 21 +-- mm/mlock.c | 2 +- mm/page-writeback.c | 8 +- mm/page_alloc.c | 68 +++++---- mm/swap.c | 50 +++---- mm/vmscan.c | 226 +++++++++++++++++------------- mm/vmstat.c | 47 ++++--- mm/workingset.c | 4 +- 29 files changed, 386 insertions(+), 300 deletions(-) (limited to 'kernel') diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index c4d5bf841a7f..9e389213580d 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -45,10 +45,10 @@ void show_mem(unsigned int filter) struct zone *zone; pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n", - (global_page_state(NR_ACTIVE_ANON) + - global_page_state(NR_ACTIVE_FILE)), - (global_page_state(NR_INACTIVE_ANON) + - global_page_state(NR_INACTIVE_FILE)), + (global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE)), + (global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_FILE)), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), diff --git a/drivers/base/node.c b/drivers/base/node.c index 0a1b6433a76c..d4698f096209 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -56,6 +56,7 @@ static ssize_t node_read_meminfo(struct device *dev, { int n; int nid = dev->id; + struct pglist_data *pgdat = NODE_DATA(nid); struct sysinfo i; si_meminfo_node(&i, nid); @@ -74,15 +75,15 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), - nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_ANON) + - sum_zone_node_page_state(nid, NR_ACTIVE_FILE)), - nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_ANON) + - sum_zone_node_page_state(nid, NR_INACTIVE_FILE)), - nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_ANON)), - nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_ANON)), - nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_FILE)), - nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_FILE)), - nid, K(sum_zone_node_page_state(nid, NR_UNEVICTABLE)), + nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) + + node_page_state(pgdat, NR_ACTIVE_FILE)), + nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) + + node_page_state(pgdat, NR_INACTIVE_FILE)), + nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)), + nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)), + nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)), + nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)), + nid, K(node_page_state(pgdat, NR_UNEVICTABLE)), nid, K(sum_zone_node_page_state(nid, NR_MLOCK))); #ifdef CONFIG_HIGHMEM diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 24d2745e9437..93dbcc38eb0f 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -72,10 +72,10 @@ static unsigned long lowmem_deathpending_timeout; static unsigned long lowmem_count(struct shrinker *s, struct shrink_control *sc) { - return global_page_state(NR_ACTIVE_ANON) + - global_page_state(NR_ACTIVE_FILE) + - global_page_state(NR_INACTIVE_ANON) + - global_page_state(NR_INACTIVE_FILE); + return global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_FILE); } static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c82794f20110..491a91717788 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) } long congestion_wait(int sync, long timeout); -long wait_iff_congested(struct zone *zone, int sync, long timeout); +long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); int pdflush_proc_obsolete(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1c4df4420258..6d2321c148cd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -339,7 +339,7 @@ static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; + lruvec = zone_lruvec(zone); goto out; } @@ -348,15 +348,15 @@ static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, out: /* * Since a node can be onlined after the mem_cgroup was created, - * we have to be prepared to initialize lruvec->zone here; + * we have to be prepared to initialize lruvec->pgdat here; * and if offlined then reonlined, we need to reinitialize it. */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; + if (unlikely(lruvec->pgdat != zone->zone_pgdat)) + lruvec->pgdat = zone->zone_pgdat; return lruvec; } -struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); +struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -437,7 +437,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages); + enum zone_type zid, int nr_pages); unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); @@ -612,13 +612,13 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, struct mem_cgroup *memcg) { - return &zone->lruvec; + return zone_lruvec(zone); } static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, - struct zone *zone) + struct pglist_data *pgdat) { - return &zone->lruvec; + return &pgdat->lruvec; } static inline bool mm_match_cgroup(struct mm_struct *mm, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 5bd29ba4f174..9aadcc781857 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -23,25 +23,32 @@ static inline int page_is_file_cache(struct page *page) } static __always_inline void __update_lru_size(struct lruvec *lruvec, - enum lru_list lru, int nr_pages) + enum lru_list lru, enum zone_type zid, + int nr_pages) { - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + !!is_file_lru(lru), + nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, - enum lru_list lru, int nr_pages) + enum lru_list lru, enum zone_type zid, + int nr_pages) { #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #else - __update_lru_size(lruvec, lru, nr_pages); + __update_lru_size(lruvec, lru, zid, nr_pages); #endif } static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - update_lru_size(lruvec, lru, hpage_nr_pages(page)); + update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } @@ -49,7 +56,7 @@ static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { list_del(&page->lru); - update_lru_size(lruvec, lru, -hpage_nr_pages(page)); + update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page)); } /** diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cfa870107abe..d4f5cac0a8c3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -111,12 +111,9 @@ enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_ALLOC_BATCH, - NR_LRU_BASE, - NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ - NR_ACTIVE_ANON, /* " " " " " */ - NR_INACTIVE_FILE, /* " " " " " */ - NR_ACTIVE_FILE, /* " " " " " */ - NR_UNEVICTABLE, /* " " " " " */ + NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ + NR_ZONE_LRU_ANON = NR_ZONE_LRU_BASE, + NR_ZONE_LRU_FILE, NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. @@ -134,12 +131,9 @@ enum zone_stat_item { NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ - NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ - NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif @@ -161,6 +155,15 @@ enum zone_stat_item { NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { + NR_LRU_BASE, + NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ + NR_ACTIVE_ANON, /* " " " " " */ + NR_INACTIVE_FILE, /* " " " " " */ + NR_ACTIVE_FILE, /* " " " " " */ + NR_UNEVICTABLE, /* " " " " " */ + NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ + NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ + NR_PAGES_SCANNED, /* pages scanned since last reclaim */ NR_VM_NODE_STAT_ITEMS }; @@ -219,7 +222,7 @@ struct lruvec { /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; #ifdef CONFIG_MEMCG - struct zone *zone; + struct pglist_data *pgdat; #endif }; @@ -357,13 +360,6 @@ struct zone { #ifdef CONFIG_NUMA int node; #endif - - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this zone's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; @@ -495,9 +491,6 @@ struct zone { /* Write-intensive fields used by page reclaim */ - /* Fields commonly accessed by the page reclaim scanner */ - struct lruvec lruvec; - /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter @@ -537,17 +530,20 @@ struct zone { enum zone_flags { ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ - ZONE_CONGESTED, /* zone has many dirty pages backed by + ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ +}; + +enum pgdat_flags { + PGDAT_CONGESTED, /* pgdat has many dirty pages backed by * a congested BDI */ - ZONE_DIRTY, /* reclaim scanning has recently found + PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ - ZONE_WRITEBACK, /* reclaim scanning has recently found + PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ - ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ }; static inline unsigned long zone_end_pfn(const struct zone *zone) @@ -707,6 +703,19 @@ typedef struct pglist_data { unsigned long split_queue_len; #endif + /* Fields commonly accessed by the page reclaim scanner */ + struct lruvec lruvec; + + /* + * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on + * this node's LRU. Maintained by the pageout code. + */ + unsigned int inactive_ratio; + + unsigned long flags; + + ZONE_PADDING(_pad2_) + /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; @@ -728,6 +737,11 @@ static inline spinlock_t *zone_lru_lock(struct zone *zone) return &zone->zone_pgdat->lru_lock; } +static inline struct lruvec *zone_lruvec(struct zone *zone) +{ + return &zone->zone_pgdat->lruvec; +} + static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -779,12 +793,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, extern void lruvec_init(struct lruvec *lruvec); -static inline struct zone *lruvec_zone(struct lruvec *lruvec) +static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG - return lruvec->zone; + return lruvec->pgdat; #else - return container_of(lruvec, struct zone, lruvec); + return container_of(lruvec, struct pglist_data, lruvec); #endif } diff --git a/include/linux/swap.h b/include/linux/swap.h index 0af2bb2028fd..c82f916008b7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -317,6 +317,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); +extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 42604173f122..1798ff542517 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -26,11 +26,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, PGLAZYFREED, - FOR_ALL_ZONES(PGREFILL), - FOR_ALL_ZONES(PGSTEAL_KSWAPD), - FOR_ALL_ZONES(PGSTEAL_DIRECT), - FOR_ALL_ZONES(PGSCAN_KSWAPD), - FOR_ALL_ZONES(PGSCAN_DIRECT), + PGREFILL, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_FAILED, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d1744aa3ab9c..fee321c98550 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -178,6 +178,23 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, return x; } +static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, + enum node_stat_item item) +{ + long x = atomic_long_read(&pgdat->vm_stat[item]); + +#ifdef CONFIG_SMP + int cpu; + for_each_online_cpu(cpu) + x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; + + if (x < 0) + x = 0; +#endif + return x; +} + + #ifdef CONFIG_NUMA extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 0101ef37f1ee..897f1aa1ee5f 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -352,15 +352,14 @@ TRACE_EVENT(mm_vmscan_writepage, TRACE_EVENT(mm_vmscan_lru_shrink_inactive, - TP_PROTO(struct zone *zone, + TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, int priority, int file), - TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file), + TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file), TP_STRUCT__entry( __field(int, nid) - __field(int, zid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) __field(int, priority) @@ -368,16 +367,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, ), TP_fast_assign( - __entry->nid = zone_to_nid(zone); - __entry->zid = zone_idx(zone); + __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), - TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", - __entry->nid, __entry->zid, + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d90df926b59f..9a0178c2ac1d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable) unsigned long size; size = global_page_state(NR_SLAB_RECLAIMABLE) - + global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_FILE) - - global_page_state(NR_FILE_MAPPED); + + global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + - global_node_page_state(NR_FILE_MAPPED); return saveable <= size ? 0 : saveable - size; } diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ed173b8ae8f2..efe237742074 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout) EXPORT_SYMBOL(congestion_wait); /** - * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes - * @zone: A zone to check if it is heavily congested + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes + * @pgdat: A pgdat to check if it is heavily congested * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * In the event of a congested backing_dev (any backing_dev) and the given - * @zone has experienced recent congestion, this waits for up to @timeout + * @pgdat has experienced recent congestion, this waits for up to @timeout * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absence of zone congestion, cond_resched() is called to yield + * In the absence of pgdat congestion, cond_resched() is called to yield * the processor if necessary but otherwise does not sleep. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function * returned. return_value == timeout implies the function did not sleep. */ -long wait_iff_congested(struct zone *zone, int sync, long timeout) +long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) { long ret; unsigned long start = jiffies; @@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) /* * If there is no congestion, or heavy congestion is not being - * encountered in the current zone, yield if necessary instead + * encountered in the current pgdat, yield if necessary instead * of sleeping on the congestion queue */ if (atomic_read(&nr_wb_congested[sync]) == 0 || - !test_bit(ZONE_CONGESTED, &zone->flags)) { + !test_bit(PGDAT_CONGESTED, &pgdat->flags)) { cond_resched(); + /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); if (ret < 0) diff --git a/mm/compaction.c b/mm/compaction.c index 5c65fad3f330..e5995f38d677 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -646,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); } /* Similar to reclaim, but different enough that they don't share logic */ @@ -655,12 +655,12 @@ static bool too_many_isolated(struct zone *zone) { unsigned long active, inactive, isolated; - inactive = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_ANON); - active = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_FILE) + - zone_page_state(zone, NR_ISOLATED_ANON); + inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + + node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); + active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + + node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); + isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + + node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); return isolated > (inactive + active) / 2; } @@ -856,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 99578b63814b..481fb0128d21 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1818,7 +1818,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end = -1; int i; - lruvec = mem_cgroup_page_lruvec(head, zone); + lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); diff --git a/mm/internal.h b/mm/internal.h index 9b6a6c43ac39..2f80d0343c56 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -78,7 +78,7 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool zone_reclaimable(struct zone *zone); +extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7dbee698d6aa..374237bb059d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -480,7 +480,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { /* 0 stands for page_is_file_cache(page) == false */ - dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + dec_node_page_state(page, NR_ISOLATED_ANON + 0); unlock_page(page); putback_lru_page(page); } @@ -576,7 +576,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } /* 0 stands for page_is_file_cache(page) == false */ - inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + inc_node_page_state(page, NR_ISOLATED_ANON + 0); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9b70f9ca8ddf..50c86ad121bc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -943,14 +943,14 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * and putback protocol: the LRU lock must be held, and the page must * either be PageLRU() or the caller must have isolated/allocated it. */ -struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) +struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; + lruvec = &pgdat->lruvec; goto out; } @@ -970,8 +970,8 @@ out: * we have to be prepared to initialize lruvec->zone here; * and if offlined then reonlined, we need to reinitialize it. */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; + if (unlikely(lruvec->pgdat != pgdat)) + lruvec->pgdat = pgdat; return lruvec; } @@ -979,6 +979,7 @@ out: * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on + * @zid: Zone ID of the zone pages have been added to * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added @@ -986,14 +987,14 @@ out: * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages) + enum zone_type zid, int nr_pages) { struct mem_cgroup_per_zone *mz; unsigned long *lru_size; long size; bool empty; - __update_lru_size(lruvec, lru, nr_pages); + __update_lru_size(lruvec, lru, zid, nr_pages); if (mem_cgroup_disabled()) return; @@ -2069,7 +2070,7 @@ static void lock_page_lru(struct page *page, int *isolated) if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_lru(page)); *isolated = 1; @@ -2084,7 +2085,7 @@ static void unlock_page_lru(struct page *page, int isolated) if (isolated) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, page_lru(page)); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2fcca6b0e005..11de752ccaf5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1663,7 +1663,7 @@ static int __soft_offline_page(struct page *page, int flags) put_hwpoison_page(page); if (!ret) { LIST_HEAD(pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, @@ -1671,7 +1671,7 @@ static int __soft_offline_page(struct page *page, int flags) if (ret) { if (!list_empty(&pagelist)) { list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 82d0b98d27f8..c5278360ca66 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1586,7 +1586,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) put_page(page); list_add_tail(&page->lru, &source); move_pages--; - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } else { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 53e40d3f3933..d8c4e38fb5f4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -962,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if (!isolate_lru_page(page)) { list_add_tail(&page->lru, pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } } diff --git a/mm/migrate.c b/mm/migrate.c index 2232f6923cc7..3033dae33a0a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,7 +168,7 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); /* * We isolated non-lru movable page so here we can use @@ -1119,7 +1119,7 @@ out: * restored. */ list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } @@ -1460,7 +1460,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = isolate_lru_page(page); if (!err) { list_add_tail(&page->lru, &pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } put_and_set: @@ -1726,15 +1726,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, unsigned long nr_migrate_pages) { int z; + + if (!pgdat_reclaimable(pgdat)) + return false; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; - if (!zone_reclaimable(zone)) - continue; - /* Avoid waking kswapd by allocating pages_to_migrate pages. */ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + @@ -1828,7 +1829,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) } page_lru = page_is_file_cache(page); - mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, hpage_nr_pages(page)); /* @@ -1886,7 +1887,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } @@ -1979,7 +1980,7 @@ fail_putback: /* Retake the callers reference and putback on LRU */ get_page(page); putback_lru_page(page); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); goto out_unlock; @@ -2030,7 +2031,7 @@ fail_putback: count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); return isolated; diff --git a/mm/mlock.c b/mm/mlock.c index 997f63082ff5..14645be06e30 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (getpage) get_page(page); ClearPageLRU(page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d578d2a56b19..0ada2b2954b0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -285,8 +285,8 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) */ nr_pages -= min(nr_pages, zone->totalreserve_pages); - nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); - nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); + nr_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE); + nr_pages += node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE); return nr_pages; } @@ -348,8 +348,8 @@ static unsigned long global_dirtyable_memory(void) */ x -= min(x, totalreserve_pages); - x += global_page_state(NR_INACTIVE_FILE); - x += global_page_state(NR_ACTIVE_FILE); + x += global_node_page_state(NR_INACTIVE_FILE); + x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5760c626c309..35e2d0f9d44f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1078,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1135,9 +1135,9 @@ static void free_one_page(struct zone *zone, { unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { @@ -4288,6 +4288,7 @@ void show_free_areas(unsigned int filter) unsigned long free_pcp = 0; int cpu; struct zone *zone; + pg_data_t *pgdat; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4306,13 +4307,13 @@ void show_free_areas(unsigned int filter) " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n" #endif " free:%lu free_pcp:%lu free_cma:%lu\n", - global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_INACTIVE_ANON), - global_page_state(NR_ISOLATED_ANON), - global_page_state(NR_ACTIVE_FILE), - global_page_state(NR_INACTIVE_FILE), - global_page_state(NR_ISOLATED_FILE), - global_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -4331,6 +4332,28 @@ void show_free_areas(unsigned int filter) free_pcp, global_page_state(NR_FREE_CMA_PAGES)); + for_each_online_pgdat(pgdat) { + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + !pgdat_reclaimable(pgdat) ? "yes" : "no"); + } + for_each_populated_zone(zone) { int i; @@ -4347,13 +4370,6 @@ void show_free_areas(unsigned int filter) " min:%lukB" " low:%lukB" " high:%lukB" - " active_anon:%lukB" - " inactive_anon:%lukB" - " active_file:%lukB" - " inactive_file:%lukB" - " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" @@ -4376,21 +4392,13 @@ void show_free_areas(unsigned int filter) " local_pcp:%ukB" " free_cma:%lukB" " writeback_tmp:%lukB" - " pages_scanned:%lu" - " all_unreclaimable? %s" + " node_pages_scanned:%lu" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), - K(zone_page_state(zone, NR_ACTIVE_ANON)), - K(zone_page_state(zone, NR_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ACTIVE_FILE)), - K(zone_page_state(zone, NR_INACTIVE_FILE)), - K(zone_page_state(zone, NR_UNEVICTABLE)), - K(zone_page_state(zone, NR_ISOLATED_ANON)), - K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), @@ -4415,9 +4423,7 @@ void show_free_areas(unsigned int filter) K(this_cpu_read(zone->pageset->pcp.count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - K(zone_page_state(zone, NR_PAGES_SCANNED)), - (!zone_reclaimable(zone) ? "yes" : "no") - ); + K(node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(" %ld", zone->lowmem_reserve[i]); @@ -5967,7 +5973,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) /* For bootup, initialized properly in watermark setup */ mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); - lruvec_init(&zone->lruvec); + lruvec_init(zone_lruvec(zone)); if (!size) continue; diff --git a/mm/swap.c b/mm/swap.c index bf37e5cfae81..77af473635fe 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -63,7 +63,7 @@ static void __page_cache_release(struct page *page) unsigned long flags; spin_lock_irqsave(zone_lru_lock(zone), flags); - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -194,7 +194,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, spin_lock_irqsave(zone_lru_lock(zone), flags); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); (*move_fn)(page, lruvec, arg); } if (zone) @@ -319,7 +319,7 @@ void activate_page(struct page *page) page = compound_head(page); spin_lock_irq(zone_lru_lock(zone)); - __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); + __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); spin_unlock_irq(zone_lru_lock(zone)); } #endif @@ -445,16 +445,16 @@ void lru_cache_add(struct page *page) */ void add_page_to_unevictable_list(struct page *page) { - struct zone *zone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(zone_lru_lock(zone)); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irq(&pgdat->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, pgdat); ClearPageActive(page); SetPageUnevictable(page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); } /** @@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); - struct zone *zone = NULL; + struct pglist_data *locked_pgdat = NULL; struct lruvec *lruvec; unsigned long uninitialized_var(flags); unsigned int uninitialized_var(lock_batch); @@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same zone. The lock is held only if zone != NULL. + * same pgdat. The lock is held only if pgdat != NULL. */ - if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(zone_lru_lock(zone), flags); - zone = NULL; + if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + locked_pgdat = NULL; } if (is_huge_zero_page(page)) { @@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold) continue; if (PageCompound(page)) { - if (zone) { - spin_unlock_irqrestore(zone_lru_lock(zone), flags); - zone = NULL; + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + locked_pgdat = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct zone *pagezone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irqrestore(zone_lru_lock(zone), + if (pgdat != locked_pgdat) { + if (locked_pgdat) + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); lock_batch = 0; - zone = pagezone; - spin_lock_irqsave(zone_lru_lock(zone), flags); + locked_pgdat = pgdat; + spin_lock_irqsave(&locked_pgdat->lru_lock, flags); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold) list_add(&page->lru, &pages_to_free); } - if (zone) - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + if (locked_pgdat) + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); @@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); VM_BUG_ON(NR_CPUS != 1 && - !spin_is_locked(zone_lru_lock(lruvec_zone(lruvec)))); + !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); if (!list) SetPageLRU(page_tail); diff --git a/mm/vmscan.c b/mm/vmscan.c index e7ffcd259cc4..86a523a761c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -191,26 +191,42 @@ static bool sane_reclaim(struct scan_control *sc) } #endif +/* + * This misses isolated pages which are not accounted for to save counters. + * As the data only determines if reclaim or compaction continues, it is + * not expected that isolated pages will be a dominating factor. + */ unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + - zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + - zone_page_state_snapshot(zone, NR_ISOLATED_FILE); + nr = zone_page_state_snapshot(zone, NR_ZONE_LRU_FILE); + if (get_nr_swap_pages() > 0) + nr += zone_page_state_snapshot(zone, NR_ZONE_LRU_ANON); + + return nr; +} + +unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) +{ + unsigned long nr; + + nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + + node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + + node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + - zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + - zone_page_state_snapshot(zone, NR_ISOLATED_ANON); + nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + + node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + + node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); return nr; } -bool zone_reclaimable(struct zone *zone) +bool pgdat_reclaimable(struct pglist_data *pgdat) { - return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < - zone_reclaimable_pages(zone) * 6; + return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < + pgdat_reclaimable_pages(pgdat) * 6; } unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -218,7 +234,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); - return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); + return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } /* @@ -877,7 +893,7 @@ static void page_check_dirty_writeback(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, + struct pglist_data *pgdat, struct scan_control *sc, enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, @@ -917,7 +933,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON_PAGE(PageActive(page), page); - VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; @@ -996,7 +1011,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && - test_bit(ZONE_WRITEBACK, &zone->flags)) { + test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; goto keep_locked; @@ -1092,7 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || - !test_bit(ZONE_DIRTY, &zone->flags))) { + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -1266,11 +1281,11 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } } - ret = shrink_page_list(&clean_pages, zone, &sc, + ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; } @@ -1375,7 +1390,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; - unsigned long scan; + unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; + unsigned long scan, nr_pages; for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); scan++) { @@ -1388,7 +1404,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode)) { case 0: - nr_taken += hpage_nr_pages(page); + nr_pages = hpage_nr_pages(page); + nr_taken += nr_pages; + nr_zone_taken[page_zonenum(page)] += nr_pages; list_move(&page->lru, dst); break; @@ -1405,6 +1423,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); + for (scan = 0; scan < MAX_NR_ZONES; scan++) { + nr_pages = nr_zone_taken[scan]; + if (!nr_pages) + continue; + + update_lru_size(lruvec, lru, scan, -nr_pages); + } return nr_taken; } @@ -1445,7 +1470,7 @@ int isolate_lru_page(struct page *page) struct lruvec *lruvec; spin_lock_irq(zone_lru_lock(zone)); - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); if (PageLRU(page)) { int lru = page_lru(page); get_page(page); @@ -1465,7 +1490,7 @@ int isolate_lru_page(struct page *page) * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ -static int too_many_isolated(struct zone *zone, int file, +static int too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; @@ -1477,11 +1502,11 @@ static int too_many_isolated(struct zone *zone, int file, return 0; if (file) { - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - isolated = zone_page_state(zone, NR_ISOLATED_FILE); + inactive = node_page_state(pgdat, NR_INACTIVE_FILE); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_ANON); + inactive = node_page_state(pgdat, NR_INACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* @@ -1499,7 +1524,7 @@ static noinline_for_stack void putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); LIST_HEAD(pages_to_free); /* @@ -1512,13 +1537,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); putback_lru_page(page); - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); SetPageLRU(page); lru = page_lru(page); @@ -1535,10 +1560,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); } @@ -1582,10 +1607,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - while (unlikely(too_many_isolated(zone, file, sc))) { + while (unlikely(too_many_isolated(pgdat, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); /* We are about to die and free our memory. Return now. */ @@ -1600,48 +1625,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_writepage) isolate_mode |= ISOLATE_CLEAN; - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - update_lru_size(lruvec, lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); + __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); + __count_vm_events(PGSCAN_DIRECT, nr_scanned); } - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, &nr_dirty, &nr_unqueued_dirty, &nr_congested, &nr_writeback, &nr_immediate, false); - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); if (global_reclaim(sc)) { if (current_is_kswapd()) - __count_zone_vm_events(PGSTEAL_KSWAPD, zone, - nr_reclaimed); + __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); else - __count_zone_vm_events(PGSTEAL_DIRECT, zone, - nr_reclaimed); + __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&page_list); free_hot_cold_page_list(&page_list, true); @@ -1661,7 +1683,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * are encountered in the nr_immediate check below. */ if (nr_writeback && nr_writeback == nr_taken) - set_bit(ZONE_WRITEBACK, &zone->flags); + set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* * Legacy memcg will stall in page writeback so avoid forcibly @@ -1673,16 +1695,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * backed by a congested BDI and wait_iff_congested will stall. */ if (nr_dirty && nr_dirty == nr_congested) - set_bit(ZONE_CONGESTED, &zone->flags); + set_bit(PGDAT_CONGESTED, &pgdat->flags); /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag - * the zone ZONE_DIRTY and kswapd will start writing pages from + * the pgdat PGDAT_DIRTY and kswapd will start writing pages from * reclaim context. */ if (nr_unqueued_dirty == nr_taken) - set_bit(ZONE_DIRTY, &zone->flags); + set_bit(PGDAT_DIRTY, &pgdat->flags); /* * If kswapd scans pages marked marked for immediate @@ -1701,9 +1723,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, */ if (!sc->hibernation_mode && !current_is_kswapd() && current_may_throttle()) - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); - trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + nr_scanned, nr_reclaimed, sc->priority, file); return nr_reclaimed; } @@ -1731,20 +1754,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *pages_to_free, enum lru_list lru) { - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long pgmoved = 0; struct page *page; int nr_pages; while (!list_empty(list)) { page = lru_to_page(list); - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); nr_pages = hpage_nr_pages(page); - update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1754,10 +1777,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, pages_to_free); } @@ -1783,7 +1806,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long nr_rotated = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); @@ -1792,20 +1815,19 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_writepage) isolate_mode |= ISOLATE_CLEAN; - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - update_lru_size(lruvec, lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); - __count_zone_vm_events(PGREFILL, zone, nr_scanned); + __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); + __count_vm_events(PGREFILL, nr_scanned); - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -1850,7 +1872,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. */ - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); /* * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This @@ -1861,8 +1883,8 @@ static void shrink_active_list(unsigned long nr_to_scan, move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(zone_lru_lock(zone)); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); @@ -1956,7 +1978,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; @@ -1977,7 +1999,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * well. */ if (current_is_kswapd()) { - if (!zone_reclaimable(zone)) + if (!pgdat_reclaimable(pgdat)) force_scan = true; if (!mem_cgroup_online(memcg)) force_scan = true; @@ -2023,14 +2045,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - unsigned long zonefile; - unsigned long zonefree; + unsigned long pgdatfile; + unsigned long pgdatfree; + int z; + unsigned long total_high_wmark = 0; - zonefree = zone_page_state(zone, NR_FREE_PAGES); - zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); + pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!populated_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } - if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { scan_balance = SCAN_ANON; goto out; } @@ -2077,7 +2109,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); - spin_lock_irq(zone_lru_lock(zone)); + spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -2098,7 +2130,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -2352,9 +2384,9 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + inactive_lru_pages = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE); if (get_nr_swap_pages() > 0) - inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); + inactive_lru_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; @@ -2554,7 +2586,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; if (sc->priority != DEF_PRIORITY && - !zone_reclaimable(zone)) + !pgdat_reclaimable(zone->zone_pgdat)) continue; /* Let kswapd poll it */ /* @@ -2692,7 +2724,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!populated_zone(zone) || - zone_reclaimable_pages(zone) == 0) + pgdat_reclaimable_pages(pgdat) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -3000,7 +3032,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * DEF_PRIORITY. Effectively, it considers them balanced so * they must be considered balanced here as well! */ - if (!zone_reclaimable(zone)) { + if (!pgdat_reclaimable(zone->zone_pgdat)) { balanced_pages += zone->managed_pages; continue; } @@ -3063,6 +3095,7 @@ static bool kswapd_shrink_zone(struct zone *zone, { unsigned long balance_gap; bool lowmem_pressure; + struct pglist_data *pgdat = zone->zone_pgdat; /* Reclaim above the high watermark. */ sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); @@ -3087,7 +3120,8 @@ static bool kswapd_shrink_zone(struct zone *zone, shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - clear_bit(ZONE_WRITEBACK, &zone->flags); + /* TODO: ANOMALY */ + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); /* * If a zone reaches its high watermark, consider it to be no longer @@ -3095,10 +3129,10 @@ static bool kswapd_shrink_zone(struct zone *zone, * BDIs but as pressure is relieved, speculatively avoid congestion * waits. */ - if (zone_reclaimable(zone) && + if (pgdat_reclaimable(zone->zone_pgdat) && zone_balanced(zone, sc->order, false, 0, classzone_idx)) { - clear_bit(ZONE_CONGESTED, &zone->flags); - clear_bit(ZONE_DIRTY, &zone->flags); + clear_bit(PGDAT_CONGESTED, &pgdat->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); } return sc->nr_scanned >= sc->nr_to_reclaim; @@ -3157,7 +3191,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) continue; if (sc.priority != DEF_PRIORITY && - !zone_reclaimable(zone)) + !pgdat_reclaimable(zone->zone_pgdat)) continue; /* @@ -3184,9 +3218,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) /* * If balanced, clear the dirty and congested * flags + * + * TODO: ANOMALY */ - clear_bit(ZONE_CONGESTED, &zone->flags); - clear_bit(ZONE_DIRTY, &zone->flags); + clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); + clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); } } @@ -3216,7 +3252,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) continue; if (sc.priority != DEF_PRIORITY && - !zone_reclaimable(zone)) + !pgdat_reclaimable(zone->zone_pgdat)) continue; sc.nr_scanned = 0; @@ -3612,8 +3648,8 @@ int sysctl_min_slab_ratio = 5; static inline unsigned long zone_unmapped_file_pages(struct zone *zone) { unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); - unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_FILE); + unsigned long file_lru = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + + node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than @@ -3716,7 +3752,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (!zone_reclaimable(zone)) + if (!pgdat_reclaimable(zone->zone_pgdat)) return ZONE_RECLAIM_FULL; /* @@ -3795,7 +3831,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) zone = pagezone; spin_lock_irq(zone_lru_lock(zone)); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); if (!PageLRU(page) || !PageUnevictable(page)) continue; diff --git a/mm/vmstat.c b/mm/vmstat.c index 3345d396a99b..de0c17076270 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -936,11 +936,8 @@ const char * const vmstat_text[] = { /* enum zone_stat_item countes */ "nr_free_pages", "nr_alloc_batch", - "nr_inactive_anon", - "nr_active_anon", - "nr_inactive_file", - "nr_active_file", - "nr_unevictable", + "nr_zone_anon_lru", + "nr_zone_file_lru", "nr_mlock", "nr_anon_pages", "nr_mapped", @@ -956,12 +953,9 @@ const char * const vmstat_text[] = { "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_writeback_temp", - "nr_isolated_anon", - "nr_isolated_file", "nr_shmem", "nr_dirtied", "nr_written", - "nr_pages_scanned", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", #endif @@ -981,6 +975,16 @@ const char * const vmstat_text[] = { "nr_shmem_pmdmapped", "nr_free_cma", + /* Node-based counters */ + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", + "nr_unevictable", + "nr_isolated_anon", + "nr_isolated_file", + "nr_pages_scanned", + /* enum writeback_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", @@ -1002,11 +1006,11 @@ const char * const vmstat_text[] = { "pgmajfault", "pglazyfreed", - TEXTS_FOR_ZONES("pgrefill") - TEXTS_FOR_ZONES("pgsteal_kswapd") - TEXTS_FOR_ZONES("pgsteal_direct") - TEXTS_FOR_ZONES("pgscan_kswapd") - TEXTS_FOR_ZONES("pgscan_direct") + "pgrefill", + "pgsteal_kswapd", + "pgsteal_direct", + "pgscan_kswapd", + "pgscan_direct", "pgscan_direct_throttle", #ifdef CONFIG_NUMA @@ -1434,7 +1438,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu" + "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1442,13 +1446,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone_page_state(zone, NR_PAGES_SCANNED), + node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], + seq_printf(m, "\n %-12s %lu", vmstat_text[i], zone_page_state(zone, i)); seq_printf(m, @@ -1478,12 +1482,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, #endif } seq_printf(m, - "\n all_unreclaimable: %u" - "\n start_pfn: %lu" - "\n inactive_ratio: %u", - !zone_reclaimable(zone), + "\n node_unreclaimable: %u" + "\n start_pfn: %lu" + "\n node_inactive_ratio: %u", + !pgdat_reclaimable(zone->zone_pgdat), zone->zone_start_pfn, - zone->inactive_ratio); + zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); } @@ -1574,7 +1578,6 @@ static int vmstat_show(struct seq_file *m, void *arg) { unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index 5ffba0c0adc6..7820a7e1ca98 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -355,8 +355,8 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, LRU_ALL_FILE); } else { - pages = sum_zone_node_page_state(sc->nid, NR_ACTIVE_FILE) + - sum_zone_node_page_state(sc->nid, NR_INACTIVE_FILE); + pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); } /* -- cgit v1.3-8-gc7d7 From a5f5f91da6ad647fb0cc7fce0e17343c0d1c5a9a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 Jul 2016 15:46:32 -0700 Subject: mm: convert zone_reclaim to node_reclaim As reclaim is now per-node based, convert zone_reclaim to be node_reclaim. It is possible that a node will be reclaimed multiple times if it has multiple zones but this is unavoidable without caching all nodes traversed so far. The documentation and interface to userspace is the same from a configuration perspective and will will be similar in behaviour unless the node-local allocation requests were also limited to lower zones. Link: http://lkml.kernel.org/r/1467970510-21195-24-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Hillf Danton Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 18 +++++------ include/linux/swap.h | 9 +++--- include/linux/topology.h | 2 +- kernel/sysctl.c | 4 +-- mm/internal.h | 8 ++--- mm/khugepaged.c | 4 +-- mm/page_alloc.c | 24 ++++++++++----- mm/vmscan.c | 77 ++++++++++++++++++++++++------------------------ 8 files changed, 77 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e3d6d42722a0..e19c081c794e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -372,14 +372,6 @@ struct zone { unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ -#ifdef CONFIG_NUMA - /* - * zone reclaim becomes active if more unmapped pages exist. - */ - unsigned long min_unmapped_pages; - unsigned long min_slab_pages; -#endif /* CONFIG_NUMA */ - /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; @@ -525,7 +517,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum zone_flags { - ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ }; @@ -540,6 +531,7 @@ enum pgdat_flags { PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ + PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; static inline unsigned long zone_end_pfn(const struct zone *zone) @@ -688,6 +680,14 @@ typedef struct pglist_data { */ unsigned long totalreserve_pages; +#ifdef CONFIG_NUMA + /* + * zone reclaim becomes active if more unmapped pages exist. + */ + unsigned long min_unmapped_pages; + unsigned long min_slab_pages; +#endif /* CONFIG_NUMA */ + /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) spinlock_t lru_lock; diff --git a/include/linux/swap.h b/include/linux/swap.h index 2a23ddc96edd..b17cc4830fa6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -326,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page); extern unsigned long vm_total_pages; #ifdef CONFIG_NUMA -extern int zone_reclaim_mode; +extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; -extern int zone_reclaim(struct zone *, gfp_t, unsigned int); +extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); #else -#define zone_reclaim_mode 0 -static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) +#define node_reclaim_mode 0 +static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, + unsigned int order) { return 0; } diff --git a/include/linux/topology.h b/include/linux/topology.h index afce69296ac0..cb0775e1ee4b 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -54,7 +54,7 @@ int arch_update_cpu_topology(void); /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) - * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() + * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35f0dcb1cb4f..53954631a4e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_NUMA { .procname = "zone_reclaim_mode", - .data = &zone_reclaim_mode, - .maxlen = sizeof(zone_reclaim_mode), + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, diff --git a/mm/internal.h b/mm/internal.h index 2f80d0343c56..1e21b2d3838d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -433,10 +433,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -#define ZONE_RECLAIM_NOSCAN -2 -#define ZONE_RECLAIM_FULL -1 -#define ZONE_RECLAIM_SOME 0 -#define ZONE_RECLAIM_SUCCESS 1 +#define NODE_RECLAIM_NOSCAN -2 +#define NODE_RECLAIM_FULL -1 +#define NODE_RECLAIM_SOME 0 +#define NODE_RECLAIM_SUCCESS 1 extern int hwpoison_filter(struct page *p); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d03b14a6ef5e..d1423d790f6d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -672,10 +672,10 @@ static bool khugepaged_scan_abort(int nid) int i; /* - * If zone_reclaim_mode is disabled, then no extra effort is made to + * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ - if (!zone_reclaim_mode) + if (!node_reclaim_mode) return false; /* If there is a count for this node already, it must be acceptable */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2c56a13b065..c9d1720c58a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2942,16 +2942,16 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (zone_reclaim_mode == 0 || + if (node_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; - ret = zone_reclaim(zone, gfp_mask, order); + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); switch (ret) { - case ZONE_RECLAIM_NOSCAN: + case NODE_RECLAIM_NOSCAN: /* did not scan */ continue; - case ZONE_RECLAIM_FULL: + case NODE_RECLAIM_FULL: /* scanned but unreclaimable */ continue; default: @@ -5948,9 +5948,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) + pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio) / 100; - zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; + pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; zone->zone_pgdat = pgdat; @@ -6922,6 +6922,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + struct pglist_data *pgdat; struct zone *zone; int rc; @@ -6929,8 +6930,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, if (rc) return rc; + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + for_each_zone(zone) - zone->min_unmapped_pages = (zone->managed_pages * + zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * sysctl_min_unmapped_ratio) / 100; return 0; } @@ -6938,6 +6942,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + struct pglist_data *pgdat; struct zone *zone; int rc; @@ -6945,8 +6950,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, if (rc) return rc; + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + for_each_zone(zone) - zone->min_slab_pages = (zone->managed_pages * + zone->zone_pgdat->min_slab_pages += (zone->managed_pages * sysctl_min_slab_ratio) / 100; return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 31edd7776289..1013f37cd815 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3565,12 +3565,12 @@ module_init(kswapd_init) #ifdef CONFIG_NUMA /* - * Zone reclaim mode + * Node reclaim mode * - * If non-zero call zone_reclaim when the number of free pages falls below + * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ -int zone_reclaim_mode __read_mostly; +int node_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ @@ -3578,14 +3578,14 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* - * Priority for ZONE_RECLAIM. This determines the fraction of pages + * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ -#define ZONE_RECLAIM_PRIORITY 4 +#define NODE_RECLAIM_PRIORITY 4 /* - * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; @@ -3611,7 +3611,7 @@ static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static unsigned long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { unsigned long nr_pagecache_reclaimable; unsigned long delta = 0; @@ -3622,14 +3622,14 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_UNMAP) - nr_pagecache_reclaimable = node_page_state(zone->zone_pgdat, NR_FILE_PAGES); + if (node_reclaim_mode & RECLAIM_UNMAP) + nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else - nr_pagecache_reclaimable = node_unmapped_file_pages(zone->zone_pgdat); + nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); /* If we can't clean pages, remove dirty pages from consideration */ - if (!(zone_reclaim_mode & RECLAIM_WRITE)) - delta += node_page_state(zone->zone_pgdat, NR_FILE_DIRTY); + if (!(node_reclaim_mode & RECLAIM_WRITE)) + delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) @@ -3639,23 +3639,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) } /* - * Try to free up some pages from this zone through reclaim. + * Try to free up some pages from this node through reclaim. */ -static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; + int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, - .priority = ZONE_RECLAIM_PRIORITY, - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, - .reclaim_idx = zone_idx(zone), + .reclaim_idx = classzone_idx, }; cond_resched(); @@ -3669,13 +3670,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. */ do { - shrink_node(zone->zone_pgdat, &sc, zone_idx(zone)); + shrink_node(pgdat, &sc, classzone_idx); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } @@ -3685,49 +3686,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return sc.nr_reclaimed >= nr_pages; } -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { - int node_id; int ret; /* - * Zone reclaim reclaims unmapped file backed pages and + * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately - * thrown out if the zone is overallocated. So we do not reclaim - * if less than a specified percentage of the zone is used by + * thrown out if the node is overallocated. So we do not reclaim + * if less than a specified percentage of the node is used by * unmapped file backed pages. */ - if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) - return ZONE_RECLAIM_FULL; + if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && + sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(zone->zone_pgdat)) - return ZONE_RECLAIM_FULL; + if (!pgdat_reclaimable(pgdat)) + return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) - return ZONE_RECLAIM_NOSCAN; + return NODE_RECLAIM_NOSCAN; /* - * Only run zone reclaim on the local zone or on zones that do not + * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ - node_id = zone_to_nid(zone); - if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return ZONE_RECLAIM_NOSCAN; + if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) + return NODE_RECLAIM_NOSCAN; - if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) - return ZONE_RECLAIM_NOSCAN; + if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) + return NODE_RECLAIM_NOSCAN; - ret = __zone_reclaim(zone, gfp_mask, order); - clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); + ret = __node_reclaim(pgdat, gfp_mask, order); + clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); -- cgit v1.3-8-gc7d7 From 11db04864336f20e19e16b64ade781eeefc3f6d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jul 2016 15:48:11 -0700 Subject: mm: cleanup ifdef guards for vmem_altmap Now that ZONE_DEVICE depends on SPARSEMEM_VMEMMAP we can simplify some ifdef guards to just ZONE_DEVICE. Link: http://lkml.kernel.org/r/146687646788.39261.8020536391978771940.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Vlastimil Babka Cc: Eric Sandeen Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 2 +- kernel/memremap.c | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index bcaa634139a9..93416196ba64 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -26,7 +26,7 @@ struct vmem_altmap { unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); -#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE) +#ifdef CONFIG_ZONE_DEVICE struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); #else static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..ddb3247a872a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -308,12 +308,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { - dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", - __func__); - return ERR_PTR(-ENXIO); - } - if (!ref) return ERR_PTR(-EINVAL); @@ -401,7 +395,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -#ifdef CONFIG_SPARSEMEM_VMEMMAP struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) { /* @@ -427,5 +420,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ -- cgit v1.3-8-gc7d7 From d30dd8be06a5ae640766b20ea9ae288832bd12ac Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jul 2016 15:48:14 -0700 Subject: mm: track NR_KERNEL_STACK in KiB instead of number of stacks Currently, NR_KERNEL_STACK tracks the number of kernel stacks in a zone. This only makes sense if each kernel stack exists entirely in one zone, and allowing vmapped stacks could break this assumption. Since frv has THREAD_SIZE < PAGE_SIZE, we need to track kernel stack allocations in a unit that divides both THREAD_SIZE and PAGE_SIZE on all architectures. Keep it simple and use KiB. Link: http://lkml.kernel.org/r/083c71e642c5fa5f1b6898902e1b2db7b48940d4.1468523549.git.luto@kernel.org Signed-off-by: Andy Lutomirski Cc: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Reviewed-by: Josh Poimboeuf Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 2 +- kernel/fork.c | 3 ++- mm/page_alloc.c | 3 +-- 5 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/drivers/base/node.c b/drivers/base/node.c index 264cc214c4df..29cd96661b30 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -124,8 +124,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_FILE_MAPPED)), nid, K(node_page_state(pgdat, NR_ANON_MAPPED)), nid, K(i.sharedram), - nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK) * - THREAD_SIZE / 1024, + nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c1fdcc1a907a..09e18fdf61e5 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -147,7 +147,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + global_page_state(NR_KERNEL_STACK_KB), K(global_page_state(NR_PAGETABLE)), #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ca0fbc483441..f2e4e90621ec 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -121,7 +121,7 @@ enum zone_stat_item { NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ - NR_KERNEL_STACK, + NR_KERNEL_STACK_KB, /* measured in KiB */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) diff --git a/kernel/fork.c b/kernel/fork.c index de21f25e0d2c..af3637e0ee52 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -225,7 +225,8 @@ static void account_kernel_stack(unsigned long *stack, int account) { struct zone *zone = page_zone(virt_to_page(stack)); - mod_zone_page_state(zone, NR_KERNEL_STACK, account); + mod_zone_page_state(zone, NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); } void free_task(struct task_struct *tsk) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dfdb608f7b3d..c281125b2349 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4359,8 +4359,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), - zone_page_state(zone, NR_KERNEL_STACK) * - THREAD_SIZE / 1024, + zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), -- cgit v1.3-8-gc7d7 From efdc94907977d2db84b4b00cb9bd98ca011f6819 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jul 2016 15:48:17 -0700 Subject: mm: fix memcg stack accounting for sub-page stacks We should account for stacks regardless of stack size, and we need to account in sub-page units if THREAD_SIZE < PAGE_SIZE. Change the units to kilobytes and Move it into account_kernel_stack(). Fixes: 12580e4b54ba8 ("mm: memcontrol: report kernel stack usage in cgroup2 memory.stat") Link: http://lkml.kernel.org/r/9b5314e3ee5eda61b0317ec1563768602c1ef438.1468523549.git.luto@kernel.org Signed-off-by: Andy Lutomirski Cc: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Reviewed-by: Josh Poimboeuf Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- kernel/fork.c | 19 ++++++++----------- mm/memcontrol.c | 2 +- 3 files changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5147e650287a..5d8ca6e02e39 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,7 +52,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS, + MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, MEMCG_SOCK, diff --git a/kernel/fork.c b/kernel/fork.c index af3637e0ee52..52e725d4a866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); - if (page) - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - 1 << THREAD_SIZE_ORDER); - return page ? page_address(page) : NULL; } static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(stack); - - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - -(1 << THREAD_SIZE_ORDER)); - __free_pages(page, THREAD_SIZE_ORDER); + __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -223,10 +215,15 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(stack)); + /* All stack pages are in the same zone and belong to the same memcg. */ + struct page *first_page = virt_to_page(stack); - mod_zone_page_state(zone, NR_KERNEL_STACK_KB, + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); + + memcg_kmem_update_page_stat( + first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } void free_task(struct task_struct *tsk) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13be30c3ea78..c265212bec8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5171,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); + (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); -- cgit v1.3-8-gc7d7 From 8b70ca65616b3588ea1907e87f0df6d2530350df Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jul 2016 15:48:23 -0700 Subject: printk: when dumping regs, show the stack, not thread_info We currently show: task: ti: task.ti: " "ti" and "task.ti" are redundant, and neither is actually what we want to show, which the the base of the thread stack. Change the display to show the stack pointer explicitly. Link: http://lkml.kernel.org/r/543ac5bd66ff94000a57a02e11af7239571a3055.1468523549.git.luto@kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..d4de33934dac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); + printk("%stask: %p task.stack: %p\n", + log_lvl, current, task_stack_page(current)); } #endif -- cgit v1.3-8-gc7d7 From 784bdf3bb694b256fcd6120b93e8947a84249a3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Jul 2016 16:32:30 +0200 Subject: futex: Assume all mappings are private on !MMU systems To quote Rick why there is no need for shared mapping on !MMU systems: |With MMU, shared futex keys need to identify the physical backing for |a memory address because it may be mapped at different addresses in |different processes (or even multiple times in the same process). |Without MMU this cannot happen. You only have physical addresses. So |the "private futex" behavior of using the virtual address as the key |is always correct (for both shared and private cases) on nommu |systems. This patch disables the FLAGS_SHARED in a way that allows the compiler to remove that code. [bigeasy: Added changelog ] Reported-by: Rich Felker Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Andrew Morton Link: http://lkml.kernel.org/r/20160729143230.GA21715@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 33664f70e2d2..46cb3a301bc1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled; * Futex flags used to encode options to functions and preserve them across * restarts. */ -#define FLAGS_SHARED 0x01 +#ifdef CONFIG_MMU +# define FLAGS_SHARED 0x01 +#else +/* + * NOMMU does not have per process address space. Let the compiler optimize + * code away. + */ +# define FLAGS_SHARED 0x00 +#endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 @@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key) if (!key->both.ptr) return; + /* + * On MMU less systems futexes are always "private" as there is no per + * process address space. We need the smp wmb nevertheless - yes, + * arch/blackfin has MMU less SMP ... + */ + if (!IS_ENABLED(CONFIG_MMU)) { + smp_mb(); /* explicit smp_mb(); (B) */ + return; + } + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: ihold(key->shared.inode); /* implies smp_mb(); (B) */ @@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key) return; } + if (!IS_ENABLED(CONFIG_MMU)) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: iput(key->shared.inode); -- cgit v1.3-8-gc7d7 From e3f91083facb792dc8d8fd0a59639e4d6e7c0c8f Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Sat, 23 Jul 2016 14:42:37 +0530 Subject: jump_label: Make it possible for arches to invoke jump_label_init() earlier Some arches (powerpc at least) would like to invoke jump_label_init() much earlier in boot. So check static_key_initialized in order to make sure this function runs only once. LGTM-by: Ingo (http://marc.info/?l=linux-kernel&m=144049104329961&w=2) Signed-off-by: Kevin Hao Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- kernel/jump_label.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0dbea887d625..2d693be967df 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -235,6 +235,9 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; + if (static_key_initialized) + return; + jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); -- cgit v1.3-8-gc7d7 From 0d87d7ec22a0879d3926faa4f4f4412a5dee1fba Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 1 Aug 2016 13:49:29 -0700 Subject: perf/core: Change log level for duration warning to KERN_INFO When the perf interrupt handler exceeds a threshold warning messages are displayed on console: [12739.31793] perf interrupt took too long (2504 > 2500), lowering kernel.perf_event_max_sample_rate to 50000 [71340.165065] perf interrupt took too long (5005 > 5000), lowering kernel.perf_event_max_sample_rate to 25000 Many customers and users are confused by the message wondering if something is wrong or they need to take action to fix a problem. Since a user can not do anything to fix the issue, the message is really more informational than a warning. Adjust the log level accordingly. Signed-off-by: David Ahern Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470084569-438-1-git-send-email-dsa@cumulusnetworks.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 356a6c7cb52a..a19550d80ab1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -448,7 +448,7 @@ static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { - printk_ratelimited(KERN_WARNING + printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, -- cgit v1.3-8-gc7d7 From 377ccbb483738f84400ddf5840c7dd8825716985 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Jul 2016 22:30:43 -0400 Subject: Makefile: Mute warning for __builtin_return_address(>0) for tracing only With the latest gcc compilers, they give a warning if __builtin_return_address() parameter is greater than 0. That is because if it is used by a function called by a top level function (or in the case of the kernel, by assembly), it can try to access stack frames outside the stack and crash the system. The tracing system uses __builtin_return_address() of up to 2! But it is well aware of the dangers that it may have, and has even added precautions to protect against it (see the thunk code in arch/x86/entry/thunk*.S) Linus originally added KBUILD_CFLAGS that would suppress the warning for the entire kernel, as simply adding KBUILD_CFLAGS to the tracing directory wouldn't work. The tracing directory plays a bit with the CFLAGS and requires a little more logic. This adds that special logic to only suppress the warning for the tracing directory. If it is used anywhere else outside of tracing, the warning will still be triggered. Link: http://lkml.kernel.org/r/20160728223043.51996267@grimm.local.home Tested-by: Linus Torvalds Signed-off-by: Steven Rostedt --- Makefile | 1 - kernel/trace/Makefile | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Makefile b/Makefile index 393b6159ae92..d384848478b9 100644 --- a/Makefile +++ b/Makefile @@ -620,7 +620,6 @@ include arch/$(SRCARCH)/Makefile KBUILD_CFLAGS += $(call cc-option,-fno-delete-null-pointer-checks,) KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,) -KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,) ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 979e7bfbde7a..d0a1617b52b4 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,4 +1,8 @@ +# We are fully aware of the dangers of __builtin_return_address() +FRAME_CFLAGS := $(call cc-disable-warning,frame-address) +KBUILD_CFLAGS += $(FRAME_CFLAGS) + # Do not instrument the tracer itself: ifdef CONFIG_FUNCTION_TRACER -- cgit v1.3-8-gc7d7 From 47c1856971dd05cac730f70d073518da021b2e5c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Jun 2016 19:55:59 -0500 Subject: tracing: Fix use-after-free in hist_unreg_all/hist_enable_unreg_all While running tools/testing/selftests test suite with KASAN, Dmitry Vyukov hit the following use-after-free report: ================================================================== BUG: KASAN: use-after-free in hist_unreg_all+0x1a1/0x1d0 at addr ffff880031632cc0 Read of size 8 by task ftracetest/7413 ================================================================== BUG kmalloc-128 (Not tainted): kasan: bad access detected ------------------------------------------------------------------ This fixes the problem, along with the same problem in hist_enable_unreg_all(). Link: http://lkml.kernel.org/r/c3d05b79e42555b6e36a3a99aae0e37315ee5304.1467247517.git.tom.zanussi@linux.intel.com Cc: Dmitry Vyukov [Copied Steve's hist_enable_unreg_all() fix to hist_unreg_all()] Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0c05b8a99806..19ae135120a3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1500,9 +1500,9 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, static void hist_unreg_all(struct trace_event_file *file) { - struct event_trigger_data *test; + struct event_trigger_data *test, *n; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry_safe(test, n, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); @@ -1699,9 +1699,9 @@ hist_enable_get_trigger_ops(char *cmd, char *param) static void hist_enable_unreg_all(struct trace_event_file *file) { - struct event_trigger_data *test; + struct event_trigger_data *test, *n; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry_safe(test, n, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) { list_del_rcu(&test->list); update_cond_flag(file); -- cgit v1.3-8-gc7d7 From 7522c03ae307e657114ff909aec650304371a134 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 29 Jun 2016 19:56:00 -0500 Subject: tracing: Fix use-after-free in hist_register_trigger() This fixes a use-after-free case flagged by KASAN; make sure the test happens before the potential free in this case. Link: http://lkml.kernel.org/r/48fd74ab61bebd7dca9714386bb47d7c5ccd6a7b.1467247517.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 19ae135120a3..f3a960ed75a1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1441,6 +1441,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } + if (hist_data->attrs->pause) + data->paused = true; + if (named_data) { destroy_hist_data(data->private_data); data->private_data = named_data->private_data; @@ -1448,9 +1451,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, data->ops = &event_hist_trigger_named_ops; } - if (hist_data->attrs->pause) - data->paused = true; - if (data->ops->init) { ret = data->ops->init(data->ops, data); if (ret < 0) -- cgit v1.3-8-gc7d7 From 61e96496d3c949701a48b908f99f4ed891cd1101 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 2 Aug 2016 14:03:44 -0700 Subject: task_work: use READ_ONCE/lockless_dereference, avoid pi_lock if !task_works Change task_work_cancel() to use lockless_dereference(), this is what the code really wants but we didn't have this helper when it was written. Also add the fast-path task->task_works == NULL check, in the likely case this task has no pending works and we can avoid spin_lock(task->pi_lock). While at it, change other users of ACCESS_ONCE() to use READ_ONCE(). Link: http://lkml.kernel.org/r/20160610150042.GA13868@redhat.com Signed-off-by: Oleg Nesterov Cc: Andrea Parri Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/task_work.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index 6ab4842b00e8..d513051fcca2 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) struct callback_head *head; do { - head = ACCESS_ONCE(task->task_works); + head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; @@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; + + if (likely(!task->task_works)) + return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the @@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = ACCESS_ONCE(*pprev))) { - smp_read_barrier_depends(); + while ((work = lockless_dereference(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) @@ -95,7 +97,7 @@ void task_work_run(void) * work_exited unless the list is empty. */ do { - work = ACCESS_ONCE(task->task_works); + work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); -- cgit v1.3-8-gc7d7 From 9d5059c959ac739dbf837cec14586e58e7a67292 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Tue, 2 Aug 2016 14:03:47 -0700 Subject: dynamic_debug: only add header when used kernel.h header doesn't directly use dynamic debug, instead we can include it in module.c (which used it via kernel.h). printk.h only uses it if CONFIG_DYNAMIC_DEBUG is on, changing the inclusion to only happen in that case. Link: http://lkml.kernel.org/r/1468429793-16917-1-git-send-email-luisbg@osg.samsung.com [luisbg@osg.samsung.com: include dynamic_debug.h in drb_int.h] Link: http://lkml.kernel.org/r/1468447828-18558-2-git-send-email-luisbg@osg.samsung.com Signed-off-by: Luis de Bethencourt Cc: Rusty Russell Cc: Hidehiro Kawai Cc: Borislav Petkov Cc: Michal Nazarewicz Cc: Rasmus Villemoes Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/drbd/drbd_actlog.c | 1 - drivers/block/drbd/drbd_int.h | 1 + include/linux/kernel.h | 1 - include/linux/printk.h | 3 ++- kernel/module.c | 1 + 5 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 0a1aaf8c24c4..2d3d50ab74bf 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -27,7 +27,6 @@ #include #include #include -#include #include "drbd_int.h" diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 7b54354976a5..4cb8f21ff4ef 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c42082112ec8..d96a6118d26a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include diff --git a/include/linux/printk.h b/include/linux/printk.h index f136b22c7772..987c65ed34e5 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -289,10 +289,11 @@ extern asmlinkage void dump_stack(void) __cold; no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif -#include /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) +#include + /* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */ #define pr_debug(fmt, ...) \ dynamic_pr_debug(fmt, ##__VA_ARGS__) diff --git a/kernel/module.c b/kernel/module.c index 5f71aa63ed2a..a0f48b8b00da 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include "module-internal.h" -- cgit v1.3-8-gc7d7 From bebca05281d039e4144e1c51f402fd1d5f54b5e2 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 2 Aug 2016 14:03:50 -0700 Subject: printk: do not include interrupt.h A trivial cosmetic change: interrupt.h header is redundant since commit 6b898c07cb1d ("console: use might_sleep in console_lock"). Link: http://lkml.kernel.org/r/20160620132847.21930-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d4de33934dac..09af62e71fee 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -26,7 +26,6 @@ #include #include #include -#include /* For in_interrupt() */ #include #include #include -- cgit v1.3-8-gc7d7 From 874f9c7da9a4acbc1b9e12ca722579fb50e4d142 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 2 Aug 2016 14:03:53 -0700 Subject: printk: create pr_ functions Using functions instead of macros can reduce overall code size by eliminating unnecessary "KERN_SOH" prefixes from format strings. defconfig x86-64: $ size vmlinux* text data bss dec hex filename 10193570 4331464 1105920 15630954 ee826a vmlinux.new 10192623 4335560 1105920 15634103 ee8eb7 vmlinux.old As the return value are unimportant and unused in the kernel tree, these new functions return void. Miscellanea: - change pr_ macros to call new __pr_ functions - change vprintk_nmi and vprintk_default to add LOGLEVEL_ argument [akpm@linux-foundation.org: fix LOGLEVEL_INFO, per Joe] Link: http://lkml.kernel.org/r/e16cc34479dfefcae37c98b481e6646f0f69efc3.1466718827.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 48 +++++++++++++++++++++++++++++++++--------------- kernel/printk/internal.h | 16 ++++++++++------ kernel/printk/nmi.c | 13 +++++++++++-- kernel/printk/printk.c | 27 ++++++++++++++++++++++++--- 4 files changed, 78 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 987c65ed34e5..c2158f0f1499 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -257,21 +257,39 @@ extern asmlinkage void dump_stack(void) __cold; * and other debug macros are compiled out unless either DEBUG is defined * or CONFIG_DYNAMIC_DEBUG is set. */ -#define pr_emerg(fmt, ...) \ - printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) \ - printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) \ - printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) \ - printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warning(fmt, ...) \ - printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn pr_warning -#define pr_notice(fmt, ...) \ - printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) \ - printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) + +#ifdef CONFIG_PRINTK + +asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...); +asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...); + +#define pr_emerg(fmt, ...) __pr_emerg(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) __pr_alert(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) __pr_crit(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) __pr_err(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn(fmt, ...) __pr_warn(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_notice(fmt, ...) __pr_notice(pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr_info(pr_fmt(fmt), ##__VA_ARGS__) + +#else + +#define pr_emerg(fmt, ...) printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_notice(fmt, ...) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) + +#endif + +#define pr_warning pr_warn + /* * Like KERN_CONT, pr_cont() should only be used when continuing * a line with no newline ('\n') enclosed. Otherwise it defaults diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 7fd2838fa417..5d4505f30083 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,9 +16,11 @@ */ #include -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt, + va_list args); -int __printf(1, 0) vprintk_default(const char *fmt, va_list args); +__printf(2, 0) +int vprintk_default(int level, const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI @@ -31,9 +33,10 @@ extern raw_spinlock_t logbuf_lock; * via per-CPU variable. */ DECLARE_PER_CPU(printk_func_t, printk_func); -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +__printf(2, 0) +static inline int vprintk_func(int level, const char *fmt, va_list args) { - return this_cpu_read(printk_func)(fmt, args); + return this_cpu_read(printk_func)(level, fmt, args); } extern atomic_t nmi_message_lost; @@ -44,9 +47,10 @@ static inline int get_nmi_message_lost(void) #else /* CONFIG_PRINTK_NMI */ -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +__printf(2, 0) +static inline int vprintk_func(int level, const char *fmt, va_list args) { - return vprintk_default(fmt, args); + return vprintk_default(level, fmt, args); } static inline int get_nmi_message_lost(void) diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index b69eb8a2876f..bc3eeb1ae6da 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); * one writer running. But the buffer might get flushed from another * CPU, so we need to be careful. */ -static int vprintk_nmi(const char *fmt, va_list args) +static int vprintk_nmi(int level, const char *fmt, va_list args) { struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); int add = 0; @@ -79,7 +79,16 @@ again: if (!len) smp_rmb(); - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + if (level != LOGLEVEL_DEFAULT) { + add = snprintf(s->buffer + len, sizeof(s->buffer) - len, + KERN_SOH "%c", '0' + level); + add += vsnprintf(s->buffer + len + add, + sizeof(s->buffer) - len - add, + fmt, args); + } else { + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, + fmt, args); + } /* * Do it once again if the buffer has been flushed in the meantime. diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 09af62e71fee..d2accf2f4448 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1801,7 +1801,28 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); -int vprintk_default(const char *fmt, va_list args) +#ifdef CONFIG_PRINTK +#define define_pr_level(func, loglevel) \ +asmlinkage __visible void func(const char *fmt, ...) \ +{ \ + va_list args; \ + \ + va_start(args, fmt); \ + vprintk_default(loglevel, fmt, args); \ + va_end(args); \ +} \ +EXPORT_SYMBOL(func) + +define_pr_level(__pr_emerg, LOGLEVEL_EMERG); +define_pr_level(__pr_alert, LOGLEVEL_ALERT); +define_pr_level(__pr_crit, LOGLEVEL_CRIT); +define_pr_level(__pr_err, LOGLEVEL_ERR); +define_pr_level(__pr_warn, LOGLEVEL_WARNING); +define_pr_level(__pr_notice, LOGLEVEL_NOTICE); +define_pr_level(__pr_info, LOGLEVEL_INFO); +#endif + +int vprintk_default(int level, const char *fmt, va_list args) { int r; @@ -1811,7 +1832,7 @@ int vprintk_default(const char *fmt, va_list args) return r; } #endif - r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + r = vprintk_emit(0, level, NULL, 0, fmt, args); return r; } @@ -1844,7 +1865,7 @@ asmlinkage __visible int printk(const char *fmt, ...) int r; va_start(args, fmt); - r = vprintk_func(fmt, args); + r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args); va_end(args); return r; -- cgit v1.3-8-gc7d7 From cf7754441c563230ed74096fcd4b8cca49910550 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 2 Aug 2016 14:03:56 -0700 Subject: printk: introduce suppress_message_printing() Messages' levels and console log level are inspected when the actual printing occurs, which may provoke console_unlock() and console_cont_flush() to waste CPU cycles on every message that has loglevel above the current console_loglevel. Schematically, console_unlock() does the following: console_unlock() { ... for (;;) { ... raw_spin_lock_irqsave(&logbuf_lock, flags); skip: msg = log_from_idx(console_idx); if (msg->flags & LOG_NOCONS) { ... goto skip; } level = msg->level; len += msg_print_text(); >> sprintfs memcpy, etc. if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(); >> scnprintf ext_len += msg_print_ext_body(); >> scnprintfs etc. } ... raw_spin_unlock(&logbuf_lock); call_console_drivers(level, ext_text, ext_len, text, len) { if (level >= console_loglevel && >> drop the message !ignore_loglevel) return; console->write(...); } local_irq_restore(flags); } ... } The thing here is this deferred `level >= console_loglevel' check. We are wasting CPU cycles on sprintfs/memcpy/etc. preparing the messages that we will eventually drop. This can be huge when we register a new CON_PRINTBUFFER console, for instance. For every such a console register_console() resets the console_seq, console_idx, console_prev and sets a `exclusive console' pointer to replay the log buffer to that just-registered console. And there can be a lot of messages to replay, in the worst case most of which can be dropped after console_loglevel test. We know messages' levels long before we call msg_print_text() and friends, so we can just move console_loglevel check out of call_console_drivers() and format a new message only if we are sure that it won't be dropped. The patch factors out loglevel check into suppress_message_printing() function and tests message->level and console_loglevel before formatting functions in console_unlock() and console_cont_flush() are getting executed. This improves things not only for exclusive CON_PRINTBUFFER consoles, but for every console_unlock() that attempts to print a message of level above the console_loglevel. Link: http://lkml.kernel.org/r/20160627135012.8229-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Cc: Tejun Heo Cc: Jan Kara Cc: Calvin Owens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d2accf2f4448..8bdce14254f4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -985,6 +985,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); +static bool suppress_message_printing(int level) +{ + return (level >= console_loglevel && !ignore_loglevel); +} + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -1014,7 +1019,7 @@ static void boot_delay_msec(int level) unsigned long timeout; if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) - || (level >= console_loglevel && !ignore_loglevel)) { + || suppress_message_printing(level)) { return; } @@ -1438,8 +1443,6 @@ static void call_console_drivers(int level, trace_console(text, len); - if (level >= console_loglevel && !ignore_loglevel) - return; if (!console_drivers) return; @@ -1908,6 +1911,7 @@ static void call_console_drivers(int level, static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } +static bool suppress_message_printing(int level) { return false; } /* Still needs to be defined for users */ DEFINE_PER_CPU(printk_func_t, printk_func); @@ -2187,6 +2191,13 @@ static void console_cont_flush(char *text, size_t size) if (!cont.len) goto out; + if (suppress_message_printing(cont.level)) { + cont.cons = cont.len; + if (cont.flushed) + cont.len = 0; + goto out; + } + /* * We still queue earlier records, likely because the console was * busy. The earlier ones need to be printed before this one, we @@ -2290,10 +2301,13 @@ skip: break; msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { + level = msg->level; + if ((msg->flags & LOG_NOCONS) || + suppress_message_printing(level)) { /* * Skip record we have buffered and already printed - * directly to the console when we received it. + * directly to the console when we received it, and + * record that has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; @@ -2307,7 +2321,6 @@ skip: goto skip; } - level = msg->level; len += msg_print_text(msg, console_prev, false, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { -- cgit v1.3-8-gc7d7 From 40a7d9f5f90681c6d7890b6a07f230bb4afe7e39 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Aug 2016 14:03:59 -0700 Subject: printk: include instead of asm-generic headers are generic implementations for architecture specific code and should not be included by common code. Thus use the asm/ version of sections.h to get at the linker sections. Link: http://lkml.kernel.org/r/1468285008-7331-1-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8bdce14254f4..70c66c5ba212 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -47,7 +47,7 @@ #include #include -#include +#include #define CREATE_TRACE_POINTS #include -- cgit v1.3-8-gc7d7 From 750afe7babd117daabebf4855da18e4418ea845e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Aug 2016 14:04:07 -0700 Subject: printk: add kernel parameter to control writes to /dev/kmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a "printk.devkmsg" kernel command line parameter which controls how userspace writes into /dev/kmsg. It has three options: * ratelimit - ratelimit logging from userspace. * on - unlimited logging from userspace * off - logging from userspace gets ignored The default setting is to ratelimit the messages written to it. This changes the kernel default setting of "on" to "ratelimit" and we do that because we want to keep userspace spamming /dev/kmsg to sane levels. This is especially moot when a small kernel log buffer wraps around and messages get lost. So the ratelimiting setting should be a sane setting where kernel messages should have a bit higher chance of survival from all the spamming. It additionally does not limit logging to /dev/kmsg while the system is booting if we haven't disabled it on the command line. Furthermore, we can control the logging from a lower priority sysctl interface - kernel.printk_devkmsg. That interface will succeed only if printk.devkmsg *hasn't* been supplied on the command line. If it has, then printk.devkmsg is a one-time setting which remains for the duration of the system lifetime. This "locking" of the setting is to prevent userspace from changing the logging on us through sysctl(2). This patch is based on previous patches from Linus and Steven. [bp@suse.de: fixes] Link: http://lkml.kernel.org/r/20160719072344.GC25563@nazgul.tnic Link: http://lkml.kernel.org/r/20160716061745.15795-3-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Dave Young Cc: Franck Bui Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 7 ++ Documentation/sysctl/kernel.txt | 14 ++++ include/linux/printk.h | 9 +++ kernel/printk/printk.c | 142 ++++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 7 ++ 5 files changed, 171 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e24aa11e8f8a..b240540e49f2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3173,6 +3173,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: (1/Y/y=enable, 0/N/n=disable) default: disabled + printk.devkmsg={on,off,ratelimit} + Control writing to /dev/kmsg. + on - unlimited logging to /dev/kmsg from userspace + off - logging to /dev/kmsg disabled + ratelimit - ratelimit the logging + Default: ratelimit + printk.time= Show timing data prefixed to each printk message line Format: (1/Y/y=enable, 0/N/n=disable) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 33204604de6c..ffab8b5caa60 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -764,6 +764,20 @@ send before ratelimiting kicks in. ============================================================== +printk_devkmsg: + +Control the logging to /dev/kmsg from userspace: + +ratelimit: default, ratelimited +on: unlimited logging to /dev/kmsg from userspace +off: logging to /dev/kmsg disabled + +The kernel command line parameter printk.devkmsg= overrides this and is +a one-time setting until next reboot: once set, it cannot be changed by +this sysctl interface anymore. + +============================================================== + randomize_va_space: This option can be used to select the type of process address diff --git a/include/linux/printk.h b/include/linux/printk.h index c2158f0f1499..8dc155dab3ed 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -61,6 +61,11 @@ static inline void console_verbose(void) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } +/* strlen("ratelimit") + 1 */ +#define DEVKMSG_STR_MAX_SIZE 10 +extern char devkmsg_log_str[]; +struct ctl_table; + struct va_format { const char *fmt; va_list *va; @@ -175,6 +180,10 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; +extern int +devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos); + extern void wake_up_klogd(void); char *log_buf_addr_get(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 70c66c5ba212..a5ef95ca18c9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -85,6 +85,111 @@ static struct lockdep_map console_lock_dep_map = { }; #endif +enum devkmsg_log_bits { + __DEVKMSG_LOG_BIT_ON = 0, + __DEVKMSG_LOG_BIT_OFF, + __DEVKMSG_LOG_BIT_LOCK, +}; + +enum devkmsg_log_masks { + DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), + DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), + DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), +}; + +/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ +#define DEVKMSG_LOG_MASK_DEFAULT 0 + +static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; + +static int __control_devkmsg(char *str) +{ + if (!str) + return -EINVAL; + + if (!strncmp(str, "on", 2)) { + devkmsg_log = DEVKMSG_LOG_MASK_ON; + return 2; + } else if (!strncmp(str, "off", 3)) { + devkmsg_log = DEVKMSG_LOG_MASK_OFF; + return 3; + } else if (!strncmp(str, "ratelimit", 9)) { + devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; + return 9; + } + return -EINVAL; +} + +static int __init control_devkmsg(char *str) +{ + if (__control_devkmsg(str) < 0) + return 1; + + /* + * Set sysctl string accordingly: + */ + if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { + memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); + strncpy(devkmsg_log_str, "on", 2); + } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { + memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); + strncpy(devkmsg_log_str, "off", 3); + } + /* else "ratelimit" which is set by default. */ + + /* + * Sysctl cannot change it anymore. The kernel command line setting of + * this parameter is to force the setting to be permanent throughout the + * runtime of the system. This is a precation measure against userspace + * trying to be a smarta** and attempting to change it up on us. + */ + devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; + + return 0; +} +__setup("printk.devkmsg=", control_devkmsg); + +char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; + +int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char old_str[DEVKMSG_STR_MAX_SIZE]; + unsigned int old; + int err; + + if (write) { + if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) + return -EINVAL; + + old = devkmsg_log; + strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE); + } + + err = proc_dostring(table, write, buffer, lenp, ppos); + if (err) + return err; + + if (write) { + err = __control_devkmsg(devkmsg_log_str); + + /* + * Do not accept an unknown string OR a known string with + * trailing crap... + */ + if (err < 0 || (err + 1 != *lenp)) { + + /* ... and restore old setting. */ + devkmsg_log = old; + strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE); + + return -EINVAL; + } + } + + return 0; +} + /* * Number of registered extended console drivers. * @@ -613,6 +718,7 @@ struct devkmsg_user { u64 seq; u32 idx; enum log_flags prev; + struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; }; @@ -622,11 +728,24 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ + struct file *file = iocb->ki_filp; + struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; - if (len > LOG_LINE_MAX) + if (!user || len > LOG_LINE_MAX) return -EINVAL; + + /* Ignore when user logging is disabled. */ + if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) + return len; + + /* Ratelimit when not explicitly enabled. */ + if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { + if (!___ratelimit(&user->rs, current->comm)) + return ret; + } + buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; @@ -799,19 +918,24 @@ static int devkmsg_open(struct inode *inode, struct file *file) struct devkmsg_user *user; int err; - /* write-only does not need any file context */ - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - return 0; + if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) + return -EPERM; - err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, - SYSLOG_FROM_READER); - if (err) - return err; + /* write-only does not need any file context */ + if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); + if (err) + return err; + } user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; + ratelimit_default_init(&user->rs); + ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); + mutex_init(&user->lock); raw_spin_lock_irq(&logbuf_lock); @@ -830,6 +954,8 @@ static int devkmsg_release(struct inode *inode, struct file *file) if (!user) return 0; + ratelimit_state_exit(&user->rs); + mutex_destroy(&user->lock); kfree(user); return 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53954631a4e1..b43d0b27c1fe 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -813,6 +813,13 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &ten_thousand, }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, { .procname = "dmesg_restrict", .data = &dmesg_restrict, -- cgit v1.3-8-gc7d7 From 627393d44860386e948bb63a8e5b53f2cc44d070 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 2 Aug 2016 14:05:40 -0700 Subject: kernel/exit.c: quieten greatest stack depth printk Many targets enable CONFIG_DEBUG_STACK_USAGE, and while the information is useful, it isn't worthy of pr_warn(). Reduce it to pr_info(). Link: http://lkml.kernel.org/r/1466982072-29836-1-git-send-email-anton@ozlabs.org Signed-off-by: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 84ae830234f8..2f974ae042a6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -715,7 +715,7 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } -- cgit v1.3-8-gc7d7 From 4caf9615247aceab56e91df6c0e11892ea55f4f0 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Tue, 2 Aug 2016 14:05:45 -0700 Subject: kexec: return error number directly This is a cleanup patch to make kexec more clear to return error number directly. The variable result is useless, because there is no other function's return value assignes to it. So remove it. Link: http://lkml.kernel.org/r/1464179273-57668-1-git-send-email-mnghuan@gmail.com Signed-off-by: Minfei Huang Cc: Dave Young Cc: Baoquan He Cc: Borislav Petkov Cc: Xunlei Pang Cc: Atsushi Kumagai Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 56b3ed0927b0..23311c803b1b 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -147,7 +147,7 @@ static struct page *kimage_alloc_page(struct kimage *image, int sanity_check_segment_list(struct kimage *image) { - int result, i; + int i; unsigned long nr_segments = image->nr_segments; /* @@ -163,16 +163,15 @@ int sanity_check_segment_list(struct kimage *image) * simply because addresses are changed to page size * granularity. */ - result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - return result; + return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - return result; + return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. @@ -180,7 +179,6 @@ int sanity_check_segment_list(struct kimage *image) * through very weird things can happen with no * easy explanation as one segment stops on another. */ - result = -EINVAL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; @@ -194,7 +192,7 @@ int sanity_check_segment_list(struct kimage *image) pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - return result; + return -EINVAL; } } @@ -203,10 +201,9 @@ int sanity_check_segment_list(struct kimage *image) * and it is easier to check up front than to be surprised * later on. */ - result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - return result; + return -EINVAL; } /* @@ -220,7 +217,6 @@ int sanity_check_segment_list(struct kimage *image) */ if (image->type == KEXEC_TYPE_CRASH) { - result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; @@ -229,7 +225,7 @@ int sanity_check_segment_list(struct kimage *image) /* Ensure we are within the crash kernel limits */ if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - return result; + return -EADDRNOTAVAIL; } } -- cgit v1.3-8-gc7d7 From 465d377701dfe6a08a9f361a3fd926dea7f89c74 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 2 Aug 2016 14:05:57 -0700 Subject: kexec: ensure user memory sizes do not wrap Ensure that user memory sizes do not wrap around when validating the user input, which can lead to the following input validation working incorrectly. [akpm@linux-foundation.org: fix it for kexec-return-error-number-directly.patch] Link: http://lkml.kernel.org/r/E1b8koF-0004HM-5x@rmk-PC.armlinux.org.uk Signed-off-by: Russell King Reviewed-by: Pratyush Anand Acked-by: Baoquan He Cc: Keerthy Cc: Vitaly Andrianov Cc: Eric Biederman Cc: Dave Young Cc: Vivek Goyal Cc: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 23311c803b1b..5a83b2a9d584 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -168,6 +168,8 @@ int sanity_check_segment_list(struct kimage *image) mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; + if (mstart > mend) + return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) -- cgit v1.3-8-gc7d7 From dae28018f56645b61f5beb84d5831346d3c5e457 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 2 Aug 2016 14:06:00 -0700 Subject: kdump: arrange for paddr_vmcoreinfo_note() to return phys_addr_t On PAE systems (eg, ARM LPAE) the vmcore note may be located above 4GB physical on 32-bit architectures, so we need a wider type than "unsigned long" here. Arrange for paddr_vmcoreinfo_note() to return a phys_addr_t, thereby allowing it to be located above 4GB. This makes no difference for kexec-tools, as they already assume a 64-bit type when reading from this file. Link: http://lkml.kernel.org/r/E1b8koK-0004HS-K9@rmk-PC.armlinux.org.uk Signed-off-by: Russell King Reviewed-by: Pratyush Anand Acked-by: Baoquan He Cc: Keerthy Cc: Vitaly Andrianov Cc: Eric Biederman Cc: Dave Young Cc: Vivek Goyal Cc: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/machine_kexec.c | 2 +- include/linux/kexec.h | 2 +- kernel/kexec_core.c | 2 +- kernel/ksysfs.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index b72cd7a07222..599507bcec91 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -163,7 +163,7 @@ void arch_crash_save_vmcoreinfo(void) #endif } -unsigned long paddr_vmcoreinfo_note(void) +phys_addr_t paddr_vmcoreinfo_note(void) { return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index ce2fe197f583..555227f0029f 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -233,7 +233,7 @@ void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); __printf(1, 2) void vmcoreinfo_append_str(const char *fmt, ...); -unsigned long paddr_vmcoreinfo_note(void); +phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_OSRELEASE(value) \ vmcoreinfo_append_str("OSRELEASE=%s\n", value) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5a83b2a9d584..dab03f17be25 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1372,7 +1372,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __weak paddr_vmcoreinfo_note(void) +phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 152da4a48867..9f1920d2d0c6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size); static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lx %x\n", - paddr_vmcoreinfo_note(), + phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); + return sprintf(buf, "%pa %x\n", &vmcore_base, (unsigned int)sizeof(vmcoreinfo_note)); } KERNEL_ATTR_RO(vmcoreinfo); -- cgit v1.3-8-gc7d7 From 43546d8669d62d75fa69ca9a45d2f586665f56bd Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 2 Aug 2016 14:06:04 -0700 Subject: kexec: allow architectures to override boot mapping kexec physical addresses are the boot-time view of the system. For certain ARM systems (such as Keystone 2), the boot view of the system does not match the kernel's view of the system: the boot view uses a special alias in the lower 4GB of the physical address space. To cater for these kinds of setups, we need to translate between the boot view physical addresses and the normal kernel view physical addresses. This patch extracts the current transation points into linux/kexec.h, and allows an architecture to override the functions. Due to the translations required, we unfortunately end up with six translation functions, which are reduced down to four that the architecture can override. [akpm@linux-foundation.org: kexec.h needs asm/io.h for phys_to_virt()] Link: http://lkml.kernel.org/r/E1b8koP-0004HZ-Vf@rmk-PC.armlinux.org.uk Signed-off-by: Russell King Cc: Keerthy Cc: Pratyush Anand Cc: Vitaly Andrianov Cc: Eric Biederman Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/kexec.c | 3 ++- kernel/kexec_core.c | 26 +++++++++++++------------- 3 files changed, 55 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 555227f0029f..23e14a460cfb 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -14,6 +14,8 @@ #if !defined(__ASSEMBLY__) +#include + #include #ifdef CONFIG_KEXEC_CORE @@ -318,6 +320,44 @@ int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, void arch_kexec_protect_crashkres(void); void arch_kexec_unprotect_crashkres(void); +#ifndef page_to_boot_pfn +static inline unsigned long page_to_boot_pfn(struct page *page) +{ + return page_to_pfn(page); +} +#endif + +#ifndef boot_pfn_to_page +static inline struct page *boot_pfn_to_page(unsigned long boot_pfn) +{ + return pfn_to_page(boot_pfn); +} +#endif + +#ifndef phys_to_boot_phys +static inline unsigned long phys_to_boot_phys(phys_addr_t phys) +{ + return phys; +} +#endif + +#ifndef boot_phys_to_phys +static inline phys_addr_t boot_phys_to_phys(unsigned long boot_phys) +{ + return boot_phys; +} +#endif + +static inline unsigned long virt_to_boot_phys(void *addr) +{ + return phys_to_boot_phys(__pa((unsigned long)addr)); +} + +static inline void *boot_phys_to_virt(unsigned long entry) +{ + return phys_to_virt(boot_phys_to_phys(entry)); +} + #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; diff --git a/kernel/kexec.c b/kernel/kexec.c index 4384672d3245..980936a90ee6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (kexec_on_panic) { /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) + if ((entry < phys_to_boot_phys(crashk_res.start)) || + (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index dab03f17be25..73d4c5f57dd8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -225,8 +225,8 @@ int sanity_check_segment_list(struct kimage *image) mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || - (mend > crashk_res.end)) + if ((mstart < phys_to_boot_phys(crashk_res.start)) || + (mend > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } } @@ -350,7 +350,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; - pfn = page_to_pfn(pages); + pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; @@ -476,7 +476,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) return -ENOMEM; ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); @@ -531,13 +531,13 @@ void kimage_terminate(struct kimage *image) #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ - phys_to_virt((entry & PAGE_MASK)) : ptr + 1) + boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; - page = pfn_to_page(entry >> PAGE_SHIFT); + page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } @@ -631,7 +631,7 @@ static struct page *kimage_alloc_page(struct kimage *image, * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; @@ -646,12 +646,12 @@ static struct page *kimage_alloc_page(struct kimage *image, if (!page) return NULL; /* If the page cannot be used file it away */ - if (page_to_pfn(page) > + if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) @@ -674,7 +674,7 @@ static struct page *kimage_alloc_page(struct kimage *image, struct page *old_page; old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); @@ -730,7 +730,7 @@ static int kimage_load_normal_segment(struct kimage *image, result = -ENOMEM; goto out; } - result = kimage_add_page(image, page_to_pfn(page) + result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; @@ -791,7 +791,7 @@ static int kimage_load_crash_segment(struct kimage *image, char *ptr; size_t uchunk, mchunk; - page = pfn_to_page(maddr >> PAGE_SHIFT); + page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; @@ -919,7 +919,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) -- cgit v1.3-8-gc7d7 From b26e27ddfd2a986dc53e259aba572f3aac182eb8 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 2 Aug 2016 14:06:13 -0700 Subject: kexec: use core_param for crash_kexec_post_notifiers boot option crash_kexec_post_notifiers ia a boot option which controls whether the 1st kernel calls panic notifiers or not before booting the 2nd kernel. However, there is no need to limit it to being modifiable only at boot time. So, use core_param instead of early_param. Link: http://lkml.kernel.org/r/20160705113327.5864.43139.stgit@softrs Signed-off-by: Hidehiro Kawai Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Eric Biederman Cc: Masami Hiramatsu Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8aa74497cc5a..ca8cea1ef673 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -108,6 +108,7 @@ void panic(const char *fmt, ...) long i, i_next = 0; int state = 0; int old_cpu, this_cpu; + bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; /* * Disable local interrupts. This will prevent panic_smp_self_stop @@ -160,7 +161,7 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) { + if (!_crash_kexec_post_notifiers) { printk_nmi_flush_on_panic(); __crash_kexec(NULL); } @@ -191,7 +192,7 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (crash_kexec_post_notifiers) + if (_crash_kexec_post_notifiers) __crash_kexec(NULL); bust_spinlocks(0); @@ -571,13 +572,7 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); - -static int __init setup_crash_kexec_post_notifiers(char *s) -{ - crash_kexec_post_notifiers = true; - return 0; -} -early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); +core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); static int __init oops_setup(char *s) { -- cgit v1.3-8-gc7d7 From 21db79e8bb054d0351a6b1b464f1c9c47a2e6e8d Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Tue, 2 Aug 2016 14:06:16 -0700 Subject: kexec: add a kexec_crash_loaded() function Provide a wrapper function to be used by kernel code to check whether a crash kernel is loaded. It returns the same value that can be seen in /sys/kernel/kexec_crash_loaded by userspace programs. I'm exporting the function, because it will be used by Xen, and it is possible to compile Xen modules separately to enable the use of PV drivers with unmodified bare-metal kernels. Link: http://lkml.kernel.org/r/20160713121955.14969.69080.stgit@hananiah.suse.cz Signed-off-by: Petr Tesarik Cc: Juergen Gross Cc: Josh Triplett Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Boris Ostrovsky Cc: "Paul E. McKenney" Cc: Dave Young Cc: David Vrabel Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 ++ kernel/kexec_core.c | 6 ++++++ kernel/ksysfs.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 23e14a460cfb..d7437777baaa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -230,6 +230,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, extern void __crash_kexec(struct pt_regs *); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); +int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); @@ -364,6 +365,7 @@ struct task_struct; static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } +static inline int kexec_crash_loaded(void) { return 0; } #define kexec_in_progress false #endif /* CONFIG_KEXEC_CORE */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 73d4c5f57dd8..704534029a00 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -95,6 +95,12 @@ int kexec_should_crash(struct task_struct *p) return 0; } +int kexec_crash_loaded(void) +{ + return !!kexec_crash_image; +} +EXPORT_SYMBOL_GPL(kexec_crash_loaded); + /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 9f1920d2d0c6..ee1bc1bb8feb 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -101,7 +101,7 @@ KERNEL_ATTR_RO(kexec_loaded); static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", !!kexec_crash_image); + return sprintf(buf, "%d\n", kexec_crash_loaded()); } KERNEL_ATTR_RO(kexec_crash_loaded); -- cgit v1.3-8-gc7d7 From 1730f146604ea426e54938cdbcf87df1047ef0dc Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Tue, 2 Aug 2016 14:06:22 -0700 Subject: kexec: add restriction on kexec_load() segment sizes I hit the following issue when run trinity in my system. The kernel is 3.4 version, but mainline has the same issue. The root cause is that the segment size is too large so the kerenl spends too long trying to allocate a page. Other cases will block until the test case quits. Also, OOM conditions will occur. Call Trace: __alloc_pages_nodemask+0x14c/0x8f0 alloc_pages_current+0xaf/0x120 kimage_alloc_pages+0x10/0x60 kimage_alloc_control_pages+0x5d/0x270 machine_kexec_prepare+0xe5/0x6c0 ? kimage_free_page_list+0x52/0x70 sys_kexec_load+0x141/0x600 ? vfs_write+0x100/0x180 system_call_fastpath+0x16/0x1b The patch changes sanity_check_segment_list() to verify that the usage by all segments does not exceed half of memory. [akpm@linux-foundation.org: fix for kexec-return-error-number-directly.patch, update comment] Link: http://lkml.kernel.org/r/1469625474-53904-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Suggested-by: Eric W. Biederman Cc: Vivek Goyal Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 704534029a00..561675589511 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -146,6 +146,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded); * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) +#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, @@ -155,6 +156,7 @@ int sanity_check_segment_list(struct kimage *image) { int i; unsigned long nr_segments = image->nr_segments; + unsigned long total_pages = 0; /* * Verify we have good destination addresses. The caller is @@ -214,6 +216,21 @@ int sanity_check_segment_list(struct kimage *image) return -EINVAL; } + /* + * Verify that no more than half of memory will be consumed. If the + * request from userspace is too large, a large amount of time will be + * wasted allocating pages, which can cause a soft lockup. + */ + for (i = 0; i < nr_segments; i++) { + if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2) + return -EINVAL; + + total_pages += PAGE_COUNT(image->segment[i].memsz); + } + + if (total_pages > totalram_pages / 2) + return -EINVAL; + /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't -- cgit v1.3-8-gc7d7 From 59dbb2a06fc2bcb752b964e036884fe9bb9dbbe0 Mon Sep 17 00:00:00 2001 From: Akash Goel Date: Tue, 2 Aug 2016 14:07:18 -0700 Subject: relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel Cc: Eduard - Gabriel Munteanu Cc: Tom Zanussi Cc: Chris Wilson Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 04d7cf3ef8cf..d797502140b9 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -451,6 +451,13 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) if (!dentry) goto free_buf; relay_set_buf_dentry(buf, dentry); + } else { + /* Only retrieve global info, nothing more, nothing less */ + dentry = chan->cb->create_buf_file(NULL, NULL, + S_IRUSR, buf, + &chan->is_global); + if (WARN_ON(dentry)) + goto free_buf; } buf->cpu = cpu; @@ -562,6 +569,10 @@ static int relay_hotcpu_callback(struct notifier_block *nb, * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File * permissions will be %S_IRUSR. + * + * If opening a buffer (@parent = NULL) that you later wish to register + * in a filesystem, call relay_late_setup_files() once the @parent dentry + * is available. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -640,8 +651,12 @@ static void __relay_set_buf_dentry(void *info) * * Returns 0 if successful, non-zero otherwise. * - * Use to setup files for a previously buffer-only channel. - * Useful to do early tracing in kernel, before VFS is up, for example. + * Use to setup files for a previously buffer-only channel created + * by relay_open() with a NULL parent dentry. + * + * For example, this is useful for perfomring early tracing in kernel, + * before VFS is up and then exposing the early results once the dentry + * is available. */ int relay_late_setup_files(struct rchan *chan, const char *base_filename, @@ -666,6 +681,20 @@ int relay_late_setup_files(struct rchan *chan, } chan->has_base_filename = 1; chan->parent = parent; + + if (chan->is_global) { + err = -EINVAL; + if (!WARN_ON_ONCE(!chan->buf[0])) { + dentry = relay_create_buf_file(chan, chan->buf[0], 0); + if (dentry && !WARN_ON_ONCE(!chan->is_global)) { + relay_set_buf_dentry(chan->buf[0], dentry); + err = 0; + } + } + mutex_unlock(&relay_channels_mutex); + return err; + } + curr_cpu = get_cpu(); /* * The CPU hotplug notifier ran before us and created buffers with @@ -706,6 +735,7 @@ int relay_late_setup_files(struct rchan *chan, return err; } +EXPORT_SYMBOL_GPL(relay_late_setup_files); /** * relay_switch_subbuf - switch to a new sub-buffer -- cgit v1.3-8-gc7d7 From 27eb6622ab67bad75814c9b7b08096cfb16be63a Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 2 Aug 2016 14:07:24 -0700 Subject: config: add android config fragments Copy the config fragments from the AOSP common kernel android-4.4 branch. It is becoming possible to run mainline kernels with Android, but the kernel defconfigs don't work as-is and debugging missing config options is a pain. Adding the config fragments into the kernel tree, makes configuring a mainline kernel as simple as: make ARCH=arm multi_v7_defconfig android-base.config android-recommended.config The following non-upstream config options were removed: CONFIG_NETFILTER_XT_MATCH_QTAGUID CONFIG_NETFILTER_XT_MATCH_QUOTA2 CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG CONFIG_PPPOLAC CONFIG_PPPOPNS CONFIG_SECURITY_PERF_EVENTS_RESTRICT CONFIG_USB_CONFIGFS_F_MTP CONFIG_USB_CONFIGFS_F_PTP CONFIG_USB_CONFIGFS_F_ACC CONFIG_USB_CONFIGFS_F_AUDIO_SRC CONFIG_USB_CONFIGFS_UEVENT CONFIG_INPUT_KEYCHORD CONFIG_INPUT_KEYRESET Link: http://lkml.kernel.org/r/1466708235-28593-1-git-send-email-robh@kernel.org Signed-off-by: Rob Herring Cc: Amit Pundir Cc: John Stultz Cc: Dmitry Shmidt Cc: Rom Lemarchand Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 5 + kernel/configs/android-base.config | 152 ++++++++++++++++++++++++++++++ kernel/configs/android-recommended.config | 121 ++++++++++++++++++++++++ 3 files changed, 278 insertions(+) create mode 100644 kernel/configs/android-base.config create mode 100644 kernel/configs/android-recommended.config (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index e9eacacf0f08..ce38536009c7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -778,6 +778,11 @@ W: http://ez.analog.com/community/linux-device-drivers S: Supported F: drivers/dma/dma-axi-dmac.c +ANDROID CONFIG FRAGMENTS +M: Rob Herring +S: Supported +F: kernel/configs/android* + ANDROID DRIVERS M: Greg Kroah-Hartman M: Arve Hjønnevåg diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config new file mode 100644 index 000000000000..9f748ed7bea8 --- /dev/null +++ b/kernel/configs/android-base.config @@ -0,0 +1,152 @@ +# KEEP ALPHABETICALLY SORTED +# CONFIG_DEVKMEM is not set +# CONFIG_DEVMEM is not set +# CONFIG_INET_LRO is not set +# CONFIG_MODULES is not set +# CONFIG_OABI_COMPAT is not set +# CONFIG_SYSVIPC is not set +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_ARMV8_DEPRECATED=y +CONFIG_ASHMEM=y +CONFIG_AUDIT=y +CONFIG_BLK_DEV_DM=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_SCHED=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_DM_CRYPT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_EMBEDDED=y +CONFIG_FB=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y +CONFIG_INET_ESP=y +CONFIG_INET_XFRM_MODE_TUNNEL=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IPV6=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_IPV6_PRIVACY=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_NETMAP=y +CONFIG_IP_NF_TARGET_REDIRECT=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_NET=y +CONFIG_NETDEVICES=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_TPROXY=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NET_CLS_ACT=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_KEY=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_NAT=y +CONFIG_NO_HZ=y +CONFIG_PACKET=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PREEMPT=y +CONFIG_QUOTA=y +CONFIG_RTC_CLASS=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SETEND_EMULATION=y +CONFIG_STAGING=y +CONFIG_SWP_EMULATION=y +CONFIG_SYNC=y +CONFIG_TUN=y +CONFIG_UNIX=y +CONFIG_USB_GADGET=y +CONFIG_USB_CONFIGFS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_XFRM_USER=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config new file mode 100644 index 000000000000..e3b953e966d2 --- /dev/null +++ b/kernel/configs/android-recommended.config @@ -0,0 +1,121 @@ +# KEEP ALPHABETICALLY SORTED +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_NF_CONNTRACK_SIP is not set +# CONFIG_PM_WAKELOCKS_GC is not set +# CONFIG_VT is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_COMPACTION=y +CONFIG_DEBUG_RODATA=y +CONFIG_DM_UEVENT=y +CONFIG_DRAGONRISE_FF=y +CONFIG_ENABLE_DEFAULT_TRACERS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_FUSE_FS=y +CONFIG_GREENASIA_FF=y +CONFIG_HIDRAW=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_EZKEY=y +CONFIG_HID_GREENASIA=y +CONFIG_HID_GYRATION=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WACOM=y +CONFIG_HID_WALTOP=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_GPIO=y +CONFIG_INPUT_JOYSTICK=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_TABLET=y +CONFIG_INPUT_UINPUT=y +CONFIG_ION=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KSM=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGITECH_FF=y +CONFIG_MD=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_MSDOS_FS=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_PANTHERLORD_FF=y +CONFIG_PERF_EVENTS=y +CONFIG_PM_DEBUG=y +CONFIG_PM_RUNTIME=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +CONFIG_POWER_SUPPLY=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +CONFIG_SCHEDSTATS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_SND=y +CONFIG_SOUND=y +CONFIG_SUSPEND_TIME=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_TASK_XACCT=y +CONFIG_TIMER_STATS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_UHID=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_USBNET=y +CONFIG_VFAT_FS=y -- cgit v1.3-8-gc7d7 From 1f415a74b0ca64b5bfacbb12d71ed2ec050a8cfb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Aug 2016 16:12:14 +0100 Subject: bpf: fix method of PTR_TO_PACKET reg id generation Using per-register incrementing ID can lead to find_good_pkt_pointers() confusing registers which have completely different values. Consider example: 0: (bf) r6 = r1 1: (61) r8 = *(u32 *)(r6 +76) 2: (61) r0 = *(u32 *)(r6 +80) 3: (bf) r7 = r8 4: (07) r8 += 32 5: (2d) if r8 > r0 goto pc+9 R0=pkt_end R1=ctx R6=ctx R7=pkt(id=0,off=0,r=32) R8=pkt(id=0,off=32,r=32) R10=fp 6: (bf) r8 = r7 7: (bf) r9 = r7 8: (71) r1 = *(u8 *)(r7 +0) 9: (0f) r8 += r1 10: (71) r1 = *(u8 *)(r7 +1) 11: (0f) r9 += r1 12: (07) r8 += 32 13: (2d) if r8 > r0 goto pc+1 R0=pkt_end R1=inv56 R6=ctx R7=pkt(id=0,off=0,r=32) R8=pkt(id=1,off=32,r=32) R9=pkt(id=1,off=0,r=32) R10=fp 14: (71) r1 = *(u8 *)(r9 +16) 15: (b7) r7 = 0 16: (bf) r0 = r7 17: (95) exit We need to get a UNKNOWN_VALUE with imm to force id generation so lines 0-5 make r7 a valid packet pointer. We then read two different bytes from the packet and add them to copies of the constructed packet pointer. r8 (line 9) and r9 (line 11) will get the same id of 1, independently. When either of them is validated (line 13) - find_good_pkt_pointers() will also mark the other as safe. This leads to access on line 14 being mistakenly considered safe. Fixes: 969bf05eb3ce ("bpf: direct packet access") Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f72f23b8fdab..7094c69ac199 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -194,6 +194,7 @@ struct verifier_env { struct verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ + u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; }; @@ -1301,7 +1302,7 @@ add_imm: /* dst_reg stays as pkt_ptr type and since some positive * integer value was added to the pointer, increment its 'id' */ - dst_reg->id++; + dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range and off to zero */ dst_reg->off = 0; -- cgit v1.3-8-gc7d7 From 9502514f2808d29f6f2afa1c410e7808898dede1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Jul 2016 05:59:24 +0930 Subject: module: Do a WARN_ON_ONCE() for assert module mutex not held When running with lockdep enabled, I triggered the WARN_ON() in the module code that asserts when module_mutex or rcu_read_lock_sched are not held. The issue I have is that this can also be called from the dump_stack() code, causing us to enter an infinite loop... ------------[ cut here ]------------ WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 ffff880215e8fa70 ffff880215e8fa70 ffffffff812fc8e3 0000000000000000 ffffffff81d3e55b ffff880215e8fac0 ffffffff8104fc88 ffffffff8104fcab 0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001 Call Trace: [] dump_stack+0x67/0x90 [] __warn+0xcb/0xe9 [] ? warn_slowpath_null+0x5/0x1f ------------[ cut here ]------------ WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 ffff880215e8f7a0 ffff880215e8f7a0 ffffffff812fc8e3 0000000000000000 ffffffff81d3e55b ffff880215e8f7f0 ffffffff8104fc88 ffffffff8104fcab 0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001 Call Trace: [] dump_stack+0x67/0x90 [] __warn+0xcb/0xe9 [] ? warn_slowpath_null+0x5/0x1f ------------[ cut here ]------------ WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 ffff880215e8f4d0 ffff880215e8f4d0 ffffffff812fc8e3 0000000000000000 ffffffff81d3e55b ffff880215e8f520 ffffffff8104fc88 ffffffff8104fcab 0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001 Call Trace: [] dump_stack+0x67/0x90 [] __warn+0xcb/0xe9 [] ? warn_slowpath_null+0x5/0x1f ------------[ cut here ]------------ WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e [...] Which gives us rather useless information. Worse yet, there's some race that causes this, and I seldom trigger it, so I have no idea what happened. This would not be an issue if that warning was a WARN_ON_ONCE(). Signed-off-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7f21ab238aa7..beaebea627ff 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -264,7 +264,7 @@ static void module_assert_mutex_or_preempt(void) if (unlikely(!debug_locks)) return; - WARN_ON(!rcu_read_lock_sched_held() && + WARN_ON_ONCE(!rcu_read_lock_sched_held() && !lockdep_is_held(&module_mutex)); #endif } -- cgit v1.3-8-gc7d7 From be7de5f91fdc3a63ee01910c43f20db213445ce4 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 21 Jul 2016 15:37:56 +0930 Subject: modules: Add kernel parameter to blacklist modules Blacklisting a module in linux has long been a problem. The current procedure is to use rd.blacklist=module_name, however, that doesn't cover the case after the initramfs and before a boot prompt (where one is supposed to use /etc/modprobe.d/blacklist.conf to blacklist runtime loading). Using rd.shell to get an early prompt is hit-or-miss, and doesn't cover all situations AFAICT. This patch adds this functionality of permanently blacklisting a module by its name via the kernel parameter module_blacklist=module_name. [v2]: Rusty, use core_param() instead of __setup() which simplifies things. [v3]: Rusty, undo wreckage from strsep() [v4]: Rusty, simpler version of blacklisted() Signed-off-by: Prarit Bhargava Cc: Jonathan Corbet Cc: Rusty Russell Cc: linux-doc@vger.kernel.org Signed-off-by: Rusty Russell --- Documentation/kernel-parameters.txt | 3 +++ kernel/module.c | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 724970a25666..c59ae1af02b3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2301,6 +2301,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Note that if CONFIG_MODULE_SIG_FORCE is set, that is always true, so this option does nothing. + module_blacklist= [KNL] Do not load a comma-separated list of + modules. Useful for debugging problem modules. + mousedev.tap_time= [MOUSE] Maximum time between finger touching and leaving touchpad surface for touch to be considered diff --git a/kernel/module.c b/kernel/module.c index beaebea627ff..c91c2fdca2e6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3168,6 +3168,27 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } +/* module_blacklist is a comma-separated list of module names */ +static char *module_blacklist; +static bool blacklisted(char *module_name) +{ + const char *p; + size_t len; + + if (!module_blacklist) + return false; + + for (p = module_blacklist; *p; p += len) { + len = strcspn(p, ","); + if (strlen(module_name) == len && !memcmp(module_name, p, len)) + return true; + if (p[len] == ',') + len++; + } + return false; +} +core_param(module_blacklist, module_blacklist, charp, 0400); + static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ @@ -3178,6 +3199,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (IS_ERR(mod)) return mod; + if (blacklisted(mod->name)) + return ERR_PTR(-EPERM); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); -- cgit v1.3-8-gc7d7 From bdc9f373551dd3f1e6fae1618f2394ee9bc7db2e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 27 Jul 2016 12:17:35 +0930 Subject: jump_label: disable preemption around __module_text_address(). Steven reported a warning caused by not holding module_mutex or rcu_read_lock_sched: his backtrace was corrupted but a quick audit found this possible cause. It's wrong anyway... Reported-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/jump_label.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0dbea887d625..0eef93962a91 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -284,11 +284,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; + preempt_disable(); mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + preempt_enable(); + if (!mod) return 0; - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); return __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, -- cgit v1.3-8-gc7d7 From 444d13ff10fb13bc3e64859c3cf9ce43dcfeb075 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 27 Jul 2016 12:06:21 +0930 Subject: modules: add ro_after_init support Add ro_after_init support for modules by adding a new page-aligned section in the module layout (after rodata) for ro_after_init data and enabling RO protection for that section after module init runs. Signed-off-by: Jessica Yu Acked-by: Kees Cook Signed-off-by: Rusty Russell --- include/linux/module.h | 6 +++-- include/uapi/linux/elf.h | 1 + kernel/livepatch/core.c | 2 +- kernel/module.c | 66 +++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 60 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index f95ed243a4de..0c3207d26ac0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -298,6 +298,8 @@ struct module_layout { unsigned int text_size; /* Size of RO section of the module (text+rodata) */ unsigned int ro_size; + /* Size of RO after init section */ + unsigned int ro_after_init_size; #ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_node mtn; @@ -765,12 +767,12 @@ extern int module_sysfs_initialized; #ifdef CONFIG_DEBUG_SET_MODULE_RONX extern void set_all_modules_text_rw(void); extern void set_all_modules_text_ro(void); -extern void module_enable_ro(const struct module *mod); +extern void module_enable_ro(const struct module *mod, bool after_init); extern void module_disable_ro(const struct module *mod); #else static inline void set_all_modules_text_rw(void) { } static inline void set_all_modules_text_ro(void) { } -static inline void module_enable_ro(const struct module *mod) { } +static inline void module_enable_ro(const struct module *mod, bool after_init) { } static inline void module_disable_ro(const struct module *mod) { } #endif diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index cb4a72f888d5..70b172ba41ce 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -286,6 +286,7 @@ typedef struct elf64_phdr { #define SHF_ALLOC 0x2 #define SHF_EXECINSTR 0x4 #define SHF_RELA_LIVEPATCH 0x00100000 +#define SHF_RO_AFTER_INIT 0x00200000 #define SHF_MASKPROC 0xf0000000 /* special section indexes */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5c2bc1052691..8bbe50704621 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod, break; } - module_enable_ro(pmod); + module_enable_ro(pmod, true); return ret; } diff --git a/kernel/module.c b/kernel/module.c index c91c2fdca2e6..205a71a97852 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1857,10 +1857,11 @@ static void mod_sysfs_teardown(struct module *mod) * from modification and any data from execution. * * General layout of module is: - * [text] [read-only-data] [writable data] - * text_size -----^ ^ ^ - * ro_size ------------------------| | - * size -------------------------------------------| + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| * * These values are always page-aligned (as is base) */ @@ -1883,14 +1884,24 @@ static void frob_rodata(const struct module_layout *layout, (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + static void frob_writable_data(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->size - layout->ro_size) >> PAGE_SHIFT); + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } /* livepatching wants to disable read-only so it can frob module. */ @@ -1898,21 +1909,26 @@ void module_disable_ro(const struct module *mod) { frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); + frob_ro_after_init(&mod->core_layout, set_memory_rw); frob_text(&mod->init_layout, set_memory_rw); frob_rodata(&mod->init_layout, set_memory_rw); } -void module_enable_ro(const struct module *mod) +void module_enable_ro(const struct module *mod, bool after_init) { frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->core_layout, set_memory_ro); } static void module_enable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_nx); + frob_ro_after_init(&mod->core_layout, set_memory_nx); frob_writable_data(&mod->core_layout, set_memory_nx); frob_rodata(&mod->init_layout, set_memory_nx); frob_writable_data(&mod->init_layout, set_memory_nx); @@ -1921,6 +1937,7 @@ static void module_enable_nx(const struct module *mod) static void module_disable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_x); + frob_ro_after_init(&mod->core_layout, set_memory_x); frob_writable_data(&mod->core_layout, set_memory_x); frob_rodata(&mod->init_layout, set_memory_x); frob_writable_data(&mod->init_layout, set_memory_x); @@ -1963,6 +1980,8 @@ static void disable_ro_nx(const struct module_layout *layout) frob_text(layout, set_memory_rw); frob_rodata(layout, set_memory_rw); frob_rodata(layout, set_memory_x); + frob_ro_after_init(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } @@ -2305,6 +2324,7 @@ static void layout_sections(struct module *mod, struct load_info *info) * finder in the two loops below */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, { ARCH_SHF_SMALL | SHF_ALLOC, 0 } }; @@ -2336,7 +2356,11 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->core_layout.size = debug_align(mod->core_layout.size); mod->core_layout.ro_size = mod->core_layout.size; break; - case 3: /* whole core */ + case 2: /* RO after init */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_after_init_size = mod->core_layout.size; + break; + case 4: /* whole core */ mod->core_layout.size = debug_align(mod->core_layout.size); break; } @@ -2366,7 +2390,14 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.size = debug_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; - case 3: /* whole init */ + case 2: + /* + * RO after init doesn't apply to init_layout (only + * core_layout), so it just takes the value of ro_size. + */ + mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; + break; + case 4: /* whole init */ mod->init_layout.size = debug_align(mod->init_layout.size); break; } @@ -3193,6 +3224,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; + unsigned int ndx; int err; mod = setup_load_info(info, flags); @@ -3215,6 +3247,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; + /* + * Mark ro_after_init section with SHF_RO_AFTER_INIT so that + * layout_sections() can put it in the right place. + * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. + */ + ndx = find_sec(info, ".data..ro_after_init"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. */ @@ -3381,12 +3422,14 @@ static noinline int do_init_module(struct module *mod) /* Switch to core kallsyms now init is done: kallsyms may be walking! */ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif + module_enable_ro(mod, true); mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; mod->init_layout.ro_size = 0; + mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be @@ -3478,8 +3521,7 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions */ - module_enable_ro(mod); + module_enable_ro(mod, false); module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, -- cgit v1.3-8-gc7d7 From 97f2645f358b411ba2afb22e5966753f0ad92916 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 3 Aug 2016 13:45:50 -0700 Subject: tree-wide: replace config_enabled() with IS_ENABLED() The use of config_enabled() against config options is ambiguous. In practical terms, config_enabled() is equivalent to IS_BUILTIN(), but the author might have used it for the meaning of IS_ENABLED(). Using IS_ENABLED(), IS_BUILTIN(), IS_MODULE() etc. makes the intention clearer. This commit replaces config_enabled() with IS_ENABLED() where possible. This commit is only touching bool config options. I noticed two cases where config_enabled() is used against a tristate option: - config_enabled(CONFIG_HWMON) [ drivers/net/wireless/ath/ath10k/thermal.c ] - config_enabled(CONFIG_BACKLIGHT_CLASS_DEVICE) [ drivers/gpu/drm/gma500/opregion.c ] I did not touch them because they should be converted to IS_BUILTIN() in order to keep the logic, but I was not sure it was the authors' intention. Link: http://lkml.kernel.org/r/1465215656-20569-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Kees Cook Cc: Stas Sergeev Cc: Matt Redfearn Cc: Joshua Kinard Cc: Jiri Slaby Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Markos Chandras Cc: "Dmitry V. Levin" Cc: yu-cheng yu Cc: James Hogan Cc: Brian Gerst Cc: Johannes Berg Cc: Peter Zijlstra Cc: Al Viro Cc: Will Drewry Cc: Nikolay Martynov Cc: Huacai Chen Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Daniel Borkmann Cc: Leonid Yegoshin Cc: Rafal Milecki Cc: James Cowgill Cc: Greg Kroah-Hartman Cc: Ralf Baechle Cc: Alex Smith Cc: Adam Buchbinder Cc: Qais Yousef Cc: Jiang Liu Cc: Mikko Rapeli Cc: Paul Gortmaker Cc: Denys Vlasenko Cc: Brian Norris Cc: Hidehiro Kawai Cc: "Luis R. Rodriguez" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Dave Hansen Cc: "Kirill A. Shutemov" Cc: Roland McGrath Cc: Paul Burton Cc: Kalle Valo Cc: Viresh Kumar Cc: Tony Wu Cc: Huaitong Han Cc: Sumit Semwal Cc: Alexei Starovoitov Cc: Juergen Gross Cc: Jason Cooper Cc: "David S. Miller" Cc: Oleg Nesterov Cc: Andrea Gelmini Cc: David Woodhouse Cc: Marc Zyngier Cc: Rabin Vincent Cc: "Maciej W. Rozycki" Cc: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/mips-cm.h | 2 +- arch/mips/include/asm/pgtable.h | 16 +++++------ arch/mips/include/asm/seccomp.h | 4 +-- arch/mips/include/asm/signal.h | 4 +-- arch/mips/include/asm/syscall.h | 2 +- arch/mips/include/asm/uaccess.h | 2 +- arch/mips/jz4740/setup.c | 2 +- arch/mips/kernel/cpu-bugs64.c | 6 ++--- arch/mips/kernel/elf.c | 4 +-- arch/mips/kernel/mips-cm.c | 2 +- arch/mips/kernel/mips-r2-to-r6-emul.c | 34 ++++++++++++------------ arch/mips/kernel/pm-cps.c | 4 +-- arch/mips/kernel/signal.c | 10 +++---- arch/mips/kernel/smp-cps.c | 4 +-- arch/mips/kernel/unaligned.c | 10 +++---- arch/mips/math-emu/cp1emu.c | 6 ++--- arch/mips/mm/tlbex.c | 10 +++---- arch/mips/mti-malta/malta-dtshim.c | 4 +-- arch/mips/mti-malta/malta-memory.c | 2 +- arch/mips/mti-malta/malta-setup.c | 2 +- arch/mips/net/bpf_jit.c | 4 +-- arch/x86/include/asm/elf.h | 4 +-- arch/x86/include/asm/fpu/internal.h | 16 +++++------ arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/fpu/signal.c | 12 ++++----- arch/x86/kernel/signal.c | 14 +++++----- drivers/firmware/broadcom/bcm47xx_sprom.c | 2 +- drivers/irqchip/irq-mips-gic.c | 2 +- drivers/mtd/bcm47xxpart.c | 2 +- drivers/net/wireless/ath/ath10k/debug.c | 12 ++++----- drivers/net/wireless/ath/ath10k/mac.c | 10 +++---- drivers/net/wireless/ath/ath10k/wmi.c | 2 +- drivers/net/wireless/ath/ath6kl/cfg80211.c | 2 +- drivers/net/wireless/ath/ath9k/common-spectral.c | 6 ++--- drivers/net/wireless/ath/ath9k/init.c | 2 +- drivers/net/wireless/ath/ath9k/main.c | 12 ++++----- drivers/net/wireless/ath/ath9k/recv.c | 2 +- drivers/net/wireless/ath/dfs_pattern_detector.c | 2 +- drivers/net/wireless/ath/regd.c | 4 +-- drivers/pci/ecam.c | 2 +- drivers/tty/serial/ar933x_uart.c | 4 +-- include/linux/fence.h | 2 +- include/linux/ww_mutex.h | 4 +-- kernel/ptrace.c | 4 +-- kernel/seccomp.c | 6 ++--- net/wireless/chan.c | 2 +- 48 files changed, 135 insertions(+), 135 deletions(-) (limited to 'kernel') diff --git a/arch/mips/include/asm/mips-cm.h b/arch/mips/include/asm/mips-cm.h index 9411a4c0bdad..58e7874e9347 100644 --- a/arch/mips/include/asm/mips-cm.h +++ b/arch/mips/include/asm/mips-cm.h @@ -462,7 +462,7 @@ static inline unsigned int mips_cm_max_vp_width(void) if (mips_cm_revision() >= CM_REV_CM3) return read_gcr_sys_config2() & CM_GCR_SYS_CONFIG2_MAXVPW_MSK; - if (config_enabled(CONFIG_SMP)) + if (IS_ENABLED(CONFIG_SMP)) return smp_num_siblings; return 1; diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 7d44e888134f..70128d3f770a 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -159,7 +159,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) * it better already be global) */ if (pte_none(*buddy)) { - if (!config_enabled(CONFIG_XPA)) + if (!IS_ENABLED(CONFIG_XPA)) buddy->pte_low |= _PAGE_GLOBAL; buddy->pte_high |= _PAGE_GLOBAL; } @@ -172,7 +172,7 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt htw_stop(); /* Preserve global status for the pair */ - if (config_enabled(CONFIG_XPA)) { + if (IS_ENABLED(CONFIG_XPA)) { if (ptep_buddy(ptep)->pte_high & _PAGE_GLOBAL) null.pte_high = _PAGE_GLOBAL; } else { @@ -319,7 +319,7 @@ static inline int pte_young(pte_t pte) { return pte.pte_low & _PAGE_ACCESSED; } static inline pte_t pte_wrprotect(pte_t pte) { pte.pte_low &= ~_PAGE_WRITE; - if (!config_enabled(CONFIG_XPA)) + if (!IS_ENABLED(CONFIG_XPA)) pte.pte_low &= ~_PAGE_SILENT_WRITE; pte.pte_high &= ~_PAGE_SILENT_WRITE; return pte; @@ -328,7 +328,7 @@ static inline pte_t pte_wrprotect(pte_t pte) static inline pte_t pte_mkclean(pte_t pte) { pte.pte_low &= ~_PAGE_MODIFIED; - if (!config_enabled(CONFIG_XPA)) + if (!IS_ENABLED(CONFIG_XPA)) pte.pte_low &= ~_PAGE_SILENT_WRITE; pte.pte_high &= ~_PAGE_SILENT_WRITE; return pte; @@ -337,7 +337,7 @@ static inline pte_t pte_mkclean(pte_t pte) static inline pte_t pte_mkold(pte_t pte) { pte.pte_low &= ~_PAGE_ACCESSED; - if (!config_enabled(CONFIG_XPA)) + if (!IS_ENABLED(CONFIG_XPA)) pte.pte_low &= ~_PAGE_SILENT_READ; pte.pte_high &= ~_PAGE_SILENT_READ; return pte; @@ -347,7 +347,7 @@ static inline pte_t pte_mkwrite(pte_t pte) { pte.pte_low |= _PAGE_WRITE; if (pte.pte_low & _PAGE_MODIFIED) { - if (!config_enabled(CONFIG_XPA)) + if (!IS_ENABLED(CONFIG_XPA)) pte.pte_low |= _PAGE_SILENT_WRITE; pte.pte_high |= _PAGE_SILENT_WRITE; } @@ -358,7 +358,7 @@ static inline pte_t pte_mkdirty(pte_t pte) { pte.pte_low |= _PAGE_MODIFIED; if (pte.pte_low & _PAGE_WRITE) { - if (!config_enabled(CONFIG_XPA)) + if (!IS_ENABLED(CONFIG_XPA)) pte.pte_low |= _PAGE_SILENT_WRITE; pte.pte_high |= _PAGE_SILENT_WRITE; } @@ -369,7 +369,7 @@ static inline pte_t pte_mkyoung(pte_t pte) { pte.pte_low |= _PAGE_ACCESSED; if (!(pte.pte_low & _PAGE_NO_READ)) { - if (!config_enabled(CONFIG_XPA)) + if (!IS_ENABLED(CONFIG_XPA)) pte.pte_low |= _PAGE_SILENT_READ; pte.pte_high |= _PAGE_SILENT_READ; } diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h index 684fb3a12ed3..d886d6f7687a 100644 --- a/arch/mips/include/asm/seccomp.h +++ b/arch/mips/include/asm/seccomp.h @@ -16,10 +16,10 @@ static inline const int *get_compat_mode1_syscalls(void) 0, /* null terminated */ }; - if (config_enabled(CONFIG_MIPS32_O32) && test_thread_flag(TIF_32BIT_REGS)) + if (IS_ENABLED(CONFIG_MIPS32_O32) && test_thread_flag(TIF_32BIT_REGS)) return syscalls_O32; - if (config_enabled(CONFIG_MIPS32_N32)) + if (IS_ENABLED(CONFIG_MIPS32_N32)) return syscalls_N32; BUG(); diff --git a/arch/mips/include/asm/signal.h b/arch/mips/include/asm/signal.h index 2292373ff11a..82eae1583bcf 100644 --- a/arch/mips/include/asm/signal.h +++ b/arch/mips/include/asm/signal.h @@ -19,8 +19,8 @@ extern struct mips_abi mips_abi_32; ((ka)->sa.sa_flags & SA_SIGINFO)) #else #define sig_uses_siginfo(ka, abi) \ - (config_enabled(CONFIG_64BIT) ? 1 : \ - (config_enabled(CONFIG_TRAD_SIGNALS) ? \ + (IS_ENABLED(CONFIG_64BIT) ? 1 : \ + (IS_ENABLED(CONFIG_TRAD_SIGNALS) ? \ ((ka)->sa.sa_flags & SA_SIGINFO) : 1) ) #endif diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 47bc45a67e9b..d87882513ee3 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -99,7 +99,7 @@ static inline void syscall_get_arguments(struct task_struct *task, { int ret; /* O32 ABI syscall() - Either 64-bit with O32 or 32-bit */ - if ((config_enabled(CONFIG_32BIT) || + if ((IS_ENABLED(CONFIG_32BIT) || test_tsk_thread_flag(task, TIF_32BIT_REGS)) && (regs->regs[2] == __NR_syscall)) i++; diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 7f109d4f64a4..11b965f98d95 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -88,7 +88,7 @@ extern u64 __ua_limit; */ static inline bool eva_kernel_access(void) { - if (!config_enabled(CONFIG_EVA)) + if (!IS_ENABLED(CONFIG_EVA)) return false; return segment_eq(get_fs(), get_ds()); diff --git a/arch/mips/jz4740/setup.c b/arch/mips/jz4740/setup.c index 0914ef775b5f..6d0152321819 100644 --- a/arch/mips/jz4740/setup.c +++ b/arch/mips/jz4740/setup.c @@ -75,7 +75,7 @@ void __init device_tree_init(void) const char *get_system_type(void) { - if (config_enabled(CONFIG_MACH_JZ4780)) + if (IS_ENABLED(CONFIG_MACH_JZ4780)) return "JZ4780"; return "JZ4740"; diff --git a/arch/mips/kernel/cpu-bugs64.c b/arch/mips/kernel/cpu-bugs64.c index 6392dbe504fb..a378e44688f5 100644 --- a/arch/mips/kernel/cpu-bugs64.c +++ b/arch/mips/kernel/cpu-bugs64.c @@ -244,7 +244,7 @@ static inline void check_daddi(void) panic(bug64hit, !DADDI_WAR ? daddiwar : nowar); } -int daddiu_bug = config_enabled(CONFIG_CPU_MIPSR6) ? 0 : -1; +int daddiu_bug = IS_ENABLED(CONFIG_CPU_MIPSR6) ? 0 : -1; static inline void check_daddiu(void) { @@ -314,7 +314,7 @@ static inline void check_daddiu(void) void __init check_bugs64_early(void) { - if (!config_enabled(CONFIG_CPU_MIPSR6)) { + if (!IS_ENABLED(CONFIG_CPU_MIPSR6)) { check_mult_sh(); check_daddiu(); } @@ -322,6 +322,6 @@ void __init check_bugs64_early(void) void __init check_bugs64(void) { - if (!config_enabled(CONFIG_CPU_MIPSR6)) + if (!IS_ENABLED(CONFIG_CPU_MIPSR6)) check_daddi(); } diff --git a/arch/mips/kernel/elf.c b/arch/mips/kernel/elf.c index 891f5ee63983..e6eb7f1f7723 100644 --- a/arch/mips/kernel/elf.c +++ b/arch/mips/kernel/elf.c @@ -179,7 +179,7 @@ int arch_check_elf(void *_ehdr, bool has_interpreter, void *_interp_ehdr, return -ELIBBAD; } - if (!config_enabled(CONFIG_MIPS_O32_FP64_SUPPORT)) + if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT)) return 0; fp_abi = state->fp_abi; @@ -285,7 +285,7 @@ void mips_set_personality_fp(struct arch_elf_state *state) * not be worried about N32/N64 binaries. */ - if (!config_enabled(CONFIG_MIPS_O32_FP64_SUPPORT)) + if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT)) return; switch (state->overall_fp_mode) { diff --git a/arch/mips/kernel/mips-cm.c b/arch/mips/kernel/mips-cm.c index 760217bbb2fa..659e6d3ae335 100644 --- a/arch/mips/kernel/mips-cm.c +++ b/arch/mips/kernel/mips-cm.c @@ -251,7 +251,7 @@ int mips_cm_probe(void) mips_cm_probe_l2sync(); /* determine register width for this CM */ - mips_cm_is64 = config_enabled(CONFIG_64BIT) && (mips_cm_revision() >= CM_REV_CM3); + mips_cm_is64 = IS_ENABLED(CONFIG_64BIT) && (mips_cm_revision() >= CM_REV_CM3); for_each_possible_cpu(cpu) spin_lock_init(&per_cpu(cm_core_lock, cpu)); diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c index 7ff2a557f4aa..43fbadc78d0a 100644 --- a/arch/mips/kernel/mips-r2-to-r6-emul.c +++ b/arch/mips/kernel/mips-r2-to-r6-emul.c @@ -84,7 +84,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir) (s32)MIPSInst_SIMM(ir); return 0; case daddiu_op: - if (config_enabled(CONFIG_32BIT)) + if (IS_ENABLED(CONFIG_32BIT)) break; if (MIPSInst_RT(ir)) @@ -143,7 +143,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir) (u32)regs->regs[MIPSInst_RT(ir)]); return 0; case dsll_op: - if (config_enabled(CONFIG_32BIT) || MIPSInst_RS(ir)) + if (IS_ENABLED(CONFIG_32BIT) || MIPSInst_RS(ir)) break; if (MIPSInst_RD(ir)) @@ -152,7 +152,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir) MIPSInst_FD(ir)); return 0; case dsrl_op: - if (config_enabled(CONFIG_32BIT) || MIPSInst_RS(ir)) + if (IS_ENABLED(CONFIG_32BIT) || MIPSInst_RS(ir)) break; if (MIPSInst_RD(ir)) @@ -161,7 +161,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir) MIPSInst_FD(ir)); return 0; case daddu_op: - if (config_enabled(CONFIG_32BIT) || MIPSInst_FD(ir)) + if (IS_ENABLED(CONFIG_32BIT) || MIPSInst_FD(ir)) break; if (MIPSInst_RD(ir)) @@ -170,7 +170,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir) (u64)regs->regs[MIPSInst_RT(ir)]; return 0; case dsubu_op: - if (config_enabled(CONFIG_32BIT) || MIPSInst_FD(ir)) + if (IS_ENABLED(CONFIG_32BIT) || MIPSInst_FD(ir)) break; if (MIPSInst_RD(ir)) @@ -498,7 +498,7 @@ static int dmult_func(struct pt_regs *regs, u32 ir) s64 res; s64 rt, rs; - if (config_enabled(CONFIG_32BIT)) + if (IS_ENABLED(CONFIG_32BIT)) return SIGILL; rt = regs->regs[MIPSInst_RT(ir)]; @@ -530,7 +530,7 @@ static int dmultu_func(struct pt_regs *regs, u32 ir) u64 res; u64 rt, rs; - if (config_enabled(CONFIG_32BIT)) + if (IS_ENABLED(CONFIG_32BIT)) return SIGILL; rt = regs->regs[MIPSInst_RT(ir)]; @@ -561,7 +561,7 @@ static int ddiv_func(struct pt_regs *regs, u32 ir) { s64 rt, rs; - if (config_enabled(CONFIG_32BIT)) + if (IS_ENABLED(CONFIG_32BIT)) return SIGILL; rt = regs->regs[MIPSInst_RT(ir)]; @@ -586,7 +586,7 @@ static int ddivu_func(struct pt_regs *regs, u32 ir) { u64 rt, rs; - if (config_enabled(CONFIG_32BIT)) + if (IS_ENABLED(CONFIG_32BIT)) return SIGILL; rt = regs->regs[MIPSInst_RT(ir)]; @@ -825,7 +825,7 @@ static int dclz_func(struct pt_regs *regs, u32 ir) u64 res; u64 rs; - if (config_enabled(CONFIG_32BIT)) + if (IS_ENABLED(CONFIG_32BIT)) return SIGILL; if (!MIPSInst_RD(ir)) @@ -852,7 +852,7 @@ static int dclo_func(struct pt_regs *regs, u32 ir) u64 res; u64 rs; - if (config_enabled(CONFIG_32BIT)) + if (IS_ENABLED(CONFIG_32BIT)) return SIGILL; if (!MIPSInst_RD(ir)) @@ -1484,7 +1484,7 @@ fpu_emul: break; case ldl_op: - if (config_enabled(CONFIG_32BIT)) { + if (IS_ENABLED(CONFIG_32BIT)) { err = SIGILL; break; } @@ -1603,7 +1603,7 @@ fpu_emul: break; case ldr_op: - if (config_enabled(CONFIG_32BIT)) { + if (IS_ENABLED(CONFIG_32BIT)) { err = SIGILL; break; } @@ -1722,7 +1722,7 @@ fpu_emul: break; case sdl_op: - if (config_enabled(CONFIG_32BIT)) { + if (IS_ENABLED(CONFIG_32BIT)) { err = SIGILL; break; } @@ -1840,7 +1840,7 @@ fpu_emul: break; case sdr_op: - if (config_enabled(CONFIG_32BIT)) { + if (IS_ENABLED(CONFIG_32BIT)) { err = SIGILL; break; } @@ -2072,7 +2072,7 @@ fpu_emul: break; case lld_op: - if (config_enabled(CONFIG_32BIT)) { + if (IS_ENABLED(CONFIG_32BIT)) { err = SIGILL; break; } @@ -2133,7 +2133,7 @@ fpu_emul: break; case scd_op: - if (config_enabled(CONFIG_32BIT)) { + if (IS_ENABLED(CONFIG_32BIT)) { err = SIGILL; break; } diff --git a/arch/mips/kernel/pm-cps.c b/arch/mips/kernel/pm-cps.c index adda3ffb9b78..5b31a9405ebc 100644 --- a/arch/mips/kernel/pm-cps.c +++ b/arch/mips/kernel/pm-cps.c @@ -148,7 +148,7 @@ int cps_pm_enter_state(enum cps_pm_state state) } /* Setup the VPE to run mips_cps_pm_restore when started again */ - if (config_enabled(CONFIG_CPU_PM) && state == CPS_PM_POWER_GATED) { + if (IS_ENABLED(CONFIG_CPU_PM) && state == CPS_PM_POWER_GATED) { /* Power gating relies upon CPS SMP */ if (!mips_cps_smp_in_use()) return -EINVAL; @@ -387,7 +387,7 @@ static void * __init cps_gen_entry_code(unsigned cpu, enum cps_pm_state state) memset(labels, 0, sizeof(labels)); memset(relocs, 0, sizeof(relocs)); - if (config_enabled(CONFIG_CPU_PM) && state == CPS_PM_POWER_GATED) { + if (IS_ENABLED(CONFIG_CPU_PM) && state == CPS_PM_POWER_GATED) { /* Power gating relies upon CPS SMP */ if (!mips_cps_smp_in_use()) goto out_err; diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index ae4231452115..1975cd2f7de6 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -165,7 +165,7 @@ static int save_msa_extcontext(void __user *buf) * should already have been done when handling scalar FP * context. */ - BUG_ON(config_enabled(CONFIG_EVA)); + BUG_ON(IS_ENABLED(CONFIG_EVA)); err = __put_user(read_msa_csr(), &msa->csr); err |= _save_msa_all_upper(&msa->wr); @@ -195,7 +195,7 @@ static int restore_msa_extcontext(void __user *buf, unsigned int size) unsigned int csr; int i, err; - if (!config_enabled(CONFIG_CPU_HAS_MSA)) + if (!IS_ENABLED(CONFIG_CPU_HAS_MSA)) return SIGSYS; if (size != sizeof(*msa)) @@ -215,7 +215,7 @@ static int restore_msa_extcontext(void __user *buf, unsigned int size) * scalar FP context, so FPU & MSA should have already been * disabled whilst handling scalar FP context. */ - BUG_ON(config_enabled(CONFIG_EVA)); + BUG_ON(IS_ENABLED(CONFIG_EVA)); write_msa_csr(csr); err |= _restore_msa_all_upper(&msa->wr); @@ -315,7 +315,7 @@ int protected_save_fp_context(void __user *sc) * EVA does not have userland equivalents of ldc1 or sdc1, so * save to the kernel FP context & copy that to userland below. */ - if (config_enabled(CONFIG_EVA)) + if (IS_ENABLED(CONFIG_EVA)) lose_fpu(1); while (1) { @@ -378,7 +378,7 @@ int protected_restore_fp_context(void __user *sc) * disable the FPU here such that the code below simply copies to * the kernel FP context. */ - if (config_enabled(CONFIG_EVA)) + if (IS_ENABLED(CONFIG_EVA)) lose_fpu(0); while (1) { diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index 4ed36f288d64..05b3201271b4 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -46,8 +46,8 @@ static unsigned core_vpe_count(unsigned core) if (threads_disabled) return 1; - if ((!config_enabled(CONFIG_MIPS_MT_SMP) || !cpu_has_mipsmt) - && (!config_enabled(CONFIG_CPU_MIPSR6) || !cpu_has_vp)) + if ((!IS_ENABLED(CONFIG_MIPS_MT_SMP) || !cpu_has_mipsmt) + && (!IS_ENABLED(CONFIG_CPU_MIPSR6) || !cpu_has_vp)) return 1; mips_cm_lock_other(core, 0); diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index 28b3af73a17b..f1c308dbbc4a 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -1025,7 +1025,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, if (!access_ok(VERIFY_READ, addr, 2)) goto sigbus; - if (config_enabled(CONFIG_EVA)) { + if (IS_ENABLED(CONFIG_EVA)) { if (segment_eq(get_fs(), get_ds())) LoadHW(addr, value, res); else @@ -1044,7 +1044,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, if (!access_ok(VERIFY_READ, addr, 4)) goto sigbus; - if (config_enabled(CONFIG_EVA)) { + if (IS_ENABLED(CONFIG_EVA)) { if (segment_eq(get_fs(), get_ds())) LoadW(addr, value, res); else @@ -1063,7 +1063,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, if (!access_ok(VERIFY_READ, addr, 2)) goto sigbus; - if (config_enabled(CONFIG_EVA)) { + if (IS_ENABLED(CONFIG_EVA)) { if (segment_eq(get_fs(), get_ds())) LoadHWU(addr, value, res); else @@ -1131,7 +1131,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, compute_return_epc(regs); value = regs->regs[insn.i_format.rt]; - if (config_enabled(CONFIG_EVA)) { + if (IS_ENABLED(CONFIG_EVA)) { if (segment_eq(get_fs(), get_ds())) StoreHW(addr, value, res); else @@ -1151,7 +1151,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, compute_return_epc(regs); value = regs->regs[insn.i_format.rt]; - if (config_enabled(CONFIG_EVA)) { + if (IS_ENABLED(CONFIG_EVA)) { if (segment_eq(get_fs(), get_ds())) StoreW(addr, value, res); else diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index 6dc07fba187f..92d15e68abb6 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -784,10 +784,10 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, */ static inline int cop1_64bit(struct pt_regs *xcp) { - if (config_enabled(CONFIG_64BIT) && !config_enabled(CONFIG_MIPS32_O32)) + if (IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_MIPS32_O32)) return 1; - else if (config_enabled(CONFIG_32BIT) && - !config_enabled(CONFIG_MIPS_O32_FP64_SUPPORT)) + else if (IS_ENABLED(CONFIG_32BIT) && + !IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT)) return 0; return !test_thread_flag(TIF_32BIT_FPREGS); diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 4004b659ce50..ff49b29c2d16 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -1025,7 +1025,7 @@ static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) pte_off_odd += offsetof(pte_t, pte_high); #endif - if (config_enabled(CONFIG_XPA)) { + if (IS_ENABLED(CONFIG_XPA)) { uasm_i_lw(p, tmp, pte_off_even, ptep); /* even pte */ UASM_i_ROTR(p, tmp, tmp, ilog2(_PAGE_GLOBAL)); UASM_i_MTC0(p, tmp, C0_ENTRYLO0); @@ -1643,7 +1643,7 @@ iPTE_SW(u32 **p, struct uasm_reloc **r, unsigned int pte, unsigned int ptr, unsigned int hwmode = mode & (_PAGE_VALID | _PAGE_DIRTY); unsigned int swmode = mode & ~hwmode; - if (config_enabled(CONFIG_XPA) && !cpu_has_64bits) { + if (IS_ENABLED(CONFIG_XPA) && !cpu_has_64bits) { uasm_i_lui(p, scratch, swmode >> 16); uasm_i_or(p, pte, pte, scratch); BUG_ON(swmode & 0xffff); @@ -2432,7 +2432,7 @@ static void config_htw_params(void) pwsize |= ilog2(PTRS_PER_PMD) << MIPS_PWSIZE_MDW_SHIFT; /* Set pointer size to size of directory pointers */ - if (config_enabled(CONFIG_64BIT)) + if (IS_ENABLED(CONFIG_64BIT)) pwsize |= MIPS_PWSIZE_PS_MASK; /* PTEs may be multiple pointers long (e.g. with XPA) */ pwsize |= ((PTE_T_LOG2 - PGD_T_LOG2) << MIPS_PWSIZE_PTEW_SHIFT) @@ -2448,7 +2448,7 @@ static void config_htw_params(void) * the pwctl fields. */ config = 1 << MIPS_PWCTL_PWEN_SHIFT; - if (config_enabled(CONFIG_64BIT)) + if (IS_ENABLED(CONFIG_64BIT)) config |= MIPS_PWCTL_XU_MASK; write_c0_pwctl(config); pr_info("Hardware Page Table Walker enabled\n"); @@ -2522,7 +2522,7 @@ void build_tlb_refill_handler(void) */ static int run_once = 0; - if (config_enabled(CONFIG_XPA) && !cpu_has_rixi) + if (IS_ENABLED(CONFIG_XPA) && !cpu_has_rixi) panic("Kernels supporting XPA currently require CPUs with RIXI"); output_pgtable_bits_defines(); diff --git a/arch/mips/mti-malta/malta-dtshim.c b/arch/mips/mti-malta/malta-dtshim.c index f7133efc5843..151f4882ec8a 100644 --- a/arch/mips/mti-malta/malta-dtshim.c +++ b/arch/mips/mti-malta/malta-dtshim.c @@ -31,7 +31,7 @@ static unsigned __init gen_fdt_mem_array(__be32 *mem_array, unsigned long size) entries = 1; mem_array[0] = cpu_to_be32(PHYS_OFFSET); - if (config_enabled(CONFIG_EVA)) { + if (IS_ENABLED(CONFIG_EVA)) { /* * The current Malta EVA configuration is "special" in that it * always makes use of addresses in the upper half of the 32 bit @@ -82,7 +82,7 @@ static void __init append_memory(void *fdt, int root_off) physical_memsize = 32 << 20; } - if (config_enabled(CONFIG_CPU_BIG_ENDIAN)) { + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) { /* * SOC-it swaps, or perhaps doesn't swap, when DMA'ing * the last word of physical memory. diff --git a/arch/mips/mti-malta/malta-memory.c b/arch/mips/mti-malta/malta-memory.c index d5f8dae6a797..a47556723b85 100644 --- a/arch/mips/mti-malta/malta-memory.c +++ b/arch/mips/mti-malta/malta-memory.c @@ -32,7 +32,7 @@ static void free_init_pages_eva_malta(void *begin, void *end) void __init fw_meminit(void) { - bool eva = config_enabled(CONFIG_EVA); + bool eva = IS_ENABLED(CONFIG_EVA); free_init_pages_eva = eva ? free_init_pages_eva_malta : NULL; } diff --git a/arch/mips/mti-malta/malta-setup.c b/arch/mips/mti-malta/malta-setup.c index 33d5ff5069e5..ec5b21678fad 100644 --- a/arch/mips/mti-malta/malta-setup.c +++ b/arch/mips/mti-malta/malta-setup.c @@ -261,7 +261,7 @@ void __init plat_mem_setup(void) fdt = malta_dt_shim(fdt); __dt_setup_arch(fdt); - if (config_enabled(CONFIG_EVA)) + if (IS_ENABLED(CONFIG_EVA)) /* EVA has already been configured in mach-malta/kernel-init.h */ pr_info("Enhanced Virtual Addressing (EVA) activated\n"); diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 1a8c96035716..d1b7bd09253a 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -426,7 +426,7 @@ static inline void emit_load_ptr(unsigned int dst, unsigned int src, static inline void emit_load_func(unsigned int reg, ptr imm, struct jit_ctx *ctx) { - if (config_enabled(CONFIG_64BIT)) { + if (IS_ENABLED(CONFIG_64BIT)) { /* At this point imm is always 64-bit */ emit_load_imm(r_tmp, (u64)imm >> 32, ctx); emit_dsll(r_tmp_imm, r_tmp, 16, ctx); /* left shift by 16 */ @@ -516,7 +516,7 @@ static inline void emit_jr(unsigned int reg, struct jit_ctx *ctx) static inline u16 align_sp(unsigned int num) { /* Double word alignment for 32-bit, quadword for 64-bit */ - unsigned int align = config_enabled(CONFIG_64BIT) ? 16 : 8; + unsigned int align = IS_ENABLED(CONFIG_64BIT) ? 16 : 8; num = (num + (align - 1)) & -align; return num; } diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index fea7724141a0..e7f155c3045e 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -344,8 +344,8 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, */ static inline int mmap_is_ia32(void) { - return config_enabled(CONFIG_X86_32) || - (config_enabled(CONFIG_COMPAT) && + return IS_ENABLED(CONFIG_X86_32) || + (IS_ENABLED(CONFIG_COMPAT) && test_thread_flag(TIF_ADDR32)); } diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 116b58347501..2737366ea583 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -137,9 +137,9 @@ static inline int copy_fregs_to_user(struct fregs_state __user *fx) static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) { - if (config_enabled(CONFIG_X86_32)) + if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) + else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); /* See comment in copy_fxregs_to_kernel() below. */ @@ -150,10 +150,10 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { int err; - if (config_enabled(CONFIG_X86_32)) { + if (IS_ENABLED(CONFIG_X86_32)) { err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { - if (config_enabled(CONFIG_AS_FXSAVEQ)) { + if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { /* See comment in copy_fxregs_to_kernel() below. */ @@ -166,9 +166,9 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { - if (config_enabled(CONFIG_X86_32)) + if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) + else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); /* See comment in copy_fxregs_to_kernel() below. */ @@ -190,9 +190,9 @@ static inline int copy_user_to_fregs(struct fregs_state __user *fx) static inline void copy_fxregs_to_kernel(struct fpu *fpu) { - if (config_enabled(CONFIG_X86_32)) + if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) + else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); else { /* Using "rex64; fxsave %0" is broken because, if the memory diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 396348196aa7..d8abfcf524d1 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -155,7 +155,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm) #ifdef CONFIG_X86_64 static inline bool is_64bit_mm(struct mm_struct *mm) { - return !config_enabled(CONFIG_IA32_EMULATION) || + return !IS_ENABLED(CONFIG_IA32_EMULATION) || !(mm->context.ia32_compat == TIF_IA32); } #else diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7943d38c57ca..20abd912f0e4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -147,7 +147,7 @@ static int force_enable_local_apic __initdata; */ static int __init parse_lapic(char *arg) { - if (config_enabled(CONFIG_X86_32) && !arg) + if (IS_ENABLED(CONFIG_X86_32) && !arg) force_enable_local_apic = 1; else if (arg && !strncmp(arg, "notscdeadline", 13)) setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a5e400afc563..6066d945c40e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -523,7 +523,7 @@ static int apic_set_affinity(struct irq_data *irq_data, struct apic_chip_data *data = irq_data->chip_data; int err, irq = irq_data->irq; - if (!config_enabled(CONFIG_SMP)) + if (!IS_ENABLED(CONFIG_SMP)) return -EPERM; if (!cpumask_intersects(dest, cpu_online_mask)) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9e231d88bb33..a184c210efba 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -159,8 +159,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); if (!access_ok(VERIFY_WRITE, buf, size)) return -EACCES; @@ -268,8 +268,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) u64 xfeatures = 0; int fx_only = 0; - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { fpu__clear(fpu); @@ -416,8 +416,8 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = fpu_user_xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION) || - config_enabled(CONFIG_X86_32)) { + if (IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) { int fsave_header_size = sizeof(struct fregs_state); fx_sw_reserved_ia32 = fx_sw_reserved; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 22cc2f9f8aec..99f285b512db 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -146,7 +146,7 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); - err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -245,14 +245,14 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, struct fpu *fpu = ¤t->thread.fpu; /* redzone */ - if (config_enabled(CONFIG_X86_64)) + if (IS_ENABLED(CONFIG_X86_64)) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(sp) == 0) sp = current->sas_ss_sp + current->sas_ss_size; - } else if (config_enabled(CONFIG_X86_32) && + } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && (regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && @@ -262,7 +262,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } if (fpu->fpstate_active) { - sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; } @@ -662,18 +662,18 @@ badframe: static inline int is_ia32_compat_frame(void) { - return config_enabled(CONFIG_IA32_EMULATION) && + return IS_ENABLED(CONFIG_IA32_EMULATION) && test_thread_flag(TIF_IA32); } static inline int is_ia32_frame(void) { - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(); } static inline int is_x32_frame(void) { - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); + return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); } static int diff --git a/drivers/firmware/broadcom/bcm47xx_sprom.c b/drivers/firmware/broadcom/bcm47xx_sprom.c index b6eb875d4af3..62aa3cf09b4d 100644 --- a/drivers/firmware/broadcom/bcm47xx_sprom.c +++ b/drivers/firmware/broadcom/bcm47xx_sprom.c @@ -669,7 +669,7 @@ static int bcm47xx_get_sprom_bcma(struct bcma_bus *bus, struct ssb_sprom *out) case BCMA_HOSTTYPE_PCI: memset(out, 0, sizeof(struct ssb_sprom)); /* On BCM47XX all PCI buses share the same domain */ - if (config_enabled(CONFIG_BCM47XX)) + if (IS_ENABLED(CONFIG_BCM47XX)) snprintf(buf, sizeof(buf), "pci/%u/%u/", bus->host_pci->bus->number + 1, PCI_SLOT(bus->host_pci->devfn)); diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 3786d0f21972..c5f33c3bd228 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -359,7 +359,7 @@ static void gic_handle_shared_int(bool chained) pending_reg += gic_reg_step; intrmask_reg += gic_reg_step; - if (!config_enabled(CONFIG_64BIT) || mips_cm_is64) + if (!IS_ENABLED(CONFIG_64BIT) || mips_cm_is64) continue; pending[i] |= (u64)gic_read(pending_reg) << 32; diff --git a/drivers/mtd/bcm47xxpart.c b/drivers/mtd/bcm47xxpart.c index 845dd27d9f41..377947580203 100644 --- a/drivers/mtd/bcm47xxpart.c +++ b/drivers/mtd/bcm47xxpart.c @@ -122,7 +122,7 @@ static int bcm47xxpart_parse(struct mtd_info *master, for (offset = 0; offset <= master->size - blocksize; offset += blocksize) { /* Nothing more in higher memory on BCM47XX (MIPS) */ - if (config_enabled(CONFIG_BCM47XX) && offset >= 0x2000000) + if (IS_ENABLED(CONFIG_BCM47XX) && offset >= 0x2000000) break; if (curr_part >= BCM47XXPART_MAX_PARTS) { diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c index 355e1ae665f9..8f0fd41dfd4b 100644 --- a/drivers/net/wireless/ath/ath10k/debug.c +++ b/drivers/net/wireless/ath/ath10k/debug.c @@ -139,11 +139,11 @@ void ath10k_debug_print_hwfw_info(struct ath10k *ar) ar->id.subsystem_vendor, ar->id.subsystem_device); ath10k_info(ar, "kconfig debug %d debugfs %d tracing %d dfs %d testmode %d\n", - config_enabled(CONFIG_ATH10K_DEBUG), - config_enabled(CONFIG_ATH10K_DEBUGFS), - config_enabled(CONFIG_ATH10K_TRACING), - config_enabled(CONFIG_ATH10K_DFS_CERTIFIED), - config_enabled(CONFIG_NL80211_TESTMODE)); + IS_ENABLED(CONFIG_ATH10K_DEBUG), + IS_ENABLED(CONFIG_ATH10K_DEBUGFS), + IS_ENABLED(CONFIG_ATH10K_TRACING), + IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED), + IS_ENABLED(CONFIG_NL80211_TESTMODE)); firmware = ar->normal_mode_fw.fw_file.firmware; if (firmware) @@ -2424,7 +2424,7 @@ int ath10k_debug_register(struct ath10k *ar) debugfs_create_file("nf_cal_period", S_IRUSR | S_IWUSR, ar->debug.debugfs_phy, ar, &fops_nf_cal_period); - if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED)) { + if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED)) { debugfs_create_file("dfs_simulate_radar", S_IWUSR, ar->debug.debugfs_phy, ar, &fops_simulate_radar); diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index fb8e38df9446..0bbd0a00edcc 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -3039,7 +3039,7 @@ static void ath10k_regd_update(struct ath10k *ar) regpair = ar->ath_common.regulatory.regpair; - if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) { + if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) { nl_dfs_reg = ar->dfs_detector->region; wmi_dfs_reg = ath10k_mac_get_dfs_region(nl_dfs_reg); } else { @@ -3068,7 +3068,7 @@ static void ath10k_reg_notifier(struct wiphy *wiphy, ath_reg_notifier_apply(wiphy, request, &ar->ath_common.regulatory); - if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) { + if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) { ath10k_dbg(ar, ATH10K_DBG_REGULATORY, "dfs region 0x%x\n", request->dfs_region); result = ar->dfs_detector->set_dfs_domain(ar->dfs_detector, @@ -7955,7 +7955,7 @@ int ath10k_mac_register(struct ath10k *ar) if (!test_bit(ATH10K_FLAG_RAW_MODE, &ar->dev_flags)) ar->hw->netdev_features = NETIF_F_HW_CSUM; - if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED)) { + if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED)) { /* Init ath dfs pattern detector */ ar->ath_common.debug_mask = ATH_DBG_DFS; ar->dfs_detector = dfs_pattern_detector_init(&ar->ath_common, @@ -8003,7 +8003,7 @@ err_unregister: ieee80211_unregister_hw(ar->hw); err_dfs_detector_exit: - if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) + if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) ar->dfs_detector->exit(ar->dfs_detector); err_free: @@ -8018,7 +8018,7 @@ void ath10k_mac_unregister(struct ath10k *ar) { ieee80211_unregister_hw(ar->hw); - if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) + if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) ar->dfs_detector->exit(ar->dfs_detector); kfree(ar->mac.sbands[NL80211_BAND_2GHZ].channels); diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 169cd2e783eb..d2462886b75c 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -3704,7 +3704,7 @@ void ath10k_wmi_event_dfs(struct ath10k *ar, phyerr->tsf_timestamp, tsf, buf_len); /* Skip event if DFS disabled */ - if (!config_enabled(CONFIG_ATH10K_DFS_CERTIFIED)) + if (!IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED)) return; ATH10K_DFS_STAT_INC(ar, pulses_total); diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 4ad6284fc37d..72e2ec67768d 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -3881,7 +3881,7 @@ int ath6kl_cfg80211_init(struct ath6kl *ar) BIT(NL80211_IFTYPE_P2P_CLIENT); } - if (config_enabled(CONFIG_ATH6KL_REGDOMAIN) && + if (IS_ENABLED(CONFIG_ATH6KL_REGDOMAIN) && test_bit(ATH6KL_FW_CAPABILITY_REGDOMAIN, ar->fw_capabilities)) { wiphy->reg_notifier = ath6kl_cfg80211_reg_notify; ar->wiphy->features |= NL80211_FEATURE_CELL_BASE_REG_HINTS; diff --git a/drivers/net/wireless/ath/ath9k/common-spectral.c b/drivers/net/wireless/ath/ath9k/common-spectral.c index a8762711ad74..e2512d5bc0e1 100644 --- a/drivers/net/wireless/ath/ath9k/common-spectral.c +++ b/drivers/net/wireless/ath/ath9k/common-spectral.c @@ -731,7 +731,7 @@ void ath9k_cmn_spectral_scan_trigger(struct ath_common *common, struct ath_hw *ah = spec_priv->ah; u32 rxfilter; - if (config_enabled(CONFIG_ATH9K_TX99)) + if (IS_ENABLED(CONFIG_ATH9K_TX99)) return; if (!ath9k_hw_ops(ah)->spectral_scan_trigger) { @@ -806,7 +806,7 @@ static ssize_t write_file_spec_scan_ctl(struct file *file, char buf[32]; ssize_t len; - if (config_enabled(CONFIG_ATH9K_TX99)) + if (IS_ENABLED(CONFIG_ATH9K_TX99)) return -EOPNOTSUPP; len = min(count, sizeof(buf) - 1); @@ -1072,7 +1072,7 @@ static struct rchan_callbacks rfs_spec_scan_cb = { void ath9k_cmn_spectral_deinit_debug(struct ath_spec_scan_priv *spec_priv) { - if (config_enabled(CONFIG_ATH9K_DEBUGFS)) { + if (IS_ENABLED(CONFIG_ATH9K_DEBUGFS)) { relay_close(spec_priv->rfs_chan_spec_scan); spec_priv->rfs_chan_spec_scan = NULL; } diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index edc74fca60aa..cfa3fe82ade3 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -843,7 +843,7 @@ static void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw) NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE | NL80211_FEATURE_P2P_GO_CTWIN; - if (!config_enabled(CONFIG_ATH9K_TX99)) { + if (!IS_ENABLED(CONFIG_ATH9K_TX99)) { hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_P2P_GO) | BIT(NL80211_IFTYPE_P2P_CLIENT) | diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index 7594650f214f..a394622c9022 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1250,7 +1250,7 @@ static int ath9k_add_interface(struct ieee80211_hw *hw, mutex_lock(&sc->mutex); - if (config_enabled(CONFIG_ATH9K_TX99)) { + if (IS_ENABLED(CONFIG_ATH9K_TX99)) { if (sc->cur_chan->nvifs >= 1) { mutex_unlock(&sc->mutex); return -EOPNOTSUPP; @@ -1300,7 +1300,7 @@ static int ath9k_change_interface(struct ieee80211_hw *hw, mutex_lock(&sc->mutex); - if (config_enabled(CONFIG_ATH9K_TX99)) { + if (IS_ENABLED(CONFIG_ATH9K_TX99)) { mutex_unlock(&sc->mutex); return -EOPNOTSUPP; } @@ -1360,7 +1360,7 @@ static void ath9k_enable_ps(struct ath_softc *sc) struct ath_hw *ah = sc->sc_ah; struct ath_common *common = ath9k_hw_common(ah); - if (config_enabled(CONFIG_ATH9K_TX99)) + if (IS_ENABLED(CONFIG_ATH9K_TX99)) return; sc->ps_enabled = true; @@ -1379,7 +1379,7 @@ static void ath9k_disable_ps(struct ath_softc *sc) struct ath_hw *ah = sc->sc_ah; struct ath_common *common = ath9k_hw_common(ah); - if (config_enabled(CONFIG_ATH9K_TX99)) + if (IS_ENABLED(CONFIG_ATH9K_TX99)) return; sc->ps_enabled = false; @@ -1953,7 +1953,7 @@ static int ath9k_get_survey(struct ieee80211_hw *hw, int idx, struct ieee80211_channel *chan; int pos; - if (config_enabled(CONFIG_ATH9K_TX99)) + if (IS_ENABLED(CONFIG_ATH9K_TX99)) return -EOPNOTSUPP; spin_lock_bh(&common->cc_lock); @@ -2003,7 +2003,7 @@ static void ath9k_set_coverage_class(struct ieee80211_hw *hw, struct ath_softc *sc = hw->priv; struct ath_hw *ah = sc->sc_ah; - if (config_enabled(CONFIG_ATH9K_TX99)) + if (IS_ENABLED(CONFIG_ATH9K_TX99)) return; mutex_lock(&sc->mutex); diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c index 32160fca876a..669734252664 100644 --- a/drivers/net/wireless/ath/ath9k/recv.c +++ b/drivers/net/wireless/ath/ath9k/recv.c @@ -377,7 +377,7 @@ u32 ath_calcrxfilter(struct ath_softc *sc) struct ath_common *common = ath9k_hw_common(sc->sc_ah); u32 rfilt; - if (config_enabled(CONFIG_ATH9K_TX99)) + if (IS_ENABLED(CONFIG_ATH9K_TX99)) return 0; rfilt = ATH9K_RX_FILTER_UCAST | ATH9K_RX_FILTER_BCAST diff --git a/drivers/net/wireless/ath/dfs_pattern_detector.c b/drivers/net/wireless/ath/dfs_pattern_detector.c index 2303ef96299d..2f8136d50f78 100644 --- a/drivers/net/wireless/ath/dfs_pattern_detector.c +++ b/drivers/net/wireless/ath/dfs_pattern_detector.c @@ -352,7 +352,7 @@ dfs_pattern_detector_init(struct ath_common *common, { struct dfs_pattern_detector *dpd; - if (!config_enabled(CONFIG_CFG80211_CERTIFICATION_ONUS)) + if (!IS_ENABLED(CONFIG_CFG80211_CERTIFICATION_ONUS)) return NULL; dpd = kmalloc(sizeof(*dpd), GFP_KERNEL); diff --git a/drivers/net/wireless/ath/regd.c b/drivers/net/wireless/ath/regd.c index 7e15ed9ed31f..f8506037736f 100644 --- a/drivers/net/wireless/ath/regd.c +++ b/drivers/net/wireless/ath/regd.c @@ -116,7 +116,7 @@ static const struct ieee80211_regdomain ath_world_regdom_67_68_6A_6C = { static bool dynamic_country_user_possible(struct ath_regulatory *reg) { - if (config_enabled(CONFIG_ATH_REG_DYNAMIC_USER_CERT_TESTING)) + if (IS_ENABLED(CONFIG_ATH_REG_DYNAMIC_USER_CERT_TESTING)) return true; switch (reg->country_code) { @@ -188,7 +188,7 @@ static bool dynamic_country_user_possible(struct ath_regulatory *reg) static bool ath_reg_dyn_country_user_allow(struct ath_regulatory *reg) { - if (!config_enabled(CONFIG_ATH_REG_DYNAMIC_USER_REG_HINTS)) + if (!IS_ENABLED(CONFIG_ATH_REG_DYNAMIC_USER_REG_HINTS)) return false; if (!dynamic_country_user_possible(reg)) return false; diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index 66e0d718472f..43ed08dd8b01 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -27,7 +27,7 @@ * since we have enough virtual address range available. On 32-bit, we * ioremap the config space for each bus individually. */ -static const bool per_bus_mapping = !config_enabled(CONFIG_64BIT); +static const bool per_bus_mapping = !IS_ENABLED(CONFIG_64BIT); /* * Create a PCI config space window diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c index 1519d2ca7705..73137f4aac20 100644 --- a/drivers/tty/serial/ar933x_uart.c +++ b/drivers/tty/serial/ar933x_uart.c @@ -54,7 +54,7 @@ struct ar933x_uart_port { static inline bool ar933x_uart_console_enabled(void) { - return config_enabled(CONFIG_SERIAL_AR933X_CONSOLE); + return IS_ENABLED(CONFIG_SERIAL_AR933X_CONSOLE); } static inline unsigned int ar933x_uart_read(struct ar933x_uart_port *up, @@ -636,7 +636,7 @@ static int ar933x_uart_probe(struct platform_device *pdev) int ret; np = pdev->dev.of_node; - if (config_enabled(CONFIG_OF) && np) { + if (IS_ENABLED(CONFIG_OF) && np) { id = of_alias_get_id(np, "serial"); if (id < 0) { dev_err(&pdev->dev, "unable to get alias id, err=%d\n", diff --git a/include/linux/fence.h b/include/linux/fence.h index 523ea3fbbddd..8cc719a63728 100644 --- a/include/linux/fence.h +++ b/include/linux/fence.h @@ -358,7 +358,7 @@ u64 fence_context_alloc(unsigned num); #define FENCE_TRACE(f, fmt, args...) \ do { \ struct fence *__ff = (f); \ - if (config_enabled(CONFIG_FENCE_TRACE)) \ + if (IS_ENABLED(CONFIG_FENCE_TRACE)) \ pr_info("f %llu#%u: " fmt, \ __ff->context, __ff->seqno, ##args); \ } while (0) diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 760399a470bd..2bb5deb0012e 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -173,14 +173,14 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) mutex_release(&ctx->dep_map, 0, _THIS_IP_); DEBUG_LOCKS_WARN_ON(ctx->acquired); - if (!config_enabled(CONFIG_PROVE_LOCKING)) + if (!IS_ENABLED(CONFIG_PROVE_LOCKING)) /* * lockdep will normally handle this, * but fail without anyway */ ctx->done_acquire = 1; - if (!config_enabled(CONFIG_DEBUG_LOCK_ALLOC)) + if (!IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC)) /* ensure ww_acquire_fini will still fail if called twice */ ctx->acquired = ~0U; #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d49bfa1e53e6..1d3b7665d0be 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) return -EINVAL; if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { - if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || - !config_enabled(CONFIG_SECCOMP)) + if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + !IS_ENABLED(CONFIG_SECCOMP)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54d15eb2b701..ef6c6c3f9d8a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -542,7 +542,7 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; @@ -655,7 +655,7 @@ int __secure_computing(const struct seccomp_data *sd) int mode = current->seccomp.mode; int this_syscall; - if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return 0; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index da49c0b1fd32..b0e11b6dc994 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -715,7 +715,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, ASSERT_RTNL(); - if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) || + if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) || !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) return false; -- cgit v1.3-8-gc7d7 From 1f69bf9c6137602cd028c96b4f8329121ec89231 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 3 Aug 2016 13:46:36 -0700 Subject: jump_label: remove bug.h, atomic.h dependencies for HAVE_JUMP_LABEL The current jump_label.h includes bug.h for things such as WARN_ON(). This makes the header problematic for inclusion by kernel.h or any headers that kernel.h includes, since bug.h includes kernel.h (circular dependency). The inclusion of atomic.h is similarly problematic. Thus, this should make jump_label.h 'includable' from most places. Link: http://lkml.kernel.org/r/7060ce35ddd0d20b33bf170685e6b0fab816bdf2.1467837322.git.jbaron@akamai.com Signed-off-by: Jason Baron Cc: "David S. Miller" Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Heiko Carstens Cc: Joe Perches Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/jump_label.h | 46 ++++++++++++++++++++-------------------- kernel/jump_label.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 68904469fba1..661af564fae8 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -76,7 +76,6 @@ #include #include -#include extern bool static_key_initialized; @@ -115,20 +114,8 @@ enum jump_label_type { struct module; -#include - #ifdef HAVE_JUMP_LABEL -static inline int static_key_count(struct static_key *key) -{ - /* - * -1 means the first static_key_slow_inc() is in progress. - * static_key_enabled() must return true, so return 1 here. - */ - int n = atomic_read(&key->enabled); - return n >= 0 ? n : 1; -} - #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_MASK 1UL @@ -157,16 +144,29 @@ extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); +extern int static_key_count(struct static_key *key); +extern void static_key_enable(struct static_key *key); +extern void static_key_disable(struct static_key *key); +/* + * We should be using ATOMIC_INIT() for initializing .enabled, but + * the inclusion of atomic.h is problematic for inclusion of jump_label.h + * in 'low-level' headers. Thus, we are initializing .enabled with a + * raw value, but have added a BUILD_BUG_ON() to catch any issues in + * jump_label_init() see: kernel/jump_label.c. + */ #define STATIC_KEY_INIT_TRUE \ - { .enabled = ATOMIC_INIT(1), \ + { .enabled = { 1 }, \ .entries = (void *)JUMP_TYPE_TRUE } #define STATIC_KEY_INIT_FALSE \ - { .enabled = ATOMIC_INIT(0), \ + { .enabled = { 0 }, \ .entries = (void *)JUMP_TYPE_FALSE } #else /* !HAVE_JUMP_LABEL */ +#include +#include + static inline int static_key_count(struct static_key *key) { return atomic_read(&key->enabled); @@ -216,14 +216,6 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -#define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } -#define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } - -#endif /* HAVE_JUMP_LABEL */ - -#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE -#define jump_label_enabled static_key_enabled - static inline void static_key_enable(struct static_key *key) { int count = static_key_count(key); @@ -244,6 +236,14 @@ static inline void static_key_disable(struct static_key *key) static_key_slow_dec(key); } +#define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } +#define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } + +#endif /* HAVE_JUMP_LABEL */ + +#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#define jump_label_enabled static_key_enabled + /* -------------------------------------------------------------------------- */ /* diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0dbea887d625..f19aa02a8f48 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL @@ -56,6 +57,49 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) static void jump_label_update(struct static_key *key); +/* + * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h. + * The use of 'atomic_read()' requires atomic.h and its problematic for some + * kernel headers such as kernel.h and others. Since static_key_count() is not + * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok + * to have it be a function here. Similarly, for 'static_key_enable()' and + * 'static_key_disable()', which require bug.h. This should allow jump_label.h + * to be included from most/all places for HAVE_JUMP_LABEL. + */ +int static_key_count(struct static_key *key) +{ + /* + * -1 means the first static_key_slow_inc() is in progress. + * static_key_enabled() must return true, so return 1 here. + */ + int n = atomic_read(&key->enabled); + + return n >= 0 ? n : 1; +} +EXPORT_SYMBOL_GPL(static_key_count); + +void static_key_enable(struct static_key *key) +{ + int count = static_key_count(key); + + WARN_ON_ONCE(count < 0 || count > 1); + + if (!count) + static_key_slow_inc(key); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable(struct static_key *key) +{ + int count = static_key_count(key); + + WARN_ON_ONCE(count < 0 || count > 1); + + if (count) + static_key_slow_dec(key); +} +EXPORT_SYMBOL_GPL(static_key_disable); + void static_key_slow_inc(struct static_key *key) { int v, v1; @@ -235,6 +279,15 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; + /* + * Since we are initializing the static_key.enabled field with + * with the 'raw' int values (to avoid pulling in atomic.h) in + * jump_label.h, let's make sure that is safe. There are only two + * cases to check since we initialize to 0 or 1. + */ + BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); + BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); + jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); -- cgit v1.3-8-gc7d7 From a6ed3ea65d9868fdf9eff84e6fe4f666b8d14b02 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 5 Aug 2016 14:01:27 -0700 Subject: bpf: restore behavior of bpf_map_update_elem The introduction of pre-allocated hash elements inadvertently broke the behavior of bpf hash maps where users expected to call bpf_map_update_elem() without considering that the map can be full. Some programs do: old_value = bpf_map_lookup_elem(map, key); if (old_value) { ... prepare new_value on stack ... bpf_map_update_elem(map, key, new_value); } Before pre-alloc the update() for existing element would work even in 'map full' condition. Restore this behavior. The above program could have updated old_value in place instead of update() which would be faster and most programs use that approach, but sometimes the values are large and the programs use update() helper to do atomic replacement of the element. Note we cannot simply update element's value in-place like percpu hash map does and have to allocate extra num_possible_cpu elements and use this extra reserve when the map is full. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 84 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fff3650d52fc..570eeca7bdfa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -26,11 +26,18 @@ struct bpf_htab { struct bucket *buckets; void *elems; struct pcpu_freelist freelist; + void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; +enum extra_elem_state { + HTAB_NOT_AN_EXTRA_ELEM = 0, + HTAB_EXTRA_ELEM_FREE, + HTAB_EXTRA_ELEM_USED +}; + /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { @@ -38,7 +45,10 @@ struct htab_elem { struct bpf_htab *htab; struct pcpu_freelist_node fnode; }; - struct rcu_head rcu; + union { + struct rcu_head rcu; + enum extra_elem_state state; + }; u32 hash; char key[0] __aligned(8); }; @@ -113,6 +123,23 @@ free_elems: return err; } +static int alloc_extra_elems(struct bpf_htab *htab) +{ + void __percpu *pptr; + int cpu; + + pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = + HTAB_EXTRA_ELEM_FREE; + } + htab->extra_elems = pptr; + return 0; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { @@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (percpu) cost += (u64) round_up(htab->map.value_size, 8) * num_possible_cpus() * htab->map.max_entries; + else + cost += (u64) htab->elem_size * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ @@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } + if (!percpu) { + err = alloc_extra_elems(htab); + if (err) + goto free_buckets; + } + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { err = prealloc_elems_and_freelist(htab); if (err) - goto free_buckets; + goto free_extra_elems; } return &htab->map; +free_extra_elems: + free_percpu(htab->extra_elems); free_buckets: kvfree(htab->buckets); free_htab: @@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); kfree(l); - } static void htab_elem_free_rcu(struct rcu_head *head) @@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + if (l->state == HTAB_EXTRA_ELEM_USED) { + l->state = HTAB_EXTRA_ELEM_FREE; + return; + } + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { @@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, - bool percpu, bool onallcpus) + bool percpu, bool onallcpus, + bool old_elem_exists) { u32 size = htab->map.value_size; bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); struct htab_elem *l_new; void __percpu *pptr; + int err = 0; if (prealloc) { l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); if (!l_new) - return ERR_PTR(-E2BIG); + err = -E2BIG; } else { if (atomic_inc_return(&htab->count) > htab->map.max_entries) { atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); + err = -E2BIG; + } else { + l_new = kmalloc(htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); } - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return ERR_PTR(-ENOMEM); + } + + if (err) { + if (!old_elem_exists) + return ERR_PTR(err); + + /* if we're updating the existing element and the hash table + * is full, use per-cpu extra elems + */ + l_new = this_cpu_ptr(htab->extra_elems); + if (l_new->state != HTAB_EXTRA_ELEM_FREE) + return ERR_PTR(-E2BIG); + l_new->state = HTAB_EXTRA_ELEM_USED; + } else { + l_new->state = HTAB_NOT_AN_EXTRA_ELEM; } memcpy(l_new->key, key, key_size); @@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, + !!l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus); + hash, true, onallcpus, false); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map) htab_free_elems(htab); pcpu_freelist_destroy(&htab->freelist); } + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); } -- cgit v1.3-8-gc7d7 From 1eff9d322a444245c67515edb52bc0eb68374aa8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Aug 2016 15:35:16 -0600 Subject: block: rename bio bi_rw to bi_opf Since commit 63a4cc24867d, bio->bi_rw contains flags in the lower portion and the op code in the higher portions. This means that old code that relies on manually setting bi_rw is most likely going to be broken. Instead of letting that brokeness linger, rename the member, to force old and out-of-tree code to break at compile time instead of at runtime. No intended functional changes in this commit. Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 ++-- Documentation/device-mapper/dm-flakey.txt | 2 +- block/bio-integrity.c | 2 +- block/bio.c | 6 +++--- block/blk-core.c | 26 +++++++++++++------------- block/blk-merge.c | 8 ++++---- block/blk-mq.c | 10 +++++----- block/blk-throttle.c | 8 ++++---- block/cfq-iosched.c | 4 ++-- drivers/block/drbd/drbd_main.c | 8 ++++---- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_req.c | 6 +++--- drivers/block/drbd/drbd_worker.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/umem.c | 2 +- drivers/md/bcache/request.c | 12 ++++++------ drivers/md/bcache/super.c | 2 +- drivers/md/bcache/writeback.h | 2 +- drivers/md/dm-cache-target.c | 8 ++++---- drivers/md/dm-crypt.c | 4 ++-- drivers/md/dm-era-target.c | 2 +- drivers/md/dm-flakey.c | 6 +++--- drivers/md/dm-io.c | 6 +++--- drivers/md/dm-log-writes.c | 4 ++-- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-raid1.c | 10 +++++----- drivers/md/dm-region-hash.c | 4 ++-- drivers/md/dm-snap.c | 6 +++--- drivers/md/dm-stripe.c | 4 ++-- drivers/md/dm-thin.c | 8 ++++---- drivers/md/dm-zero.c | 2 +- drivers/md/dm.c | 10 +++++----- drivers/md/linear.c | 2 +- drivers/md/md.c | 4 ++-- drivers/md/multipath.c | 8 ++++---- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 8 ++++---- drivers/md/raid5-cache.c | 2 +- drivers/md/raid5.c | 20 ++++++++++---------- drivers/nvdimm/pmem.c | 4 ++-- fs/btrfs/check-integrity.c | 10 +++++----- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/btrfs/volumes.c | 6 +++--- include/linux/bio.h | 4 ++-- include/linux/blk-cgroup.h | 4 ++-- include/linux/blk_types.h | 15 ++++++++------- include/trace/events/bcache.h | 8 ++++---- include/trace/events/block.h | 14 +++++++------- kernel/trace/blktrace.c | 6 +++--- 51 files changed, 158 insertions(+), 157 deletions(-) (limited to 'kernel') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 026d13362aca..bcdb2b4c1f12 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -269,7 +269,7 @@ Arjan's proposed request priority scheme allows higher levels some broad requests which haven't aged too much on the queue. Potentially this priority could even be exposed to applications in some manner, providing higher level tunability. Time based aging avoids starvation of lower priority - requests. Some bits in the bi_rw flags field in the bio structure are + requests. Some bits in the bi_opf flags field in the bio structure are intended to be used for this priority information. @@ -432,7 +432,7 @@ struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; /* target device */ unsigned long bi_flags; /* status, command, etc */ - unsigned long bi_rw; /* low bits: r/w, high: priority */ + unsigned long bi_opf; /* low bits: r/w, high: priority */ unsigned int bi_vcnt; /* how may bio_vec's */ struct bvec_iter bi_iter; /* current index into bio_vec array */ diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt index 6ff5c2327227..c43030718cef 100644 --- a/Documentation/device-mapper/dm-flakey.txt +++ b/Documentation/device-mapper/dm-flakey.txt @@ -42,7 +42,7 @@ Optional feature parameters: : Either 'r' to corrupt reads or 'w' to corrupt writes. 'w' is incompatible with drop_writes. : The value (from 0-255) to write. - : Perform the replacement only if bio->bi_rw has all the + : Perform the replacement only if bio->bi_opf has all the selected flags set. Examples: diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f70cc3bdfd01..63f72f00c72e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -86,7 +86,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, bip->bip_bio = bio; bio->bi_integrity = bip; - bio->bi_rw |= REQ_INTEGRITY; + bio->bi_opf |= REQ_INTEGRITY; return bip; err: diff --git a/block/bio.c b/block/bio.c index 3f76a38a5e2d..f39477538fef 100644 --- a/block/bio.c +++ b/block/bio.c @@ -580,7 +580,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) */ bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); - bio->bi_rw = bio_src->bi_rw; + bio->bi_opf = bio_src->bi_opf; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; @@ -663,7 +663,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; - bio->bi_rw = bio_src->bi_rw; + bio->bi_opf = bio_src->bi_opf; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -873,7 +873,7 @@ int submit_bio_wait(struct bio *bio) init_completion(&ret.event); bio->bi_private = &ret; bio->bi_end_io = submit_bio_wait_endio; - bio->bi_rw |= REQ_SYNC; + bio->bi_opf |= REQ_SYNC; submit_bio(bio); wait_for_completion_io(&ret.event); diff --git a/block/blk-core.c b/block/blk-core.c index a687e9cc16c2..999442ec4601 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1029,7 +1029,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio) * Flush requests do not use the elevator so skip initialization. * This allows a request to share the flush and elevator data. */ - if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) + if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) return false; return true; @@ -1504,7 +1504,7 @@ EXPORT_SYMBOL_GPL(blk_add_request_payload); bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio) { - const int ff = bio->bi_rw & REQ_FAILFAST_MASK; + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; if (!ll_back_merge_fn(q, req, bio)) return false; @@ -1526,7 +1526,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio) { - const int ff = bio->bi_rw & REQ_FAILFAST_MASK; + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; if (!ll_front_merge_fn(q, req, bio)) return false; @@ -1648,8 +1648,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; - req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; - if (bio->bi_rw & REQ_RAHEAD) + req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK; + if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; req->errors = 0; @@ -1660,7 +1660,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { - const bool sync = !!(bio->bi_rw & REQ_SYNC); + const bool sync = !!(bio->bi_opf & REQ_SYNC); struct blk_plug *plug; int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; struct request *req; @@ -1681,7 +1681,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) { + if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; @@ -1728,7 +1728,7 @@ get_rq: /* * Add in META/PRIO flags, if set, before we get to the IO scheduler */ - rw_flags |= (bio->bi_rw & (REQ_META | REQ_PRIO)); + rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO)); /* * Grab a free request. This is might sleep but can not fail. @@ -1805,7 +1805,7 @@ static void handle_bad_sector(struct bio *bio) printk(KERN_INFO "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n", bdevname(bio->bi_bdev, b), - bio->bi_rw, + bio->bi_opf, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); } @@ -1918,9 +1918,9 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. */ - if ((bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) && + if ((bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - bio->bi_rw &= ~(REQ_PREFLUSH | REQ_FUA); + bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { err = 0; goto end_io; @@ -2219,7 +2219,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) * one. */ for (bio = rq->bio; bio; bio = bio->bi_next) { - if ((bio->bi_rw & ff) != ff) + if ((bio->bi_opf & ff) != ff) break; bytes += bio->bi_iter.bi_size; } @@ -2630,7 +2630,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) /* mixed attributes always follow the first bio */ if (req->cmd_flags & REQ_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; - req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } /* diff --git a/block/blk-merge.c b/block/blk-merge.c index 41cbd4878958..3eec75a9e91d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -186,7 +186,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, if (split) { /* there isn't chance to merge the splitted bio */ - split->bi_rw |= REQ_NOMERGE; + split->bi_opf |= REQ_NOMERGE; bio_chain(split, *bio); trace_block_split(q, split, (*bio)->bi_iter.bi_sector); @@ -616,9 +616,9 @@ void blk_rq_set_mixed_merge(struct request *rq) * Distributes the attributs to each bio. */ for (bio = rq->bio; bio; bio = bio->bi_next) { - WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) && - (bio->bi_rw & REQ_FAILFAST_MASK) != ff); - bio->bi_rw |= ff; + WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) && + (bio->bi_opf & REQ_FAILFAST_MASK) != ff); + bio->bi_opf |= ff; } rq->cmd_flags |= REQ_MIXED_MERGE; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6a63da101bc4..e931a0e8e73d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1234,7 +1234,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - if (rw_is_sync(bio_op(bio), bio->bi_rw)) + if (rw_is_sync(bio_op(bio), bio->bi_opf)) op_flags |= REQ_SYNC; trace_block_getrq(q, bio, op); @@ -1302,8 +1302,8 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) */ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_rw); - const int is_flush_fua = bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); + const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; unsigned int request_count = 0; @@ -1396,8 +1396,8 @@ done: */ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_rw); - const int is_flush_fua = bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); + const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_plug *plug; unsigned int request_count = 0; struct blk_map_ctx data; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c5494e403239..f1aba26f4719 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -821,8 +821,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) - bio->bi_rw |= REQ_THROTTLED; + if (!(bio->bi_opf & REQ_THROTTLED)) + bio->bi_opf |= REQ_THROTTLED; } /** @@ -1399,7 +1399,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, WARN_ON_ONCE(!rcu_read_lock_held()); /* see throtl_charge_bio() */ - if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) + if ((bio->bi_opf & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; spin_lock_irq(q->queue_lock); @@ -1478,7 +1478,7 @@ out: * being issued. */ if (!throttled) - bio->bi_rw &= ~REQ_THROTTLED; + bio->bi_opf &= ~REQ_THROTTLED; return throttled; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index acabba198de9..cc2f6dbd4303 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -918,7 +918,7 @@ static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) */ static inline bool cfq_bio_sync(struct bio *bio) { - return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC); + return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC); } /* @@ -2565,7 +2565,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_rw); + cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf); } static void diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 0501ae0c517b..100be556e613 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1663,13 +1663,13 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, struct bio *bio) { if (connection->agreed_pro_version >= 95) - return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | - (bio->bi_rw & REQ_FUA ? DP_FUA : 0) | - (bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) | + return (bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0) | + (bio->bi_opf & REQ_FUA ? DP_FUA : 0) | + (bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) | (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) | (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0); else - return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; + return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0; } /* Used to send write or TRIM aka REQ_DISCARD requests diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index df45713dfbe8..942384f34e22 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1564,7 +1564,7 @@ static void drbd_issue_peer_wsame(struct drbd_device *device, * drbd_submit_peer_request() * @device: DRBD device. * @peer_req: peer request - * @rw: flag field, see bio->bi_rw + * @rw: flag field, see bio->bi_opf * * May spread the pages to multiple bios, * depending on bio_add_page restrictions. diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 66b8e4bb74d8..de279fe4e4fd 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -288,7 +288,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) */ if (!ok && bio_op(req->master_bio) == REQ_OP_READ && - !(req->master_bio->bi_rw & REQ_RAHEAD) && + !(req->master_bio->bi_opf & REQ_RAHEAD) && !list_empty(&req->tl_requests)) req->rq_state |= RQ_POSTPONED; @@ -1137,7 +1137,7 @@ static int drbd_process_write_request(struct drbd_request *req) * replicating, in which case there is no point. */ if (unlikely(req->i.size == 0)) { /* The only size==0 bios we expect are empty flushes. */ - D_ASSERT(device, req->master_bio->bi_rw & REQ_PREFLUSH); + D_ASSERT(device, req->master_bio->bi_opf & REQ_PREFLUSH); if (remote) _req_mod(req, QUEUE_AS_DRBD_BARRIER); return remote; @@ -1176,7 +1176,7 @@ drbd_submit_req_private_bio(struct drbd_request *req) if (bio_op(bio) != REQ_OP_READ) type = DRBD_FAULT_DT_WR; - else if (bio->bi_rw & REQ_RAHEAD) + else if (bio->bi_opf & REQ_RAHEAD) type = DRBD_FAULT_DT_RA; else type = DRBD_FAULT_DT_RD; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 35dbb3dca47e..c6755c9a0aea 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -256,7 +256,7 @@ void drbd_request_endio(struct bio *bio) what = DISCARD_COMPLETED_WITH_ERROR; break; case REQ_OP_READ: - if (bio->bi_rw & REQ_RAHEAD) + if (bio->bi_opf & REQ_RAHEAD) what = READ_AHEAD_COMPLETED_WITH_ERROR; else what = READ_COMPLETED_WITH_ERROR; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 9393bc730acf..90fa4ac149db 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1157,7 +1157,7 @@ static int pkt_start_recovery(struct packet_data *pkt) bio_reset(pkt->bio); pkt->bio->bi_bdev = pd->bdev; - pkt->bio->bi_rw = REQ_WRITE; + bio_set_op_attrs(pkt->bio, REQ_OP_WRITE, 0); pkt->bio->bi_iter.bi_sector = new_sector; pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE; pkt->bio->bi_vcnt = pkt->frames; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index d0a3e6d4515f..be90e15854ed 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -535,7 +535,7 @@ static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio) *card->biotail = bio; bio->bi_next = NULL; card->biotail = &bio->bi_next; - if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card)) + if (bio->bi_opf & REQ_SYNC || !mm_check_plugged(card)) activate(card); spin_unlock_irq(&card->lock); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 69f16f43f8ab..4b177fe11ebb 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -208,7 +208,7 @@ static void bch_data_insert_start(struct closure *cl) * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. */ - bio->bi_rw &= ~(REQ_PREFLUSH|REQ_FUA); + bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA); do { unsigned i; @@ -405,7 +405,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) if (!congested && mode == CACHE_MODE_WRITEBACK && op_is_write(bio_op(bio)) && - (bio->bi_rw & REQ_SYNC)) + (bio->bi_opf & REQ_SYNC)) goto rescale; spin_lock(&dc->io_lock); @@ -668,7 +668,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.write_prio = 0; s->iop.error = 0; s->iop.flags = 0; - s->iop.flush_journal = (bio->bi_rw & (REQ_PREFLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0; s->iop.wq = bcache_wq; return s; @@ -796,8 +796,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_submit; } - if (!(bio->bi_rw & REQ_RAHEAD) && - !(bio->bi_rw & REQ_META) && + if (!(bio->bi_opf & REQ_RAHEAD) && + !(bio->bi_opf & REQ_META) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); @@ -920,7 +920,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) bch_writeback_add(dc); s->iop.bio = bio; - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { /* Also need to send a flush to the backing device */ struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, dc->disk.bio_split); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 88ef6d14cce3..95a4ca6ce6ff 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -347,7 +347,7 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags, for (i = 0; i < KEY_PTRS(k); i++) { struct bio *bio = bch_bbio_alloc(c); - bio->bi_rw = REQ_SYNC|REQ_META|op_flags; + bio->bi_opf = REQ_SYNC | REQ_META | op_flags; bio->bi_iter.bi_size = KEY_SIZE(k) << 9; bio->bi_end_io = uuid_endio; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 073a042aed24..301eaf565167 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -57,7 +57,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; - return bio->bi_rw & REQ_SYNC || + return bio->bi_opf & REQ_SYNC || in_use <= CUTOFF_WRITEBACK; } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 718744db62df..59b2c50562e4 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -788,7 +788,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) spin_lock_irqsave(&cache->lock, flags); if (cache->need_tick_bio && - !(bio->bi_rw & (REQ_FUA | REQ_PREFLUSH)) && + !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) && bio_op(bio) != REQ_OP_DISCARD) { pb->tick = true; cache->need_tick_bio = false; @@ -830,7 +830,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) static int bio_triggers_commit(struct cache *cache, struct bio *bio) { - return bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); + return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); } /* @@ -1069,7 +1069,7 @@ static void dec_io_migrations(struct cache *cache) static bool discard_or_flush(struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || - bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); + bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) @@ -1980,7 +1980,7 @@ static void process_deferred_bios(struct cache *cache) bio = bio_list_pop(&bios); - if (bio->bi_rw & REQ_PREFLUSH) + if (bio->bi_opf & REQ_PREFLUSH) process_flush_bio(cache, bio); else if (bio_op(bio) == REQ_OP_DISCARD) process_discard_bio(cache, &structs, bio); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8f2e3e2ffd26..4e9784b4e0ac 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1136,7 +1136,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_rw); + bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf); } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) @@ -1915,7 +1915,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) * - for REQ_PREFLUSH device-mapper core ensures that no IO is in-flight * - for REQ_OP_DISCARD caller must use flush if IO ordering matters */ - if (unlikely(bio->bi_rw & REQ_PREFLUSH || + if (unlikely(bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)) { bio->bi_bdev = cc->dev->bdev; if (bio_sectors(bio)) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 2faf49d8f4d7..bf2b2676cb8a 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1542,7 +1542,7 @@ static int era_map(struct dm_target *ti, struct bio *bio) /* * REQ_PREFLUSH bios carry no data, so we're not interested in them. */ - if (!(bio->bi_rw & REQ_PREFLUSH) && + if (!(bio->bi_opf & REQ_PREFLUSH) && (bio_data_dir(bio) == WRITE) && !metadata_current_marked(era->md, block)) { defer_bio(era, bio); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 19db13e99466..97e446d54a15 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -16,7 +16,7 @@ #define DM_MSG_PREFIX "flakey" #define all_corrupt_bio_flags_match(bio, fc) \ - (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) + (((bio)->bi_opf & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) /* * Flakey: Used for testing only, simulates intermittent, @@ -266,9 +266,9 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " - "(rw=%c bi_rw=%u bi_sector=%llu cur_bytes=%u)\n", + "(rw=%c bi_opf=%u bi_sector=%llu cur_bytes=%u)\n", bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, - (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, (unsigned long long)bio->bi_iter.bi_sector, bio_bytes); } } diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index daa03e41654a..0bf1a12e35fe 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -505,9 +505,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, * New collapsed (a)synchronous interface. * * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw. - * If you fail to do one of these, the IO will be submitted to the disk after - * q->unplug_delay, which defaults to 3ms in blk-settings.c. + * the queue with blk_unplug() some time later or set REQ_SYNC in + * io_req->bi_opf. If you fail to do one of these, the IO will be submitted to + * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. */ int dm_io(struct dm_io_request *io_req, unsigned num_regions, struct dm_io_region *where, unsigned long *sync_error_bits) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index b5dbf7a0515e..4ab68033f9d1 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -555,8 +555,8 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) struct bio_vec bv; size_t alloc_size; int i = 0; - bool flush_bio = (bio->bi_rw & REQ_PREFLUSH); - bool fua_bio = (bio->bi_rw & REQ_FUA); + bool flush_bio = (bio->bi_opf & REQ_PREFLUSH); + bool fua_bio = (bio->bi_opf & REQ_FUA); bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD); pb->block = NULL; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d7107d23b897..ac734e5bbe48 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -661,7 +661,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bio->bi_error = 0; bio->bi_bdev = pgpath->path.dev->bdev; - bio->bi_rw |= REQ_FAILFAST_TRANSPORT; + bio->bi_opf |= REQ_FAILFAST_TRANSPORT; if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index dac55b254a09..bdf1606f67bc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -657,7 +657,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct mirror *m; struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = bio->bi_rw & WRITE_FLUSH_FUA, + .bi_op_flags = bio->bi_opf & WRITE_FLUSH_FUA, .mem.type = DM_IO_BIO, .mem.ptr.bio = bio, .notify.fn = write_callback, @@ -704,7 +704,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if ((bio->bi_rw & REQ_PREFLUSH) || + if ((bio->bi_opf & REQ_PREFLUSH) || (bio_op(bio) == REQ_OP_DISCARD)) { bio_list_add(&sync, bio); continue; @@ -1217,7 +1217,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) * If region is not in-sync queue the bio. */ if (!r || (r == -EWOULDBLOCK)) { - if (bio->bi_rw & REQ_RAHEAD) + if (bio->bi_opf & REQ_RAHEAD) return -EWOULDBLOCK; queue_bio(ms, bio, rw); @@ -1253,7 +1253,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) * We need to dec pending if this was a write. */ if (rw == WRITE) { - if (!(bio->bi_rw & REQ_PREFLUSH) && + if (!(bio->bi_opf & REQ_PREFLUSH) && bio_op(bio) != REQ_OP_DISCARD) dm_rh_dec(ms->rh, bio_record->write_region); return error; @@ -1262,7 +1262,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) if (error == -EOPNOTSUPP) goto out; - if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) + if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) goto out; if (unlikely(error)) { diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index b11813431f31..85c32b22a420 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -398,7 +398,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) region_t region = dm_rh_bio_to_region(rh, bio); int recovering = 0; - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { rh->flush_failure = 1; return; } @@ -526,7 +526,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio->bi_rw & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD) + if (bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ce2a910709f7..c65feeada864 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1680,7 +1680,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { bio->bi_bdev = s->cow->bdev; return DM_MAPIO_REMAPPED; } @@ -1800,7 +1800,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { if (!dm_bio_get_target_bio_nr(bio)) bio->bi_bdev = s->origin->bdev; else @@ -2286,7 +2286,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) bio->bi_bdev = o->dev->bdev; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) return DM_MAPIO_REMAPPED; if (bio_data_dir(bio) != WRITE) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 83f1d4667195..28193a57bf47 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -286,7 +286,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) uint32_t stripe; unsigned target_bio_nr; - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; @@ -383,7 +383,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) if (!error) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) + if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) return error; if (error == -EOPNOTSUPP) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 197ea2003400..d1c05c12a9db 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio) static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) { - return (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) && + return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && dm_thin_changed_this_transaction(tc->td); } @@ -870,7 +870,7 @@ static void __inc_remap_and_issue_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || + if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { @@ -1717,7 +1717,7 @@ static void __remap_and_issue_shared_cell(void *context, while ((bio = bio_list_pop(&cell->bios))) { if ((bio_data_dir(bio) == WRITE) || - (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || + (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD)) bio_list_add(&info->defer_bios, bio); else { @@ -2635,7 +2635,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || + if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD) { thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 618b8752dcf1..b616f11d8473 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -37,7 +37,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_READ: - if (bio->bi_rw & REQ_RAHEAD) { + if (bio->bi_opf & REQ_RAHEAD) { /* readahead of null bytes only wastes buffer cache */ return -EIO; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dfa09e14e847..fa9b1cb4438a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -798,12 +798,12 @@ static void dec_pending(struct dm_io *io, int error) if (io_error == DM_ENDIO_REQUEUE) return; - if ((bio->bi_rw & REQ_PREFLUSH) && bio->bi_iter.bi_size) { + if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { /* * Preflush done for flush with data, reissue * without REQ_PREFLUSH. */ - bio->bi_rw &= ~REQ_PREFLUSH; + bio->bi_opf &= ~REQ_PREFLUSH; queue_io(md, bio); } else { /* done with normal IO or empty flush */ @@ -964,7 +964,7 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; - BUG_ON(bio->bi_rw & REQ_PREFLUSH); + BUG_ON(bio->bi_opf & REQ_PREFLUSH); BUG_ON(bi_size > *tio->len_ptr); BUG_ON(n_sectors > bi_size); *tio->len_ptr -= bi_size - n_sectors; @@ -1252,7 +1252,7 @@ static void __split_and_process_bio(struct mapped_device *md, start_io_acct(ci.io); - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { ci.bio = &ci.md->flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); @@ -1290,7 +1290,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { dm_put_live_table(md, srcu_idx); - if (!(bio->bi_rw & REQ_RAHEAD)) + if (!(bio->bi_opf & REQ_RAHEAD)) queue_io(md, bio); else bio_io_error(bio); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 70ff888d25d0..86f5d435901d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -221,7 +221,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) struct bio *split; sector_t start_sector, end_sector, data_offset; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 2c3ab6f5e6be..d646f6e444f0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -285,7 +285,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) */ sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ - bio->bi_rw &= ~REQ_NOMERGE; + bio->bi_opf &= ~REQ_NOMERGE; mddev->pers->make_request(mddev, bio); cpu = part_stat_lock(); @@ -414,7 +414,7 @@ static void md_submit_flush_data(struct work_struct *ws) /* an empty barrier - all done */ bio_endio(bio); else { - bio->bi_rw &= ~REQ_PREFLUSH; + bio->bi_opf &= ~REQ_PREFLUSH; mddev->pers->make_request(mddev, bio); } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 4974682842ae..673efbd6fc47 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -91,7 +91,7 @@ static void multipath_end_request(struct bio *bio) if (!bio->bi_error) multipath_end_bh_io(mp_bh, 0); - else if (!(bio->bi_rw & REQ_RAHEAD)) { + else if (!(bio->bi_opf & REQ_RAHEAD)) { /* * oops, IO error: */ @@ -112,7 +112,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } @@ -135,7 +135,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; mp_bh->bio.bi_bdev = multipath->rdev->bdev; - mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; + mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; generic_make_request(&mp_bh->bio); @@ -360,7 +360,7 @@ static void multipathd(struct md_thread *thread) bio->bi_iter.bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; - bio->bi_rw |= REQ_FAILFAST_TRANSPORT; + bio->bi_opf |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; generic_make_request(bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c3d439083212..258986a2699d 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -458,7 +458,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) struct md_rdev *tmp_dev; struct bio *split; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 46168ef2e279..21dc00eb1989 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1043,8 +1043,8 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) unsigned long flags; const int op = bio_op(bio); const int rw = bio_data_dir(bio); - const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - const unsigned long do_flush_fua = (bio->bi_rw & + const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + const unsigned long do_flush_fua = (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -2318,7 +2318,7 @@ read_more: raid_end_bio_io(r1_bio); } else { const unsigned long do_sync - = r1_bio->master_bio->bi_rw & REQ_SYNC; + = r1_bio->master_bio->bi_opf & REQ_SYNC; if (bio) { r1_bio->bios[r1_bio->read_disk] = mddev->ro ? IO_BLOCKED : NULL; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ed29fc899f06..0e4efcd10795 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1054,8 +1054,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) int i; const int op = bio_op(bio); const int rw = bio_data_dir(bio); - const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + const unsigned long do_fua = (bio->bi_opf & REQ_FUA); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1440,7 +1440,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) struct bio *split; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } @@ -2533,7 +2533,7 @@ read_more: return; } - do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); + do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); slot = r10_bio->read_slot; printk_ratelimited( KERN_ERR diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 5504ce2bac06..51f76ddbe265 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -536,7 +536,7 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) bio_endio(bio); return 0; } - bio->bi_rw &= ~REQ_PREFLUSH; + bio->bi_opf &= ~REQ_PREFLUSH; return -EAGAIN; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d189e894b921..8912407a4dd0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -806,7 +806,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh dd_idx = 0; while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) dd_idx++; - if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw || + if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) goto unlock_out; @@ -1003,7 +1003,7 @@ again: pr_debug("%s: for %llu schedule op %d on disc %d\n", __func__, (unsigned long long)sh->sector, - bi->bi_rw, i); + bi->bi_opf, i); atomic_inc(&sh->count); if (sh != head_sh) atomic_inc(&head_sh->count); @@ -1014,7 +1014,7 @@ again: bi->bi_iter.bi_sector = (sh->sector + rdev->data_offset); if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) - bi->bi_rw |= REQ_NOMERGE; + bi->bi_opf |= REQ_NOMERGE; if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); @@ -1055,7 +1055,7 @@ again: pr_debug("%s: for %llu schedule op %d on " "replacement disc %d\n", __func__, (unsigned long long)sh->sector, - rbi->bi_rw, i); + rbi->bi_opf, i); atomic_inc(&sh->count); if (sh != head_sh) atomic_inc(&head_sh->count); @@ -1088,7 +1088,7 @@ again: if (op_is_write(op)) set_bit(STRIPE_DEGRADED, &sh->state); pr_debug("skip op %d on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); + bi->bi_opf, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); } @@ -1619,9 +1619,9 @@ again: while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { - if (wbi->bi_rw & REQ_FUA) + if (wbi->bi_opf & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); - if (wbi->bi_rw & REQ_SYNC) + if (wbi->bi_opf & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); if (bio_op(wbi) == REQ_OP_DISCARD) set_bit(R5_Discard, &dev->flags); @@ -5154,7 +5154,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) DEFINE_WAIT(w); bool do_prepare; - if (unlikely(bi->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = r5l_handle_flush_request(conf->log, bi); if (ret == 0) @@ -5237,7 +5237,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) (unsigned long long)logical_sector); sh = raid5_get_active_stripe(conf, new_sector, previous, - (bi->bi_rw & REQ_RAHEAD), 0); + (bi->bi_opf & REQ_RAHEAD), 0); if (sh) { if (unlikely(previous)) { /* expansion might have moved on while waiting for a @@ -5305,7 +5305,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && - (bi->bi_rw & REQ_SYNC) && + (bi->bi_opf & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); release_stripe_plug(mddev, sh); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 20bae50c231d..571a6c7ee2fc 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -128,7 +128,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct pmem_device *pmem = q->queuedata; struct nd_region *nd_region = to_region(pmem); - if (bio->bi_rw & REQ_FLUSH) + if (bio->bi_opf & REQ_FLUSH) nvdimm_flush(nd_region); do_acct = nd_iostat_start(bio, &start); @@ -144,7 +144,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) if (do_acct) nd_iostat_end(bio, start); - if (bio->bi_rw & REQ_FUA) + if (bio->bi_opf & REQ_FUA) nvdimm_flush(nd_region); bio_endio(bio); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5d5cae05818d..66789471b49d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2945,7 +2945,7 @@ static void __btrfsic_submit_bio(struct bio *bio) printk(KERN_INFO "submit_bio(rw=%d,0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_rw, bio->bi_vcnt, + bio_op(bio), bio->bi_opf, bio->bi_vcnt, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); @@ -2976,18 +2976,18 @@ static void __btrfsic_submit_bio(struct bio *bio) btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, bio, &bio_is_patched, - NULL, bio->bi_rw); + NULL, bio->bi_opf); while (i > 0) { i--; kunmap(bio->bi_io_vec[i].bv_page); } kfree(mapped_datav); - } else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) { + } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO "submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_rw, bio->bi_bdev); + bio_op(bio), bio->bi_opf, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -3005,7 +3005,7 @@ static void __btrfsic_submit_bio(struct bio *bio) block->never_written = 0; block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = bio->bi_rw; + block->submit_bio_bh_rw = bio->bi_opf; block->orig_bio_bh_private = bio->bi_private; block->orig_bio_bh_end_io.bio = bio->bi_end_io; block->next_in_same_bio = NULL; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87dad552e39a..59febfb8d04a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -870,7 +870,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (bio->bi_rw & REQ_SYNC) + if (bio->bi_opf & REQ_SYNC) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b0f421f332ae..2f5975954ccf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8209,7 +8209,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio_op(bio), bio->bi_rw, + btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -8373,7 +8373,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8411,7 +8411,7 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bb0addce7558..51f125508771 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6012,7 +6012,7 @@ static void btrfs_end_bio(struct bio *bio) else btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_READ_ERRS); - if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS); btrfs_dev_stat_print_on_error(dev); @@ -6089,7 +6089,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, bio->bi_next = NULL; spin_lock(&device->io_lock); - if (bio->bi_rw & REQ_SYNC) + if (bio->bi_opf & REQ_SYNC) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; @@ -6127,7 +6127,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); pr_debug("btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu " - "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_rw, + "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_iter.bi_size); rcu_read_unlock(); diff --git a/include/linux/bio.h b/include/linux/bio.h index e09a8895fc31..59ffaa68b11b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -95,7 +95,7 @@ static inline bool bio_is_rw(struct bio *bio) static inline bool bio_mergeable(struct bio *bio) { - if (bio->bi_rw & REQ_NOMERGE_FLAGS) + if (bio->bi_opf & REQ_NOMERGE_FLAGS) return false; return true; @@ -318,7 +318,7 @@ struct bio_integrity_payload { static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { - if (bio->bi_rw & REQ_INTEGRITY) + if (bio->bi_opf & REQ_INTEGRITY) return bio->bi_integrity; return NULL; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index f77150a4a96a..10648e300c93 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -714,9 +714,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_rw, + blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_rw, 1); + blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_opf, 1); } rcu_read_unlock(); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f254eb264924..436f43f87da9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -27,8 +27,9 @@ struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; int bi_error; - unsigned int bi_rw; /* bottom bits req flags, - * top bits REQ_OP + unsigned int bi_opf; /* bottom bits req flags, + * top bits REQ_OP. Use + * accessors. */ unsigned short bi_flags; /* status, command, etc */ unsigned short bi_ioprio; @@ -89,13 +90,13 @@ struct bio { }; #define BIO_OP_SHIFT (8 * sizeof(unsigned int) - REQ_OP_BITS) -#define bio_op(bio) ((bio)->bi_rw >> BIO_OP_SHIFT) +#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) #define bio_set_op_attrs(bio, op, op_flags) do { \ WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (bio)->bi_rw &= ((1 << BIO_OP_SHIFT) - 1); \ - (bio)->bi_rw |= ((unsigned int) (op) << BIO_OP_SHIFT); \ - (bio)->bi_rw |= op_flags; \ + (bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1); \ + (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \ + (bio)->bi_opf |= op_flags; \ } while (0) #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) @@ -138,7 +139,7 @@ struct bio { /* * Request flags. For use in the cmd_flags field of struct request, and in - * bi_rw of struct bio. Note that some flags are only valid in either one. + * bi_opf of struct bio. Note that some flags are only valid in either one. */ enum rq_flag_bits { /* common flags */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 65673d8b81ac..d336b890e31f 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->sector = bio->bi_iter.bi_sector; __entry->orig_sector = bio->bi_iter.bi_sector - 16; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); ), @@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); ), @@ -138,7 +138,7 @@ TRACE_EVENT(bcache_read, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); __entry->cache_hit = hit; __entry->bypass = bypass; @@ -170,7 +170,7 @@ TRACE_EVENT(bcache_write, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); __entry->writeback = writeback; __entry->bypass = bypass; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 5a2a7592068f..8f3a163b8166 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -274,7 +274,7 @@ TRACE_EVENT(block_bio_bounce, bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -313,7 +313,7 @@ TRACE_EVENT(block_bio_complete, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); ), @@ -341,7 +341,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -409,7 +409,7 @@ TRACE_EVENT(block_bio_queue, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -439,7 +439,7 @@ DECLARE_EVENT_CLASS(block_get_rq, __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? bio_sectors(bio) : 0; blk_fill_rwbs(__entry->rwbs, bio ? bio_op(bio) : 0, - bio ? bio->bi_rw : 0, __entry->nr_sector); + bio ? bio->bi_opf : 0, __entry->nr_sector); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -573,7 +573,7 @@ TRACE_EVENT(block_split, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -617,7 +617,7 @@ TRACE_EVENT(block_bio_remap, __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw, + blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_size); ), diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb345cd11883..7598e6ca817a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -776,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_rw, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -881,7 +881,7 @@ static void blk_add_trace_split(void *ignore, __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), &rpdu); } @@ -915,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, sizeof(r), &r); } -- cgit v1.3-8-gc7d7 From 574673c231a5fad1560249cc3a598907acb36cf9 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Thu, 4 Aug 2016 09:52:09 +0200 Subject: printk: Remove unnecessary #ifdef CONFIG_PRINTK In commit 874f9c7da9a4 ("printk: create pr_ functions"), new pr_level defines were added to printk.c. These new defines are guarded by an #ifdef CONFIG_PRINTK - however, there is already a surrounding #ifdef CONFIG_PRINTK starting a lot earlier in line 249 which means the newly introduced #ifdef is unnecessary. Let's remove it to avoid confusion. Signed-off-by: Andreas Ziegler Cc: Joe Perches Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a5ef95ca18c9..a37fc8cf8e84 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1930,7 +1930,6 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); -#ifdef CONFIG_PRINTK #define define_pr_level(func, loglevel) \ asmlinkage __visible void func(const char *fmt, ...) \ { \ @@ -1949,7 +1948,6 @@ define_pr_level(__pr_err, LOGLEVEL_ERR); define_pr_level(__pr_warn, LOGLEVEL_WARNING); define_pr_level(__pr_notice, LOGLEVEL_NOTICE); define_pr_level(__pr_info, LOGLEVEL_INFO); -#endif int vprintk_default(int level, const char *fmt, va_list args) { -- cgit v1.3-8-gc7d7 From f3b0946d629c8bfbd3e5f038e30cb9c711a35f10 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 Jul 2016 17:18:33 +0100 Subject: genirq/msi: Make sure PCI MSIs are activated early Bharat Kumar Gogada reported issues with the generic MSI code, where the end-point ended up with garbage in its MSI configuration (both for the vector and the message). It turns out that the two MSI paths in the kernel are doing slightly different things: generic MSI: disable MSI -> allocate MSI -> enable MSI -> setup EP PCI MSI: disable MSI -> allocate MSI -> setup EP -> enable MSI And it turns out that end-points are allowed to latch the content of the MSI configuration registers as soon as MSIs are enabled. In Bharat's case, the end-point ends up using whatever was there already, which is not what you want. In order to make things converge, we introduce a new MSI domain flag (MSI_FLAG_ACTIVATE_EARLY) that is unconditionally set for PCI/MSI. When set, this flag forces the programming of the end-point as soon as the MSIs are allocated. A consequence of this is that we have an extra activate in irq_startup, but that should be without much consequence. tglx: - Several people reported a VMWare regression with PCI/MSI-X passthrough. It turns out that the patch also cures that issue. - We need to have a look at the MSI disable interrupt path, where we write the msg to all zeros without disabling MSI in the PCI device. Is that correct? Fixes: 52f518a3a7c2 "x86/MSI: Use hierarchical irqdomains to manage MSI interrupts" Reported-and-tested-by: Bharat Kumar Gogada Reported-and-tested-by: Foster Snowhill Reported-by: Matthias Prager Reported-by: Jason Taylor Signed-off-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1468426713-31431-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 2 ++ include/linux/msi.h | 2 ++ kernel/irq/msi.c | 11 +++++++++++ 3 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index a02981efdad5..eafa6138a6b8 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1411,6 +1411,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) pci_msi_domain_update_chip_ops(info); + info->flags |= MSI_FLAG_ACTIVATE_EARLY; + domain = msi_create_irq_domain(fwnode, info, parent); if (!domain) return NULL; diff --git a/include/linux/msi.h b/include/linux/msi.h index 4f0bfe5912b2..e8c81fbd5f9c 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -270,6 +270,8 @@ enum { MSI_FLAG_MULTI_PCI_MSI = (1 << 2), /* Support PCI MSIX interrupts */ MSI_FLAG_PCI_MSIX = (1 << 3), + /* Needs early activate, required for PCI */ + MSI_FLAG_ACTIVATE_EARLY = (1 << 4), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 54999350162c..19e9dfbe97fa 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, else dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { + struct irq_data *irq_data; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_domain_activate_irq(irq_data); + } } return 0; -- cgit v1.3-8-gc7d7 From 46c8f0b077a838eb1f6169bb370aab8ed98f7630 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 8 Aug 2016 16:29:07 -0400 Subject: timers: Fix get_next_timer_interrupt() computation The tick_nohz_stop_sched_tick() routine is not properly canceling the sched timer when nothing is pending, because get_next_timer_interrupt() is no longer returning KTIME_MAX in that case. This causes periodic interrupts when none are needed. When determining the next interrupt time, we first use __next_timer_interrupt() to get the first expiring timer in the timer wheel. If no timer is found, we return the base clock value plus NEXT_TIMER_MAX_DELTA to indicate there is no timer in the timer wheel. Back in get_next_timer_interrupt(), we set the "expires" value by converting the timer wheel expiry (in ticks) to a nsec value. But we don't want to do this if the timer wheel expiry value indicates no timer; we want to return KTIME_MAX. Prior to commit 500462a9de65 ("timers: Switch to a non-cascading wheel") we checked base->active_timers to see if any timers were active, and if not, we didn't touch the expiry value and so properly returned KTIME_MAX. Now we don't have active_timers. To fix this, we now just check the timer wheel expiry value to see if it is "now + NEXT_TIMER_MAX_DELTA", and if it is, we don't try to compute a new value based on it, but instead simply let the KTIME_MAX value in expires remain. Fixes: 500462a9de65 "timers: Switch to a non-cascading wheel" Signed-off-by: Chris Metcalf Cc: Frederic Weisbecker Cc: Christoph Lameter Cc: John Stultz Link: http://lkml.kernel.org/r/1470688147-22287-1-git-send-email-cmetcalf@mellanox.com Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 555670a5143c..32bf6f75a8fe 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; + bool is_max_delta; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); + is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* * We have a fresh next event. Check whether we can forward the base: @@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) expires = basem; base->is_idle = false; } else { - expires = basem + (nextevt - basej) * TICK_NSEC; + if (!is_max_delta) + expires = basem + (nextevt - basej) * TICK_NSEC; /* * If we expect to sleep more than a tick, mark the base idle: */ -- cgit v1.3-8-gc7d7 From a0cba2179ea4c1820fce2ee046b6ed90ecc56196 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 9 Aug 2016 10:48:18 -0700 Subject: Revert "printk: create pr_ functions" This reverts commit 874f9c7da9a4acbc1b9e12ca722579fb50e4d142. Geert Uytterhoeven reports: "This change seems to have an (unintendent?) side-effect. Before, pr_*() calls without a trailing newline characters would be printed with a newline character appended, both on the console and in the output of the dmesg command. After this commit, no new line character is appended, and the output of the next pr_*() call of the same type may be appended, like in: - Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000 - Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM) + Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)" Joe Perches says: "No, that is not intentional. The newline handling code inside vprintk_emit is a bit involved and for now I suggest a revert until this has all the same behavior as earlier" Reported-by: Geert Uytterhoeven Requested-by: Joe Perches Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 48 +++++++++++++++--------------------------------- kernel/printk/internal.h | 16 ++++++---------- kernel/printk/nmi.c | 13 ++----------- kernel/printk/printk.c | 25 +++---------------------- 4 files changed, 26 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 8dc155dab3ed..696a56be7d3e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -266,39 +266,21 @@ extern asmlinkage void dump_stack(void) __cold; * and other debug macros are compiled out unless either DEBUG is defined * or CONFIG_DYNAMIC_DEBUG is set. */ - -#ifdef CONFIG_PRINTK - -asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...); - -#define pr_emerg(fmt, ...) __pr_emerg(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) __pr_alert(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) __pr_crit(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) __pr_err(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn(fmt, ...) __pr_warn(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_notice(fmt, ...) __pr_notice(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) __pr_info(pr_fmt(fmt), ##__VA_ARGS__) - -#else - -#define pr_emerg(fmt, ...) printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) -#define pr_notice(fmt, ...) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) - -#endif - -#define pr_warning pr_warn - +#define pr_emerg(fmt, ...) \ + printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) \ + printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) \ + printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) \ + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warning(fmt, ...) \ + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn pr_warning +#define pr_notice(fmt, ...) \ + printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) \ + printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* * Like KERN_CONT, pr_cont() should only be used when continuing * a line with no newline ('\n') enclosed. Otherwise it defaults diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 5d4505f30083..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,11 +16,9 @@ */ #include -typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt, - va_list args); +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); -__printf(2, 0) -int vprintk_default(int level, const char *fmt, va_list args); +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI @@ -33,10 +31,9 @@ extern raw_spinlock_t logbuf_lock; * via per-CPU variable. */ DECLARE_PER_CPU(printk_func_t, printk_func); -__printf(2, 0) -static inline int vprintk_func(int level, const char *fmt, va_list args) +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { - return this_cpu_read(printk_func)(level, fmt, args); + return this_cpu_read(printk_func)(fmt, args); } extern atomic_t nmi_message_lost; @@ -47,10 +44,9 @@ static inline int get_nmi_message_lost(void) #else /* CONFIG_PRINTK_NMI */ -__printf(2, 0) -static inline int vprintk_func(int level, const char *fmt, va_list args) +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { - return vprintk_default(level, fmt, args); + return vprintk_default(fmt, args); } static inline int get_nmi_message_lost(void) diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bc3eeb1ae6da..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); * one writer running. But the buffer might get flushed from another * CPU, so we need to be careful. */ -static int vprintk_nmi(int level, const char *fmt, va_list args) +static int vprintk_nmi(const char *fmt, va_list args) { struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); int add = 0; @@ -79,16 +79,7 @@ again: if (!len) smp_rmb(); - if (level != LOGLEVEL_DEFAULT) { - add = snprintf(s->buffer + len, sizeof(s->buffer) - len, - KERN_SOH "%c", '0' + level); - add += vsnprintf(s->buffer + len + add, - sizeof(s->buffer) - len - add, - fmt, args); - } else { - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, - fmt, args); - } + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a37fc8cf8e84..eea6dbc2d8cf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1930,26 +1930,7 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); -#define define_pr_level(func, loglevel) \ -asmlinkage __visible void func(const char *fmt, ...) \ -{ \ - va_list args; \ - \ - va_start(args, fmt); \ - vprintk_default(loglevel, fmt, args); \ - va_end(args); \ -} \ -EXPORT_SYMBOL(func) - -define_pr_level(__pr_emerg, LOGLEVEL_EMERG); -define_pr_level(__pr_alert, LOGLEVEL_ALERT); -define_pr_level(__pr_crit, LOGLEVEL_CRIT); -define_pr_level(__pr_err, LOGLEVEL_ERR); -define_pr_level(__pr_warn, LOGLEVEL_WARNING); -define_pr_level(__pr_notice, LOGLEVEL_NOTICE); -define_pr_level(__pr_info, LOGLEVEL_INFO); - -int vprintk_default(int level, const char *fmt, va_list args) +int vprintk_default(const char *fmt, va_list args) { int r; @@ -1959,7 +1940,7 @@ int vprintk_default(int level, const char *fmt, va_list args) return r; } #endif - r = vprintk_emit(0, level, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); return r; } @@ -1992,7 +1973,7 @@ asmlinkage __visible int printk(const char *fmt, ...) int r; va_start(args, fmt); - r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args); + r = vprintk_func(fmt, args); va_end(args); return r; -- cgit v1.3-8-gc7d7 From 06f4e94898918bcad00cdd4d349313a439d6911e Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Tue, 9 Aug 2016 11:25:01 +0800 Subject: cpuset: make sure new tasks conform to the current config of the cpuset A new task inherits cpus_allowed and mems_allowed masks from its parent, but if someone changes cpuset's config by writing to cpuset.cpus/cpuset.mems before this new task is inserted into the cgroup's task list, the new task won't be updated accordingly. Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cpuset.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..c27e53326bef 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2069,6 +2069,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +void cpuset_fork(struct task_struct *task) +{ + if (task_css_is_root(task, cpuset_cgrp_id)) + return; + + set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + task->mems_allowed = current->mems_allowed; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, @@ -2079,6 +2093,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = true, }; -- cgit v1.3-8-gc7d7 From 0b8f1e2e26bfc6b9abe3f0f3faba2cb0eecb9fb9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Aug 2016 14:37:24 +0200 Subject: perf/core: Fix sideband list-iteration vs. event ordering NULL pointer deference crash Vegard Nossum reported that perf fuzzing generates a NULL pointer dereference crash: > Digging a bit deeper into this, it seems the event itself is getting > created by perf_event_open() and it gets added to the pmu_event_list > through: > > perf_event_open() > - perf_event_alloc() > - account_event() > - account_pmu_sb_event() > - attach_sb_event() > > so at this point the event is being attached but its ->ctx is still > NULL. It seems like ->ctx is set just a bit later in > perf_event_open(), though. > > But before that, __schedule() comes along and creates a stack trace > similar to the one above: > > __schedule() > - __perf_event_task_sched_out() > - perf_iterate_sb() > - perf_iterate_sb_cpu() > - event_filter_match() > - perf_cgroup_match() > - __get_cpu_context() > - (dereference ctx which is NULL) > > So I guess the question is... should the event be attached (= put on > the list) before ->ctx gets set? Or should the cgroup code check for a > NULL ->ctx? The latter seems like the simplest solution. Moving the list-add later creates a bit of a mess. Reported-by: Vegard Nossum Tested-by: Vegard Nossum Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: f2fb6bef9251 ("perf/core: Optimize side-band event delivery") Link: http://lkml.kernel.org/r/20160804123724.GN6862@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a19550d80ab1..87d02b8cb87e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1716,8 +1716,8 @@ static inline int pmu_filter_match(struct perf_event *event) static inline int event_filter_match(struct perf_event *event) { - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event) && pmu_filter_match(event); + return (event->cpu == -1 || event->cpu == smp_processor_id()) && + perf_cgroup_match(event) && pmu_filter_match(event); } static void @@ -1737,8 +1737,8 @@ event_sched_out(struct perf_event *event, * maintained, otherwise bogus information is return * via read() for time_enabled, time_running: */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { + if (event->state == PERF_EVENT_STATE_INACTIVE && + !event_filter_match(event)) { delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; @@ -2236,10 +2236,15 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); - event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; + /* + * Ensures that if we can observe event->ctx, both the event and ctx + * will be 'complete'. See perf_iterate_sb_cpu(). + */ + smp_store_release(&event->ctx, ctx); + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; @@ -5969,6 +5974,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { + /* + * Skip events that are not fully formed yet; ensure that + * if we observe event->ctx, both event and ctx will be + * complete enough. See perf_install_in_context(). + */ + if (!smp_load_acquire(&event->ctx)) + continue; + if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) -- cgit v1.3-8-gc7d7 From db4a835601b73cf8d6cd8986381d966b8e13d2d9 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Tue, 2 Aug 2016 00:48:12 -0700 Subject: perf/core: Set cgroup in CPU contexts for new cgroup events There's a perf stat bug easy to observer on a machine with only one cgroup: $ perf stat -e cycles -I 1000 -C 0 -G / # time counts unit events 1.000161699 cycles / 2.000355591 cycles / 3.000565154 cycles / 4.000951350 cycles / We'd expect some output there. The underlying problem is that there is an optimization in perf_cgroup_sched_{in,out}() that skips the switch of cgroup events if the old and new cgroups in a task switch are the same. This optimization interacts with the current code in two ways that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a cgroup event matches the current task. These are: 1. On creation of the first cgroup event in a CPU: In current code, cpuctx->cpu is only set in perf_cgroup_sched_in, but due to the aforesaid optimization, perf_cgroup_sched_in will run until the next cgroup switches in that CPU. This may happen late or never happen, depending on system's number of cgroups, CPU load, etc. 2. On deletion of the last cgroup event in a cpuctx: In list_del_event, cpuctx->cgrp is set NULL. Any new cgroup event will not be sched in because cpuctx->cgrp == NULL until a cgroup switch occurs and perf_cgroup_sched_in is executed (updating cpuctx->cgrp). This patch fixes both problems by setting cpuctx->cgrp in list_add_event, mirroring what list_del_event does when removing a cgroup event from CPU context, as introduced in: commit 68cacd29167b ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()") With this patch, cpuctx->cgrp is always set/clear when installing/removing the first/last cgroup event in/from the CPU context. With cpuctx->cgrp correctly set, event_filter_match works as intended when events are sched in/out. After the fix, the output is as expected: $ perf stat -e cycles -I 1000 -a -G / # time counts unit events 1.004699159 627342882 cycles / 2.007397156 615272690 cycles / 3.010019057 616726074 cycles / Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ kernel/events/core.c | 54 ++++++++++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8ed4326164cc..2b6b43cc0dd5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -743,7 +743,9 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; +#ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ +#endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; }; @@ -769,7 +771,9 @@ struct perf_cpu_context { unsigned int hrtimer_active; struct pmu *unique_pmu; +#ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; +#endif }; struct perf_output_handle { diff --git a/kernel/events/core.c b/kernel/events/core.c index 87d02b8cb87e..1903b8f3a705 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event, } } } + +/* + * Update cpuctx->cgrp so that it is set when first cgroup event is added and + * cleared when last cgroup event is removed. + */ +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) + return; + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + /* + * Because cgroup events are always per-cpu events, + * this will always be called from the right CPU. + */ + cpuctx = __get_cpu_context(ctx); + cpuctx->cgrp = add ? event->cgrp : NULL; +} + #else /* !CONFIG_CGROUP_PERF */ static inline bool @@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { } + +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ +} + #endif /* @@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); @@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) - ctx->nr_cgroups++; + list_update_cgroup_event(event, ctx, true); list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; @@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx; - WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - /* - * Because cgroup events are always per-cpu events, this will - * always be called from the right CPU. - */ - cpuctx = __get_cpu_context(ctx); - /* - * If there are no more cgroup events then clear cgrp to avoid - * stale pointer in update_cgrp_time_from_cpuctx(). - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } + list_update_cgroup_event(event, ctx, false); ctx->nr_events--; if (event->attr.inherit_stat) -- cgit v1.3-8-gc7d7 From a23eadfae2fd45536a355b785d5a1533e1955c22 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Tue, 19 Jul 2016 11:44:50 +0200 Subject: sched/deadline: Fix wrap-around in DL heap Current code in cpudeadline.c has a bug in re-heapifying when adding a new element at the end of the heap, because a deadline value of 0 is temporarily set in the new elem, then cpudl_change_key() is called with the actual elem deadline as param. However, the function compares the new deadline to set with the one previously in the elem, which is 0. So, if current absolute deadlines grew so much to have negative values as s64, the comparison in cpudl_change_key() makes the wrong decision. Instead, as from dl_time_before(), the kernel should handle correctly abs deadlines wrap-arounds. This patch fixes the problem with a minimally invasive change that forces cpudl_change_key() to heapify up in this case. Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Cc: Juri Lelli Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1468921493-10054-2-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5be58820465c..d4184498c9f5 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) if (old_idx == IDX_INVALID) { cp->size++; - cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].dl = dl; cp->elements[cp->size - 1].cpu = cpu; cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); -- cgit v1.3-8-gc7d7 From b8922125e4790fa237a8a4204562ecf457ef54bb Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Sat, 9 Jul 2016 15:54:22 +0800 Subject: sched/fair: Fix typo in sync_throttle() We should update cfs_rq->throttled_clock_task, not pcfs_rq->throttle_clock_task. The effects of this bug was probably occasionally erratic group scheduling, particularly in cgroups-intense workloads. Signed-off-by: Xunlei Pang [ Added changelog. ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 55e16d30bd99 ("sched/fair: Rework throttle_count sync") Link: http://lkml.kernel.org/r/1468050862-18864-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4088eedea763..039de34f1521 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -- cgit v1.3-8-gc7d7 From 6075620b0590eaf22f10ce88833eb20a57f760d6 Mon Sep 17 00:00:00 2001 From: Giovanni Gherdovich Date: Fri, 5 Aug 2016 10:21:56 +0200 Subject: sched/cputime: Mitigate performance regression in times()/clock_gettime() Commit: 6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") fixed a problem whereby clock_nanosleep() followed by clock_gettime() could allow a task to wake early. It addressed the problem by calling the scheduling classes update_curr() when the cputimer starts. Said change induced a considerable performance regression on the syscalls times() and clock_gettimes(CLOCK_PROCESS_CPUTIME_ID). There are some debuggers and applications that monitor their own performance that accidentally depend on the performance of these specific calls. This patch mitigates the performace loss by prefetching data in the CPU cache, as stalls due to cache misses appear to be where most time is spent in our benchmarks. Here are the performance gain of this patch over v4.7-rc7 on a Sandy Bridge box with 32 logical cores and 2 NUMA nodes. The test is repeated with a variable number of threads, from 2 to 4*num_cpus; the results are in seconds and correspond to the average of 10 runs; the percentage gain is computed with (before-after)/before so a positive value is an improvement (it's faster). The improvement varies between a few percents for 5-20 threads and more than 10% for 2 or >20 threads. pound_clock_gettime: threads 4.7-rc7 patched 4.7-rc7 [num] [secs] [secs (percent)] 2 3.48 3.06 ( 11.83%) 5 3.33 3.25 ( 2.40%) 8 3.37 3.26 ( 3.30%) 12 3.32 3.37 ( -1.60%) 21 4.01 3.90 ( 2.74%) 30 3.63 3.36 ( 7.41%) 48 3.71 3.11 ( 16.27%) 79 3.75 3.16 ( 15.74%) 110 3.81 3.25 ( 14.80%) 128 3.88 3.31 ( 14.76%) pound_times: threads 4.7-rc7 patched 4.7-rc7 [num] [secs] [secs (percent)] 2 3.65 3.25 ( 11.03%) 5 3.45 3.17 ( 7.92%) 8 3.52 3.22 ( 8.69%) 12 3.29 3.36 ( -2.04%) 21 4.07 3.92 ( 3.78%) 30 3.87 3.40 ( 12.17%) 48 3.79 3.16 ( 16.61%) 79 3.88 3.28 ( 15.42%) 110 3.90 3.38 ( 13.35%) 128 4.00 3.38 ( 15.45%) pound_clock_gettime and pound_clock_gettime are two benchmarks included in the MMTests framework. They launch a given number of threads which repeatedly call times() or clock_gettimes(). The results above can be reproduced with cloning MMTests from github.com and running the "poundtime" workload: $ git clone https://github.com/gormanm/mmtests.git $ cd mmtests $ cp configs/config-global-dhp__workload_poundtime config $ ./run-mmtests.sh --run-monitor $(uname -r) The above will run "poundtime" measuring the kernel currently running on the machine; Once a new kernel is installed and the machine rebooted, running again $ cd mmtests $ ./run-mmtests.sh --run-monitor $(uname -r) will produce results to compare with. A comparison table will be output with: $ cd mmtests/work/log $ ../../compare-kernels.sh the table will contain a lot of entries; grepping for "Amean" (as in "arithmetic mean") will give the tables presented above. The source code for the two benchmarks is reported at the end of this changelog for clairity. The cache misses addressed by this patch were found using a combination of `perf top`, `perf record` and `perf annotate`. The incriminated lines were found to be struct sched_entity *curr = cfs_rq->curr; and delta_exec = now - curr->exec_start; in the function update_curr() from kernel/sched/fair.c. This patch prefetches the data from memory just before update_curr is called in the interested execution path. A comparison of the total number of cycles before and after the patch follows; the data is obtained using `perf stat -r 10 -ddd ` running over the same sequence of number of threads used above (a positive gain is an improvement): threads cycles before cycles after gain 2 19,699,563,964 +-1.19% 17,358,917,517 +-1.85% 11.88% 5 47,401,089,566 +-2.96% 45,103,730,829 +-0.97% 4.85% 8 80,923,501,004 +-3.01% 71,419,385,977 +-0.77% 11.74% 12 112,326,485,473 +-0.47% 110,371,524,403 +-0.47% 1.74% 21 193,455,574,299 +-0.72% 180,120,667,904 +-0.36% 6.89% 30 315,073,519,013 +-1.64% 271,222,225,950 +-1.29% 13.92% 48 321,969,515,332 +-1.48% 273,353,977,321 +-1.16% 15.10% 79 337,866,003,422 +-0.97% 289,462,481,538 +-1.05% 14.33% 110 338,712,691,920 +-0.78% 290,574,233,170 +-0.77% 14.21% 128 348,384,794,006 +-0.50% 292,691,648,206 +-0.66% 15.99% A comparison of cache miss vs total cache loads ratios, before and after the patch (again from the `perf stat -r 10 -ddd ` tables): threads L1 misses/total*100 L1 misses/total*100 gain before after 2 7.43 +-4.90% 7.36 +-4.70% 0.94% 5 13.09 +-4.74% 13.52 +-3.73% -3.28% 8 13.79 +-5.61% 12.90 +-3.27% 6.45% 12 11.57 +-2.44% 8.71 +-1.40% 24.72% 21 12.39 +-3.92% 9.97 +-1.84% 19.53% 30 13.91 +-2.53% 11.73 +-2.28% 15.67% 48 13.71 +-1.59% 12.32 +-1.97% 10.14% 79 14.44 +-0.66% 13.40 +-1.06% 7.20% 110 15.86 +-0.50% 14.46 +-0.59% 8.83% 128 16.51 +-0.32% 15.06 +-0.78% 8.78% As a final note, the following shows the evolution of performance figures in the "poundtime" benchmark and pinpoints commit 6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") as a major source of degradation, mostly unaddressed to this day (figures expressed in seconds). pound_clock_gettime: threads parent of 6e998916dfe3 4.7-rc7 6e998916dfe3 itself 2 2.23 3.68 ( -64.56%) 3.48 (-55.48%) 5 2.83 3.78 ( -33.42%) 3.33 (-17.43%) 8 2.84 4.31 ( -52.12%) 3.37 (-18.76%) 12 3.09 3.61 ( -16.74%) 3.32 ( -7.17%) 21 3.14 4.63 ( -47.36%) 4.01 (-27.71%) 30 3.28 5.75 ( -75.37%) 3.63 (-10.80%) 48 3.02 6.05 (-100.56%) 3.71 (-22.99%) 79 2.88 6.30 (-118.90%) 3.75 (-30.26%) 110 2.95 6.46 (-119.00%) 3.81 (-29.24%) 128 3.05 6.42 (-110.08%) 3.88 (-27.04%) pound_times: threads parent of 6e998916dfe3 4.7-rc7 6e998916dfe3 itself 2 2.27 3.73 ( -64.71%) 3.65 (-61.14%) 5 2.78 3.77 ( -35.56%) 3.45 (-23.98%) 8 2.79 4.41 ( -57.71%) 3.52 (-26.05%) 12 3.02 3.56 ( -17.94%) 3.29 ( -9.08%) 21 3.10 4.61 ( -48.74%) 4.07 (-31.34%) 30 3.33 5.75 ( -72.53%) 3.87 (-16.01%) 48 2.96 6.06 (-105.04%) 3.79 (-28.10%) 79 2.88 6.24 (-116.83%) 3.88 (-34.81%) 110 2.98 6.37 (-114.08%) 3.90 (-31.12%) 128 3.10 6.35 (-104.61%) 4.00 (-28.87%) The source code of the two benchmarks follows. To compile the two: NR_THREADS=42 for FILE in pound_times pound_clock_gettime; do gcc -lrt -O2 -lpthread -DNUM_THREADS=$NR_THREADS $FILE.c -o $FILE done ==== BEGIN pound_times.c ==== struct tms start; void *pound (void *threadid) { struct tms end; int oldutime = 0; int utime; int i; for (i = 0; i < 5000000 / NUM_THREADS; i++) { times(&end); utime = ((int)end.tms_utime - (int)start.tms_utime); if (oldutime > utime) { printf("utime decreased, was %d, now %d!\n", oldutime, utime); } oldutime = utime; } pthread_exit(NULL); } int main() { pthread_t th[NUM_THREADS]; long i; times(&start); for (i = 0; i < NUM_THREADS; i++) { pthread_create (&th[i], NULL, pound, (void *)i); } pthread_exit(NULL); return 0; } ==== END pound_times.c ==== ==== BEGIN pound_clock_gettime.c ==== void *pound (void *threadid) { struct timespec ts; int rc, i; unsigned long prev = 0, this = 0; for (i = 0; i < 5000000 / NUM_THREADS; i++) { rc = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); if (rc < 0) perror("clock_gettime"); this = (ts.tv_sec * 1000000000) + ts.tv_nsec; if (0 && this < prev) printf("%lu ns timewarp at iteration %d\n", prev - this, i); prev = this; } pthread_exit(NULL); } int main() { pthread_t th[NUM_THREADS]; long rc, i; pid_t pgid; for (i = 0; i < NUM_THREADS; i++) { rc = pthread_create(&th[i], NULL, pound, (void *)i); if (rc < 0) perror("pthread_create"); } pthread_exit(NULL); return 0; } ==== END pound_clock_gettime.c ==== Suggested-by: Mike Galbraith Signed-off-by: Giovanni Gherdovich Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470385316-15027-2-git-send-email-ggherdovich@suse.cz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c883fe8e440..2a906f20fba7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -2971,6 +2972,23 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); +/* + * The function fair_sched_class.update_curr accesses the struct curr + * and its field curr->exec_start; when called from task_sched_runtime(), + * we observe a high rate of cache misses in practice. + * Prefetching this data results in improved performance. + */ +static inline void prefetch_curr_exec_start(struct task_struct *p) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *curr = (&p->se)->cfs_rq->curr; +#else + struct sched_entity *curr = (&task_rq(p)->cfs)->curr; +#endif + prefetch(curr); + prefetch(&curr->exec_start); +} + /* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's @@ -3005,6 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * thread, breaking clock_gettime(). */ if (task_current(rq, p) && task_on_rq_queued(p)) { + prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); } -- cgit v1.3-8-gc7d7 From c0c8c9fa210c9a042060435f17e40ba4a76d6d6f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 4 Aug 2016 09:42:20 +0800 Subject: sched/deadline: Fix lock pinning warning during CPU hotplug The following warning can be triggered by hot-unplugging the CPU on which an active SCHED_DEADLINE task is running on: WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:3531 lock_release+0x690/0x6a0 releasing a pinned lock Call Trace: dump_stack+0x99/0xd0 __warn+0xd1/0xf0 ? dl_task_timer+0x1a1/0x2b0 warn_slowpath_fmt+0x4f/0x60 ? sched_clock+0x13/0x20 lock_release+0x690/0x6a0 ? enqueue_pushable_dl_task+0x9b/0xa0 ? enqueue_task_dl+0x1ca/0x480 _raw_spin_unlock+0x1f/0x40 dl_task_timer+0x1a1/0x2b0 ? push_dl_task.part.31+0x190/0x190 WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:3649 lock_unpin_lock+0x181/0x1a0 unpinning an unpinned lock Call Trace: dump_stack+0x99/0xd0 __warn+0xd1/0xf0 warn_slowpath_fmt+0x4f/0x60 lock_unpin_lock+0x181/0x1a0 dl_task_timer+0x127/0x2b0 ? push_dl_task.part.31+0x190/0x190 As per the comment before this code, its safe to drop the RQ lock here, and since we (potentially) change rq, unpin and repin to avoid the splat. Signed-off-by: Wanpeng Li [ Rewrote changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470274940-17976-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fcb7f0217ff4..1ce8867283dc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * * XXX figure out if select_task_rq_dl() deals with offline cpus. */ - if (unlikely(!rq->online)) + if (unlikely(!rq->online)) { + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); + rf.cookie = lockdep_pin_lock(&rq->lock); + } /* * Queueing this task back might have overloaded rq, check if we need -- cgit v1.3-8-gc7d7 From 229ce631574761870a2ac938845fadbd07f35caa Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Jul 2016 16:15:56 +0800 Subject: locking/pvqspinlock: Fix double hash race When the lock holder vCPU is racing with the queue head: CPU 0 (lock holder) CPU1 (queue head) =================== ================= spin_lock(); spin_lock(); pv_kick_node(): pv_wait_head_or_lock(): if (!lp) { lp = pv_hash(lock, pn); xchg(&l->locked, _Q_SLOW_VAL); } WRITE_ONCE(pn->state, vcpu_halted); cmpxchg(&pn->state, vcpu_halted, vcpu_hashed); WRITE_ONCE(l->locked, _Q_SLOW_VAL); (void)pv_hash(lock, pn); In this case, lock holder inserts the pv_node of queue head into the hash table and set _Q_SLOW_VAL unnecessary. This patch avoids it by restoring/setting vcpu_hashed state after failing adaptive locking spinning. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Pan Xinhui Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Link: http://lkml.kernel.org/r/1468484156-4521-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 37649e69056c..8a99abf58080 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) goto gotlock; } } - WRITE_ONCE(pn->state, vcpu_halted); + WRITE_ONCE(pn->state, vcpu_hashed); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); -- cgit v1.3-8-gc7d7 From c2ace36b884de9330c4149064ae8d212d2e0d9ee Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 13 Jul 2016 18:23:34 +0800 Subject: locking/pvqspinlock: Fix a bug in qstat_read() It's obviously wrong to set stat to NULL. So lets remove it. Otherwise it is always zero when we check the latency of kick/wake. Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1468405414-3700-1-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_stat.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 22e025309845..b9d031516254 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, */ if ((counter == qstat_pv_latency_kick) || (counter == qstat_pv_latency_wake)) { - stat = 0; if (kicks) stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); } -- cgit v1.3-8-gc7d7 From f9bcf1e0e0145323ba2cf72ecad5264ff3883eb1 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 11 Aug 2016 13:36:35 +0800 Subject: sched/cputime: Fix steal time accounting Commit: 57430218317 ("sched/cputime: Count actually elapsed irq & softirq time") ... didn't take steal time into consideration with passing the noirqtime kernel parameter. As Paolo pointed out before: | Why not? If idle=poll, for example, any time the guest is suspended (and | thus cannot poll) does count as stolen time. This patch fixes it by reducing steal time from idle time accounting when the noirqtime parameter is true. The average idle time drops from 56.8% to 54.75% for nohz idle kvm guest(noirqtime, idle=poll, four vCPUs running on one pCPU). Signed-off-by: Wanpeng Li Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Radim Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470893795-3527-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 1934f658c036..8b9bcc5a58fa 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -508,13 +508,20 @@ void account_process_tick(struct task_struct *p, int user_tick) */ void account_idle_ticks(unsigned long ticks) { - + cputime_t cputime, steal; if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - account_idle_time(jiffies_to_cputime(ticks)); + cputime = cputime_one_jiffy; + steal = steal_account_process_time(cputime); + + if (steal >= cputime) + return; + + cputime -= steal; + account_idle_time(cputime); } /* -- cgit v1.3-8-gc7d7 From 26f2c75cd2cf10a6120ef02ca9a94db77cc9c8e0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Aug 2016 14:58:24 +0200 Subject: sched/cputime: Fix omitted ticks passed in parameter Commit: f9bcf1e0e014 ("sched/cputime: Fix steal time accounting") ... fixes a leak on steal time accounting but forgets to account the ticks passed in parameters, assuming there is only one to take into account. Let's consider that parameter back. Signed-off-by: Frederic Weisbecker Acked-by: Wanpeng Li Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: linux-tip-commits@vger.kernel.org Link: http://lkml.kernel.org/r/20160811125822.GB4214@lerouge Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8b9bcc5a58fa..9858266fb0b3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -509,12 +509,13 @@ void account_process_tick(struct task_struct *p, int user_tick) void account_idle_ticks(unsigned long ticks) { cputime_t cputime, steal; + if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - cputime = cputime_one_jiffy; + cputime = jiffies_to_cputime(ticks); steal = steal_account_process_time(cputime); if (steal >= cputime) -- cgit v1.3-8-gc7d7 From 62822e2ec4ad091ba31f823f577ef80db52e3c2c Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 11 Aug 2016 14:49:29 -0700 Subject: PM / hibernate: Restore processor state before using per-CPU variables Restore the processor state before calling any other functions to ensure per-CPU variables can be used with KASLR memory randomization. Tracing functions use per-CPU variables (GS based on x86) and one was called just before restoring the processor state fully. It resulted in a double fault when both the tracing & the exception handler functions tried to use a per-CPU variable. Fixes: bb3632c6101b (PM / sleep: trace events for suspend/resume) Reported-and-tested-by: Borislav Petkov Reported-by: Jiri Kosina Tested-by: Rafael J. Wysocki Tested-by: Jiri Kosina Signed-off-by: Thomas Garnier Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0ee1df0a0bd6..61761aa7cc19 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -300,12 +300,12 @@ static int create_image(int platform_mode) save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ + restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - /* Restore control flow magically appears here */ - restore_processor_state(); if (!in_suspend) events_check_enabled = false; -- cgit v1.3-8-gc7d7 From 747ea55e4f78fd980350c39570a986b8c1c3e4aa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 12 Aug 2016 22:17:17 +0200 Subject: bpf: fix bpf_skb_in_cgroup helper naming While hashing out BPF's current_task_under_cgroup helper bits, it came to discussion that the skb_in_cgroup helper name was suboptimally chosen. Tejun says: So, I think in_cgroup should mean that the object is in that particular cgroup while under_cgroup in the subhierarchy of that cgroup. Let's rename the other subhierarchy test to under too. I think that'd be a lot less confusing going forward. [...] It's more intuitive and gives us the room to implement the real "in" test if ever necessary in the future. Since this touches uapi bits, we need to change this as long as v4.8 is not yet officially released. Thus, change the helper enum and rename related bits. Fixes: 4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto") Reference: http://patchwork.ozlabs.org/patch/658500/ Suggested-by: Sargun Dhillon Suggested-by: Tejun Heo Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 ++-- kernel/bpf/verifier.c | 4 ++-- net/core/filter.c | 10 +++++----- samples/bpf/bpf_helpers.h | 4 ++-- samples/bpf/test_cgrp2_tc_kern.c | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da218fec6056..9e5fc168c8a3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -339,7 +339,7 @@ enum bpf_func_id { BPF_FUNC_skb_change_type, /** - * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb + * bpf_skb_under_cgroup(skb, map, index) - Check cgroup2 membership of skb * @skb: pointer to skb * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type * @index: index of the cgroup in the bpf_map @@ -348,7 +348,7 @@ enum bpf_func_id { * == 1 skb succeeded the cgroup2 descendant test * < 0 error */ - BPF_FUNC_skb_in_cgroup, + BPF_FUNC_skb_under_cgroup, /** * bpf_get_hash_recalc(skb) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7094c69ac199..daea765d72e6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1053,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - if (func_id != BPF_FUNC_skb_in_cgroup) + if (func_id != BPF_FUNC_skb_under_cgroup) goto error; break; default: @@ -1075,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; - case BPF_FUNC_skb_in_cgroup: + case BPF_FUNC_skb_under_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; diff --git a/net/core/filter.c b/net/core/filter.c index b5add4ef0d1d..bd9bf2e5fafa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2317,7 +2317,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } #ifdef CONFIG_SOCK_CGROUP_DATA -static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *)(long)r1; struct bpf_map *map = (struct bpf_map *)(long)r2; @@ -2340,8 +2340,8 @@ static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); } -static const struct bpf_func_proto bpf_skb_in_cgroup_proto = { - .func = bpf_skb_in_cgroup, +static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { + .func = bpf_skb_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -2421,8 +2421,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; #ifdef CONFIG_SOCK_CGROUP_DATA - case BPF_FUNC_skb_in_cgroup: - return &bpf_skb_in_cgroup_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; #endif default: return sk_filter_func_proto(func_id); diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index 217c8d507f2e..7927a090fa0d 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -72,8 +72,8 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flag (void *) BPF_FUNC_l3_csum_replace; static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = (void *) BPF_FUNC_l4_csum_replace; -static int (*bpf_skb_in_cgroup)(void *ctx, void *map, int index) = - (void *) BPF_FUNC_skb_in_cgroup; +static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_skb_under_cgroup; #if defined(__x86_64__) diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c index 2732c37c8d5b..10ff73404e3a 100644 --- a/samples/bpf/test_cgrp2_tc_kern.c +++ b/samples/bpf/test_cgrp2_tc_kern.c @@ -57,7 +57,7 @@ int handle_egress(struct __sk_buff *skb) bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg), eth->h_proto, ip6h->nexthdr); return TC_ACT_OK; - } else if (bpf_skb_in_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) { + } else if (bpf_skb_under_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) { bpf_trace_printk(pass_msg, sizeof(pass_msg)); return TC_ACT_OK; } else { -- cgit v1.3-8-gc7d7 From 924d8696751c4b9e58263bc82efdafcf875596a6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 16 Aug 2016 10:46:38 +0100 Subject: PM / hibernate: Fix rtree_next_node() to avoid walking off list ends rtree_next_node() walks the linked list of leaf nodes to find the next block of pages in the struct memory_bitmap. If it walks off the end of the list of nodes, it walks the list of memory zones to find the next region of memory. If it walks off the end of the list of zones, it returns false. This leaves the struct bm_position's node and zone pointers pointing at their respective struct list_heads in struct mem_zone_bm_rtree. memory_bm_find_bit() uses struct bm_position's node and zone pointers to avoid walking lists and trees if the next bit appears in the same node/zone. It handles these values being stale. Swap rtree_next_node()s 'step then test' to 'test-next then step', this means if we reach the end of memory we return false and leave the node and zone pointers as they were. This fixes a panic on resume using AMD Seattle with 64K pages: [ 6.868732] Freezing user space processes ... (elapsed 0.000 seconds) done. [ 6.875753] Double checking all user space processes after OOM killer disable... (elapsed 0.000 seconds) [ 6.896453] PM: Using 3 thread(s) for decompression. [ 6.896453] PM: Loading and decompressing image data (5339 pages)... [ 7.318890] PM: Image loading progress: 0% [ 7.323395] Unable to handle kernel paging request at virtual address 00800040 [ 7.330611] pgd = ffff000008df0000 [ 7.334003] [00800040] *pgd=00000083fffe0003, *pud=00000083fffe0003, *pmd=00000083fffd0003, *pte=0000000000000000 [ 7.344266] Internal error: Oops: 96000005 [#1] PREEMPT SMP [ 7.349825] Modules linked in: [ 7.352871] CPU: 2 PID: 1 Comm: swapper/0 Tainted: G W I 4.8.0-rc1 #4737 [ 7.360512] Hardware name: AMD Overdrive/Supercharger/Default string, BIOS ROD1002C 04/08/2016 [ 7.369109] task: ffff8003c0220000 task.stack: ffff8003c0280000 [ 7.375020] PC is at set_bit+0x18/0x30 [ 7.378758] LR is at memory_bm_set_bit+0x24/0x30 [ 7.383362] pc : [] lr : [] pstate: 60000045 [ 7.390743] sp : ffff8003c0283b00 [ 7.473551] [ 7.475031] Process swapper/0 (pid: 1, stack limit = 0xffff8003c0280020) [ 7.481718] Stack: (0xffff8003c0283b00 to 0xffff8003c0284000) [ 7.800075] Call trace: [ 7.887097] [] set_bit+0x18/0x30 [ 7.891876] [] duplicate_memory_bitmap.constprop.38+0x54/0x70 [ 7.899172] [] snapshot_write_next+0x22c/0x47c [ 7.905166] [] load_image_lzo+0x754/0xa88 [ 7.910725] [] swsusp_read+0x144/0x230 [ 7.916025] [] load_image_and_restore+0x58/0x90 [ 7.922105] [] software_resume+0x2f0/0x338 [ 7.927752] [] do_one_initcall+0x38/0x11c [ 7.933314] [] kernel_init_freeable+0x14c/0x1ec [ 7.939395] [] kernel_init+0x10/0xfc [ 7.944520] [] ret_from_fork+0x10/0x40 [ 7.949820] Code: d2800022 8b400c21 f9800031 9ac32043 (c85f7c22) [ 7.955909] ---[ end trace 0024a5986e6ff323 ]--- [ 7.960529] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b Here struct mem_zone_bm_rtree's start_pfn has been returned instead of struct rtree_node's addr as the node/zone pointers are corrupt after we walked off the end of the lists during mark_unsafe_pages(). This behaviour was exposed by commit 6dbecfd345a6 ("PM / hibernate: Simplify mark_unsafe_pages()"), which caused mark_unsafe_pages() to call duplicate_memory_bitmap(), which uses memory_bm_find_bit() after walking off the end of the memory bitmap. Fixes: 3a20cb177961 (PM / Hibernate: Implement position keeping in radix tree) Signed-off-by: James Morse [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d90df926b59f..2ba18691fe76 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) */ static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur.node = list_entry(bm->cur.node->list.next, - struct rtree_node, list); - if (&bm->cur.node->list != &bm->cur.zone->leaves) { + if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); bm->cur.node_pfn += BM_BITS_PER_BLOCK; bm->cur.node_bit = 0; touch_softlockup_watchdog(); @@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /* No more nodes, goto next zone */ - bm->cur.zone = list_entry(bm->cur.zone->list.next, + if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur.zone->list != &bm->zones) { bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; -- cgit v1.3-8-gc7d7 From 7afafc8a44bf0ab841b17d450b02aedb3a138985 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 16 Aug 2016 10:59:35 +0300 Subject: block: Fix secure erase Commit 288dab8a35a0 ("block: add a separate operation type for secure erase") split REQ_OP_SECURE_ERASE from REQ_OP_DISCARD without considering all the places REQ_OP_DISCARD was being used to mean either. Fix those. Signed-off-by: Adrian Hunter Fixes: 288dab8a35a0 ("block: add a separate operation type for secure erase") Signed-off-by: Jens Axboe --- block/bio.c | 21 +++++++++++---------- block/blk-merge.c | 33 +++++++++++++++++++-------------- block/elevator.c | 2 +- drivers/mmc/card/block.c | 1 + drivers/mmc/card/queue.c | 3 ++- drivers/mmc/card/queue.h | 4 +++- include/linux/bio.h | 10 ++++++++-- include/linux/blkdev.h | 6 ++++-- kernel/trace/blktrace.c | 2 +- 9 files changed, 50 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/block/bio.c b/block/bio.c index f39477538fef..aa7354088008 100644 --- a/block/bio.c +++ b/block/bio.c @@ -667,18 +667,19 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - if (bio_op(bio) == REQ_OP_DISCARD) - goto integrity_clone; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) { + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + break; + case REQ_OP_WRITE_SAME: bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - goto integrity_clone; + break; + default: + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + break; } - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - -integrity_clone: if (bio_integrity(bio_src)) { int ret; @@ -1788,7 +1789,7 @@ struct bio *bio_split(struct bio *bio, int sectors, * Discards need a mutable bio_vec to accommodate the payload * required by the DSM TRIM and UNMAP commands. */ - if (bio_op(bio) == REQ_OP_DISCARD) + if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) split = bio_clone_bioset(bio, gfp, bs); else split = bio_clone_fast(bio, gfp, bs); diff --git a/block/blk-merge.c b/block/blk-merge.c index 3eec75a9e91d..72627e3cf91e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -172,12 +172,18 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, struct bio *split, *res; unsigned nsegs; - if (bio_op(*bio) == REQ_OP_DISCARD) + switch (bio_op(*bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: split = blk_bio_discard_split(q, *bio, bs, &nsegs); - else if (bio_op(*bio) == REQ_OP_WRITE_SAME) + break; + case REQ_OP_WRITE_SAME: split = blk_bio_write_same_split(q, *bio, bs, &nsegs); - else + break; + default: split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); + break; + } /* physical segments can be figured out during splitting */ res = split ? split : *bio; @@ -213,7 +219,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, * This should probably be returning 0, but blk_add_request_payload() * (Christoph!!!!) */ - if (bio_op(bio) == REQ_OP_DISCARD) + if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) return 1; if (bio_op(bio) == REQ_OP_WRITE_SAME) @@ -385,7 +391,9 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, nsegs = 0; cluster = blk_queue_cluster(q); - if (bio_op(bio) == REQ_OP_DISCARD) { + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: /* * This is a hack - drivers should be neither modifying the * biovec, nor relying on bi_vcnt - but because of @@ -393,19 +401,16 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, * a payload we need to set up here (thank you Christoph) and * bi_vcnt is really the only way of telling if we need to. */ - - if (bio->bi_vcnt) - goto single_segment; - - return 0; - } - - if (bio_op(bio) == REQ_OP_WRITE_SAME) { -single_segment: + if (!bio->bi_vcnt) + return 0; + /* Fall through */ + case REQ_OP_WRITE_SAME: *sg = sglist; bvec = bio_iovec(bio); sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); return 1; + default: + break; } for_each_bio(bio) diff --git a/block/elevator.c b/block/elevator.c index 7096c22041e7..f7d973a56fd7 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -366,7 +366,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); - if ((req_op(rq) == REQ_OP_DISCARD) != (req_op(pos) == REQ_OP_DISCARD)) + if (req_op(rq) != req_op(pos)) break; if (rq_data_dir(rq) != rq_data_dir(pos)) break; diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 48a5dd740f3b..82503e6f04b3 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1726,6 +1726,7 @@ static u8 mmc_blk_prep_packed_list(struct mmc_queue *mq, struct request *req) break; if (req_op(next) == REQ_OP_DISCARD || + req_op(next) == REQ_OP_SECURE_ERASE || req_op(next) == REQ_OP_FLUSH) break; diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index bf14642a576a..29578e98603d 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -33,7 +33,8 @@ static int mmc_prep_request(struct request_queue *q, struct request *req) /* * We only like normal block requests and discards. */ - if (req->cmd_type != REQ_TYPE_FS && req_op(req) != REQ_OP_DISCARD) { + if (req->cmd_type != REQ_TYPE_FS && req_op(req) != REQ_OP_DISCARD && + req_op(req) != REQ_OP_SECURE_ERASE) { blk_dump_rq_flags(req, "MMC bad request"); return BLKPREP_KILL; } diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h index d62531124d54..fee5e1271465 100644 --- a/drivers/mmc/card/queue.h +++ b/drivers/mmc/card/queue.h @@ -4,7 +4,9 @@ static inline bool mmc_req_is_special(struct request *req) { return req && - (req_op(req) == REQ_OP_FLUSH || req_op(req) == REQ_OP_DISCARD); + (req_op(req) == REQ_OP_FLUSH || + req_op(req) == REQ_OP_DISCARD || + req_op(req) == REQ_OP_SECURE_ERASE); } struct request; diff --git a/include/linux/bio.h b/include/linux/bio.h index 59ffaa68b11b..23ddf4b46a9b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -71,7 +71,8 @@ static inline bool bio_has_data(struct bio *bio) { if (bio && bio->bi_iter.bi_size && - bio_op(bio) != REQ_OP_DISCARD) + bio_op(bio) != REQ_OP_DISCARD && + bio_op(bio) != REQ_OP_SECURE_ERASE) return true; return false; @@ -79,7 +80,9 @@ static inline bool bio_has_data(struct bio *bio) static inline bool bio_no_advance_iter(struct bio *bio) { - return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_WRITE_SAME; + return bio_op(bio) == REQ_OP_DISCARD || + bio_op(bio) == REQ_OP_SECURE_ERASE || + bio_op(bio) == REQ_OP_WRITE_SAME; } static inline bool bio_is_rw(struct bio *bio) @@ -199,6 +202,9 @@ static inline unsigned bio_segments(struct bio *bio) if (bio_op(bio) == REQ_OP_DISCARD) return 1; + if (bio_op(bio) == REQ_OP_SECURE_ERASE) + return 1; + if (bio_op(bio) == REQ_OP_WRITE_SAME) return 1; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2c210b6a7bcf..e79055c8b577 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -882,7 +882,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { - if (unlikely(op == REQ_OP_DISCARD)) + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> 9); if (unlikely(op == REQ_OP_WRITE_SAME)) @@ -913,7 +913,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, if (unlikely(rq->cmd_type != REQ_TYPE_FS)) return q->limits.max_hw_sectors; - if (!q->limits.chunk_sectors || (req_op(rq) == REQ_OP_DISCARD)) + if (!q->limits.chunk_sectors || + req_op(rq) == REQ_OP_DISCARD || + req_op(rq) == REQ_OP_SECURE_ERASE) return blk_queue_get_max_sectors(q, req_op(rq)); return min(blk_max_size_offset(q, offset), diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7598e6ca817a..dbafc5df03f3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(op_flags, META); what |= MASK_TC_BIT(op_flags, PREFLUSH); what |= MASK_TC_BIT(op_flags, FUA); - if (op == REQ_OP_DISCARD) + if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); -- cgit v1.3-8-gc7d7 From 1e12c4a9393b75a744aada2c8115434572698bc3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 11 Aug 2016 14:19:42 +0100 Subject: genirq: Correctly configure the trigger on chained interrupts Commit 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") moved the trigger configuration call from the irqdomain mapping to the interrupt being actually requested. This patch failed to handle the case where we configure a chained interrupt, which doesn't get requested through the usual path. In order to solve this, let's call __irq_set_trigger just before starting the cascade interrupt. Special care must be taken to make the flow handler stick, as the .irq_set_type method could have reset it (it doesn't know we're dealing with a chained interrupt). Based on an initial patch by Jon Hunter. Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") Reported-by: John Stultz Reported-by: Linus Walleij Tested-by: John Stultz Acked-by: Jon Hunter Signed-off-by: Marc Zyngier --- kernel/irq/chip.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b4c1bc7c9ca2..637389088b3f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -820,6 +820,17 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, desc->name = name; if (handle != handle_bad_irq && is_chained) { + /* + * We're about to start this interrupt immediately, + * hence the need to set the trigger configuration. + * But the .set_type callback may have overridden the + * flow handler, ignoring that we're dealing with a + * chained interrupt. Reset it immediately because we + * do know better. + */ + __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data)); + desc->handle_irq = handle; + irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); -- cgit v1.3-8-gc7d7 From 568ac888215c7fb2fabe8ea739b00ec3c1f5d440 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 10 Aug 2016 15:43:06 -0400 Subject: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork cgroup_threadgroup_rwsem is acquired in read mode during process exit and fork. It is also grabbed in write mode during __cgroups_proc_write(). I've recently run into a scenario with lots of memory pressure and OOM and I am beginning to see systemd __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 percpu_down_write+0x114/0x170 __cgroup_procs_write.isra.12+0xb8/0x3c0 cgroup_file_write+0x74/0x1a0 kernfs_fop_write+0x188/0x200 __vfs_write+0x6c/0xe0 vfs_write+0xc0/0x230 SyS_write+0x6c/0x110 system_call+0x38/0xb4 This thread is waiting on the reader of cgroup_threadgroup_rwsem to exit. The reader itself is under memory pressure and has gone into reclaim after fork. There are times the reader also ends up waiting on oom_lock as well. __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 jbd2_log_wait_commit+0xd4/0x180 ext4_evict_inode+0x88/0x5c0 evict+0xf8/0x2a0 dispose_list+0x50/0x80 prune_icache_sb+0x6c/0x90 super_cache_scan+0x190/0x210 shrink_slab.part.15+0x22c/0x4c0 shrink_zone+0x288/0x3c0 do_try_to_free_pages+0x1dc/0x590 try_to_free_pages+0xdc/0x260 __alloc_pages_nodemask+0x72c/0xc90 alloc_pages_current+0xb4/0x1a0 page_table_alloc+0xc0/0x170 __pte_alloc+0x58/0x1f0 copy_page_range+0x4ec/0x950 copy_process.isra.5+0x15a0/0x1870 _do_fork+0xa8/0x4b0 ppc_clone+0x8/0xc In the meanwhile, all processes exiting/forking are blocked almost stalling the system. This patch moves the threadgroup_change_begin from before cgroup_fork() to just before cgroup_canfork(). There is no nee to worry about threadgroup changes till the task is actually added to the threadgroup. This avoids having to call reclaim with cgroup_threadgroup_rwsem held. tj: Subject and description edits. Signed-off-by: Balbir Singh Acked-by: Zefan Li Cc: Oleg Nesterov Cc: Andrew Morton Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Tejun Heo --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..aaf782327bf3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1404,7 +1404,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1556,6 +1555,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -1656,6 +1656,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -1688,7 +1689,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); -- cgit v1.3-8-gc7d7 From 6c4687cc17a788a6dd8de3e27dbeabb7cbd3e066 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 17 Aug 2016 17:36:29 +0200 Subject: uprobes: Fix the memcg accounting __replace_page() wronlgy calls mem_cgroup_cancel_charge() in "success" path, it should only do this if page_check_address() fails. This means that every enable/disable leads to unbalanced mem_cgroup_uncharge() from put_page(old_page), it is trivial to underflow the page_counter->count and trigger OOM. Reported-and-tested-by: Brenden Blanco Signed-off-by: Oleg Nesterov Reviewed-by: Johannes Weiner Acked-by: Michal Hocko Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: stable@vger.kernel.org # 3.17+ Fixes: 00501b531c47 ("mm: memcontrol: rewrite charge API") Link: http://lkml.kernel.org/r/20160817153629.GB29724@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b7a525ab2083..8c50276b60d1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + if (!ptep) { + mem_cgroup_cancel_charge(kpage, memcg, false); goto unlock; + } get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr, false); @@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; -- cgit v1.3-8-gc7d7 From cca2094605efe6ccf43ff2876dd5bccc799202d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Aug 2016 13:33:26 +0200 Subject: perf/core: Fix event_function_local() Vincent reported triggering the WARN_ON_ONCE() in event_function_local(). While thinking through cases I noticed that by using event_function() directly, we miss the inactive case usually handled by event_function_call(). Therefore construct a blend of event_function_call() and event_function() that handles the cases relevant to event_function_local(). Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org # 4.5+ Fixes: fae3fde65138 ("perf: Collapse and fix event_function_call() users") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 60 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1903b8f3a705..6e454bfd514f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -242,18 +242,6 @@ unlock: return ret; } -static void event_function_local(struct perf_event *event, event_f func, void *data) -{ - struct event_function_struct efs = { - .event = event, - .func = func, - .data = data, - }; - - int ret = event_function(&efs); - WARN_ON_ONCE(ret); -} - static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; @@ -303,6 +291,54 @@ again: raw_spin_unlock_irq(&ctx->lock); } +/* + * Similar to event_function_call() + event_function(), but hard assumes IRQs + * are already disabled and we're on the right CPU. + */ +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct task_struct *task = READ_ONCE(ctx->task); + struct perf_event_context *task_ctx = NULL; + + WARN_ON_ONCE(!irqs_disabled()); + + if (task) { + if (task == TASK_TOMBSTONE) + return; + + task_ctx = ctx; + } + + perf_ctx_lock(cpuctx, task_ctx); + + task = ctx->task; + if (task == TASK_TOMBSTONE) + goto unlock; + + if (task) { + /* + * We must be either inactive or active and the right task, + * otherwise we're screwed, since we cannot IPI to somewhere + * else. + */ + if (ctx->is_active) { + if (WARN_ON_ONCE(task != current)) + goto unlock; + + if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) + goto unlock; + } + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + func(event, cpuctx, ctx, data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); +} + #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ -- cgit v1.3-8-gc7d7 From 4059ffd09d694f704e18a4baf97fc0016c32e9ad Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 18 Jul 2016 10:43:05 -0600 Subject: perf/core: Fix file name handling for start/stop filters Binary file names have to be supplied for both range and start/stop filters but the current code only processes the filename if an address range filter is specified. This code adds processing of the filename for start/stop filters. Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1468860187-318-2-git-send-email-mathieu.poirier@linaro.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e454bfd514f..52780ef941d3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7972,8 +7972,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, goto fail; } - if (token == IF_SRC_FILE) { - filename = match_strdup(&args[2]); + if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { + int fpos = filter->range ? 2 : 1; + + filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; -- cgit v1.3-8-gc7d7 From 12b40a2393719a37ff86a0b43bece6d28a75cbfc Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 18 Jul 2016 10:43:06 -0600 Subject: perf/core: Update filters only on executable mmap Function perf_event_mmap() is called by the MM subsystem each time part of a binary is loaded in memory. There can be several mapping for a binary, many times unrelated to the code section. Each time a section of a binary is mapped address filters are updated, event when the map doesn't pertain to the code section. The end result is that filters are configured based on the last map event that was received rather than the last mapping of the code segment. For example if we have an executable 'main' that calls library 'libcstest.so.1.0', and that we want to collect traces on code that is in that library. The perf cmd line for this scenario would be: perf record -e cs_etm// --filter 'filter 0x72c/0x40@/opt/lib/libcstest.so.1.0' --per-thread ./main Resulting in binaries being mapped this way: root@linaro-nano:~# cat /proc/1950/maps 00400000-00401000 r-xp 00000000 08:02 33169 /home/linaro/main 00410000-00411000 r--p 00000000 08:02 33169 /home/linaro/main 00411000-00412000 rw-p 00001000 08:02 33169 /home/linaro/main 7fa2464000-7fa2474000 rw-p 00000000 00:00 0 7fa2474000-7fa25a4000 r-xp 00000000 08:02 543 /lib/aarch64-linux-gnu/libc-2.21.so 7fa25a4000-7fa25b3000 ---p 00130000 08:02 543 /lib/aarch64-linux-gnu/libc-2.21.so 7fa25b3000-7fa25b7000 r--p 0012f000 08:02 543 /lib/aarch64-linux-gnu/libc-2.21.so 7fa25b7000-7fa25b9000 rw-p 00133000 08:02 543 /lib/aarch64-linux-gnu/libc-2.21.so 7fa25b9000-7fa25bd000 rw-p 00000000 00:00 0 7fa25bd000-7fa25be000 r-xp 00000000 08:02 38308 /opt/lib/libcstest.so.1.0 7fa25be000-7fa25cd000 ---p 00001000 08:02 38308 /opt/lib/libcstest.so.1.0 7fa25cd000-7fa25ce000 r--p 00000000 08:02 38308 /opt/lib/libcstest.so.1.0 7fa25ce000-7fa25cf000 rw-p 00001000 08:02 38308 /opt/lib/libcstest.so.1.0 7fa25cf000-7fa25eb000 r-xp 00000000 08:02 574 /lib/aarch64-linux-gnu/ld-2.21.so 7fa25ef000-7fa25f2000 rw-p 00000000 00:00 0 7fa25f7000-7fa25f9000 rw-p 00000000 00:00 0 7fa25f9000-7fa25fa000 r--p 00000000 00:00 0 [vvar] 7fa25fa000-7fa25fb000 r-xp 00000000 00:00 0 [vdso] 7fa25fb000-7fa25fc000 r--p 0001c000 08:02 574 /lib/aarch64-linux-gnu/ld-2.21.so 7fa25fc000-7fa25fe000 rw-p 0001d000 08:02 574 /lib/aarch64-linux-gnu/ld-2.21.so 7ff2ea8000-7ff2ec9000 rw-p 00000000 00:00 0 [stack] root@linaro-nano:~# Before 'main()' can execute 'libcstest.so.1.0' has to be loaded in memory. Once that has been done perf_event_mmap() has been called 4 times, with the last map starting at address 0x7fa25ce000 and the address filter configured to start filtering when the IP has passed over address 0x0x7fa25ce72c (0x7fa25ce000 + 0x72c). But that is wrong since the code segment for library 'libcstest.so.1.0' as been mapped at 0x7fa25bd000, resulting in traces not being collected. This patch corrects the situation by requesting that address filters be updated only if the mapped event is for a code segment. Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1468860187-318-3-git-send-email-mathieu.poirier@linaro.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 52780ef941d3..9a030a96bc1f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6689,6 +6689,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) struct perf_event_context *ctx; int ctxn; + /* + * Data tracing isn't supported yet and as such there is no need + * to keep track of anything that isn't related to executable code: + */ + if (!(vma->vm_flags & VM_EXEC)) + return; + rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); -- cgit v1.3-8-gc7d7 From 99f5bc9bfa9094e7c264a8e09f9507b391a3d1d1 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 18 Jul 2016 10:43:07 -0600 Subject: perf/core: Enable mapping of the stop filters At this time the perf_addr_filter_needs_mmap() function will _not_ return true on a user space 'stop' filter. But stop filters need exactly the same kind of mapping that range and start filters get. Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1468860187-318-4-git-send-email-mathieu.poirier@linaro.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9a030a96bc1f..a5fc5c8cdfb0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6619,15 +6619,6 @@ got_name: kfree(buf); } -/* - * Whether this @filter depends on a dynamic object which is not loaded - * yet or its load addresses are not known. - */ -static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) -{ - return filter->filter && filter->inode; -} - /* * Check whether inode and address range match filter criteria. */ @@ -7848,7 +7839,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) list_for_each_entry(filter, &ifh->list, entry) { event->addr_filters_offs[count] = 0; - if (perf_addr_filter_needs_mmap(filter)) + /* + * Adjust base offset if the filter is associated to a binary + * that needs to be mapped: + */ + if (filter->inode) event->addr_filters_offs[count] = perf_addr_filter_apply(filter, mm); -- cgit v1.3-8-gc7d7 From 71e7bc2bab77e64882c031c2af943c3256c1adb0 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 17 Aug 2016 13:55:04 -0700 Subject: perf/core: Check return value of the perf_event_read() IPI The call to smp_call_function_single in perf_event_read() may fail if an invalid or not online CPU index is passed. Warn user if such bug is present and return error. Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1471467307-61171-2-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a5fc5c8cdfb0..5650f5317e0c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3549,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - smp_call_function_single(event->oncpu, - __perf_event_read, &data, 1); - ret = data.ret; + ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + /* The event must have been read from an online CPU: */ + WARN_ON_ONCE(ret); + ret = ret ? : data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; -- cgit v1.3-8-gc7d7 From 173be9a14f7b2e901cf77c18b1aafd4d672e9d9e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Aug 2016 18:38:42 +0200 Subject: sched/cputime: Fix NO_HZ_FULL getrusage() monotonicity regression Mike reports: Roughly 10% of the time, ltp testcase getrusage04 fails: getrusage04 0 TINFO : Expected timers granularity is 4000 us getrusage04 0 TINFO : Using 1 as multiply factor for max [us]time increment (1000+4000us)! getrusage04 0 TINFO : utime: 0us; stime: 179us getrusage04 0 TINFO : utime: 3751us; stime: 0us getrusage04 1 TFAIL : getrusage04.c:133: stime increased > 5000us: And tracked it down to the case where the task simply doesn't get _any_ [us]time ticks. Update the code to assume all rtime is utime when we lack information, thus ensuring a task that elides the tick gets time accounted. Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Fredrik Markstrom Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Cc: Rik van Riel Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wanpeng Li Cc: stable@vger.kernel.org # 4.3+ Fixes: 9d7fb0427648 ("sched/cputime: Guarantee stime + utime == rtime") Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9858266fb0b3..2ee83b200504 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -614,19 +614,25 @@ static void cputime_adjust(struct task_cputime *curr, stime = curr->stime; utime = curr->utime; - if (utime == 0) { - stime = rtime; + /* + * If either stime or both stime and utime are 0, assume all runtime is + * userspace. Once a task gets some ticks, the monotonicy code at + * 'update' will ensure things converge to the observed ratio. + */ + if (stime == 0) { + utime = rtime; goto update; } - if (stime == 0) { - utime = rtime; + if (utime == 0) { + stime = rtime; goto update; } stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)(stime + utime)); +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. @@ -649,7 +655,6 @@ static void cputime_adjust(struct task_cputime *curr, stime = rtime - utime; } -update: prev->stime = stime; prev->utime = utime; out: -- cgit v1.3-8-gc7d7 From 03cbc732639ddcad15218c4b2046d255851ff1e3 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 17 Aug 2016 10:05:46 +0800 Subject: sched/cputime: Resync steal time when guest & host lose sync Commit: 57430218317e ("sched/cputime: Count actually elapsed irq & softirq time") ... fixed a bug but also triggered a regression: On an i5 laptop, 4 pCPUs, 4vCPUs for one full dynticks guest, there are four CPU hog processes(for loop) running in the guest, I hot-unplug the pCPUs on host one by one until there is only one left, then observe CPU utilization via 'top' in the guest, it shows: 100% st for cpu0(housekeeping) 75% st for other CPUs (nohz full mode) However, w/o this commit it shows the correct 75% for all four CPUs. When a guest is interrupted for a longer amount of time, missed clock ticks are not redelivered later. Because of that, we should not limit the amount of steal time accounted to the amount of time that the calling functions think have passed. However, the interval returned by account_other_time() is NOT rounded down to the nearest jiffy, while the base interval in get_vtime_delta() it is subtracted from is, so the max cputime limit is required to avoid underflow. This patch fixes the regression by limiting the account_other_time() from get_vtime_delta() to avoid underflow, and lets the other three call sites (in account_other_time() and steal_account_process_time()) account however much steal time the host told us elapsed. Suggested-by: Rik van Riel Suggested-by: Paolo Bonzini Signed-off-by: Wanpeng Li Reviewed-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/1471399546-4069-1-git-send-email-wanpeng.li@hotmail.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2ee83b200504..a846cf89eb96 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } +/* + * When a guest is interrupted for a longer amount of time, missed clock + * ticks are not redelivered later. Due to that, this function may on + * occasion account more time than the calling functions think elapsed. + */ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) { #ifdef CONFIG_PARAVIRT @@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * idle, or potentially user or system time. Due to rounding, * other time can exceed ticks occasionally. */ - other = account_other_time(cputime); + other = account_other_time(ULONG_MAX); if (other >= cputime) return; cputime -= other; @@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick) } cputime = cputime_one_jiffy; - steal = steal_account_process_time(cputime); + steal = steal_account_process_time(ULONG_MAX); if (steal >= cputime) return; @@ -516,7 +521,7 @@ void account_idle_ticks(unsigned long ticks) } cputime = jiffies_to_cputime(ticks); - steal = steal_account_process_time(cputime); + steal = steal_account_process_time(ULONG_MAX); if (steal >= cputime) return; @@ -699,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) unsigned long now = READ_ONCE(jiffies); cputime_t delta, other; + /* + * Unlike tick based timing, vtime based timing never has lost + * ticks, and no need for steal time accounting to make up for + * lost ticks. Vtime accounts a rounded version of actual + * elapsed time. Limit account_other_time to prevent rounding + * errors from causing elapsed vtime to go negative. + */ delta = jiffies_to_cputime(now - tsk->vtime_snap); other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); -- cgit v1.3-8-gc7d7 From 4396f46c8c628329bd35ee4b84140b8b001a11eb Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Mon, 22 Aug 2016 16:21:52 +0800 Subject: genirq: Fix potential memleak when failing to get irq pm Obviously we should free action here if irq_chip_pm_get failed. Fixes: be45beb2df69: "genirq: Add runtime power management support for IRQ chips" Signed-off-by: Shawn Lin Cc: Jon Hunter Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1471854112-13006-1-git-send-email-shawn.lin@rock-chips.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 73a2b786b5e9..9530fcd27704 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) + if (retval < 0) { + kfree(action); return retval; + } chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); @@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) + if (retval < 0) { + kfree(action); return retval; + } chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); -- cgit v1.3-8-gc7d7 From 3ee0ce2a54dff07d09440723594df89bc1a12e79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Aug 2016 07:06:45 -0700 Subject: genirq/affinity: Use get/put_online_cpus around cpumask operations Without locking out CPU mask operations we might end up with an inconsistent view of the cpumask in the function. Fixes: 5e385a6ef31f: "genirq: Add a helper to spread an affinity mask for MSI/MSI-X vectors" Signed-off-by: Christoph Hellwig Link: http://lkml.kernel.org/r/1470924405-25728-1-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f68959341c0f..32f6cfcff212 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) return NULL; } + get_online_cpus(); if (max_vecs >= num_online_cpus()) { cpumask_copy(affinity_mask, cpu_online_mask); *nr_vecs = num_online_cpus(); @@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) } *nr_vecs = vecs; } + put_online_cpus(); return affinity_mask; } -- cgit v1.3-8-gc7d7 From 27727df240c7cc84f2ba6047c6f18d5addfd25ef Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 23 Aug 2016 16:08:21 -0700 Subject: timekeeping: Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING When I added some extra sanity checking in timekeeping_get_ns() under CONFIG_DEBUG_TIMEKEEPING, I missed that the NMI safe __ktime_get_fast_ns() method was using timekeeping_get_ns(). Thus the locking added to the debug checks broke the NMI-safety of __ktime_get_fast_ns(). This patch open-codes the timekeeping_get_ns() logic for __ktime_get_fast_ns(), so can avoid any deadlocks in NMI. Fixes: 4ca22c2648f9 "timekeeping: Add warnings when overflows or underflows are observed" Reported-by: Steven Rostedt Reported-by: Peter Zijlstra Signed-off-by: John Stultz Cc: stable Link: http://lkml.kernel.org/r/1471993702-29148-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3b65746c7f15..e07fb093f819 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + now = ktime_to_ns(tkr->base); + + now += clocksource_delta(tkr->read(tkr->clock), + tkr->cycle_last, tkr->mask); } while (read_seqcount_retry(&tkf->seq, seq)); return now; -- cgit v1.3-8-gc7d7 From a4f8f6667f099036c88f231dcad4cf233652c824 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 23 Aug 2016 16:08:22 -0700 Subject: timekeeping: Cap array access in timekeeping_debug It was reported that hibernation could fail on the 2nd attempt, where the system hangs at hibernate() -> syscore_resume() -> i8237A_resume() -> claim_dma_lock(), because the lock has already been taken. However there is actually no other process would like to grab this lock on that problematic platform. Further investigation showed that the problem is triggered by setting /sys/power/pm_trace to 1 before the 1st hibernation. Since once pm_trace is enabled, the rtc becomes unmeaningful after suspend, and meanwhile some BIOSes would like to adjust the 'invalid' RTC (e.g, smaller than 1970) to the release date of that motherboard during POST stage, thus after resumed, it may seem that the system had a significant long sleep time which is a completely meaningless value. Then in timekeeping_resume -> tk_debug_account_sleep_time, if the bit31 of the sleep time happened to be set to 1, fls() returns 32 and we add 1 to sleep_time_bin[32], which causes an out of bounds array access and therefor memory being overwritten. As depicted by System.map: 0xffffffff81c9d080 b sleep_time_bin 0xffffffff81c9d100 B dma_spin_lock the dma_spin_lock.val is set to 1, which caused this problem. This patch adds a sanity check in tk_debug_account_sleep_time() to ensure we don't index past the sleep_time_bin array. [jstultz: Problem diagnosed and original patch by Chen Yu, I've solved the issue slightly differently, but borrowed his excelent explanation of the issue here.] Fixes: 5c83545f24ab "power: Add option to log time spent in suspend" Reported-by: Janek Kozicki Reported-by: Chen Yu Signed-off-by: John Stultz Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Xunlei Pang Cc: "Rafael J. Wysocki" Cc: stable Cc: Zhang Rui Link: http://lkml.kernel.org/r/1471993702-29148-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping_debug.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd65236712..107310a6f36f 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -23,7 +23,9 @@ #include "timekeeping_internal.h" -static unsigned int sleep_time_bin[32] = {0}; +#define NUM_BINS 32 + +static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) { @@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { - sleep_time_bin[fls(t->tv_sec)]++; + /* Cap bin index so we don't overflow the array */ + int bin = min(fls(t->tv_sec), NUM_BINS-1); + + sleep_time_bin[bin]++; } -- cgit v1.3-8-gc7d7 From 8b6a3fe8fab97716990a3abde1a01fb5a34552a3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 24 Aug 2016 10:07:14 +0100 Subject: perf/core: Use this_cpu_ptr() when stopping AUX events When tearing down an AUX buf for an event via perf_mmap_close(), __perf_event_output_stop() is called on the event's CPU to ensure that trace generation is halted before the process of unmapping and freeing the buffer pages begins. The callback is performed via cpu_function_call(), which ensures that it runs with interrupts disabled and is therefore not preemptible. Unfortunately, the current code grabs the per-cpu context pointer using get_cpu_ptr(), which unnecessarily disables preemption and doesn't pair the call with put_cpu_ptr(), leading to a preempt_count() imbalance and a BUG when freeing the AUX buffer later on: WARNING: CPU: 1 PID: 2249 at kernel/events/ring_buffer.c:539 __rb_free_aux+0x10c/0x120 Modules linked in: [...] Call Trace: [] dump_stack+0x4f/0x72 [] __warn+0xc6/0xe0 [] warn_slowpath_null+0x18/0x20 [] __rb_free_aux+0x10c/0x120 [] rb_free_aux+0x13/0x20 [] perf_mmap_close+0x29e/0x2f0 [] ? perf_iterate_ctx+0xe0/0xe0 [] remove_vma+0x25/0x60 [] exit_mmap+0x106/0x140 [] mmput+0x1c/0xd0 [] do_exit+0x253/0xbf0 [] do_group_exit+0x3e/0xb0 [] get_signal+0x249/0x640 [] do_signal+0x23/0x640 [] ? _raw_write_unlock_irq+0x12/0x30 [] ? _raw_spin_unlock_irq+0x9/0x10 [] ? __schedule+0x2c6/0x710 [] exit_to_usermode_loop+0x74/0x90 [] prepare_exit_to_usermode+0x26/0x30 [] retint_user+0x8/0x10 This patch uses this_cpu_ptr() instead of get_cpu_ptr(), since preemption is already disabled by the caller. Signed-off-by: Will Deacon Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 95ff4ca26c49 ("perf/core: Free AUX pages in unmap path") Link: http://lkml.kernel.org/r/20160824091905.GA16944@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5650f5317e0c..3cfabdf7b942 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6166,7 +6166,7 @@ static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct pmu *pmu = event->pmu; - struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, }; -- cgit v1.3-8-gc7d7 From e7d316a02f683864a12389f8808570e37fb90aa3 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 25 Aug 2016 15:16:51 -0700 Subject: sysctl: handle error writing UINT_MAX to u32 fields We have scripts which write to certain fields on 3.18 kernels but this seems to be failing on 4.4 kernels. An entry which we write to here is xfrm_aevent_rseqth which is u32. echo 4294967295 > /proc/sys/net/core/xfrm_aevent_rseqth Commit 230633d109e3 ("kernel/sysctl.c: detect overflows when converting to int") prevented writing to sysctl entries when integer overflow occurs. However, this does not apply to unsigned integers. Heinrich suggested that we introduce a new option to handle 64 bit limits and set min as 0 and max as UINT_MAX. This might not work as it leads to issues similar to __do_proc_doulongvec_minmax. Alternatively, we would need to change the datatype of the entry to 64 bit. static int __do_proc_doulongvec_minmax(void *data, struct ctl_table { i = (unsigned long *) data; //This cast is causing to read beyond the size of data (u32) vleft = table->maxlen / sizeof(unsigned long); //vleft is 0 because maxlen is sizeof(u32) which is lesser than sizeof(unsigned long) on x86_64. Introduce a new proc handler proc_douintvec. Individual proc entries will need to be updated to use the new handler. [akpm@linux-foundation.org: coding-style fixes] Fixes: 230633d109e3 ("kernel/sysctl.c:detect overflows when converting to int") Link: http://lkml.kernel.org/r/1471479806-5252-1-git-send-email-subashab@codeaurora.org Signed-off-by: Subash Abhinov Kasiviswanathan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 2 ++ kernel/sysctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 697e160c78d0..a4f7203a9017 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -42,6 +42,8 @@ extern int proc_dostring(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_douintvec(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int proc_dointvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b43d0b27c1fe..a13bbdaab47d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2140,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, @@ -2259,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - NULL,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -2858,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2903,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, * exception granted :-) */ EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); -- cgit v1.3-8-gc7d7 From ae6c33ba6e37eea3012fe2640b22400ef3f2d0f3 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Thu, 25 Aug 2016 15:17:00 -0700 Subject: printk: fix parsing of "brl=" option Commit bbeddf52adc1 ("printk: move braille console support into separate braille.[ch] files") moved the parsing of braille-related options into _braille_console_setup(), changing the type of variable str from char* to char**. In this commit, memcmp(str, "brl,", 4) was correctly updated to memcmp(*str, "brl,", 4) but not memcmp(str, "brl=", 4). Update the code to make "brl=" option work again and replace memcmp() with strncmp() to make the compiler able to detect such an issue. Fixes: bbeddf52adc1 ("printk: move braille console support into separate braille.[ch] files") Link: http://lkml.kernel.org/r/20160823165700.28952-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/braille.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -9,10 +9,10 @@ char *_braille_console_setup(char **str, char **brl_options) { - if (!memcmp(*str, "brl,", 4)) { + if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; *str += 4; - } else if (!memcmp(str, "brl=", 4)) { + } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); if (!*str) -- cgit v1.3-8-gc7d7 From 485a252a5559b45d7df04c819ec91177c62c270b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Aug 2016 16:28:09 -0700 Subject: seccomp: Fix tracer exit notifications during fatal signals This fixes a ptrace vs fatal pending signals bug as manifested in seccomp now that seccomp was reordered to happen after ptrace. The short version is that seccomp should not attempt to call do_exit() while fatal signals are pending under a tracer. The existing code was trying to be as defensively paranoid as possible, but it now ends up confusing ptrace. Instead, the syscall can just be skipped (which solves the original concern that the do_exit() was addressing) and normal signal handling, tracer notification, and process death can happen. Paraphrasing from the original bug report: If a tracee task is in a PTRACE_EVENT_SECCOMP trap, or has been resumed after such a trap but not yet been scheduled, and another task in the thread-group calls exit_group(), then the tracee task exits without the ptracer receiving a PTRACE_EVENT_EXIT notification. Test case here: https://gist.github.com/khuey/3c43ac247c72cef8c956ca73281c9be7 The bug happens because when __seccomp_filter() detects fatal_signal_pending(), it calls do_exit() without dequeuing the fatal signal. When do_exit() sends the PTRACE_EVENT_EXIT notification and that task is descheduled, __schedule() notices that there is a fatal signal pending and changes its state from TASK_TRACED to TASK_RUNNING. That prevents the ptracer's waitpid() from returning the ptrace event. A more detailed analysis is here: https://github.com/mozilla/rr/issues/1762#issuecomment-237396255. Reported-by: Robert O'Callahan Reported-by: Kyle Huey Tested-by: Kyle Huey Fixes: 93e35efb8de4 ("x86/ptrace: run seccomp after ptrace") Signed-off-by: Kees Cook Acked-by: Oleg Nesterov Acked-by: James Morris --- kernel/seccomp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ef6c6c3f9d8a..0db7c8a2afe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -605,12 +605,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, ptrace_event(PTRACE_EVENT_SECCOMP, data); /* * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. + * notification may silently skip tracer notification, + * which could leave us with a potentially unmodified + * syscall that the tracer would have liked to have + * changed. Since the process is about to die, we just + * force the syscall to be skipped and let the signal + * kill the process and correctly handle any tracer exit + * notifications. */ if (fatal_signal_pending(current)) - do_exit(SIGSYS); + goto skip; /* Check if the tracer forced the syscall to be skipped. */ this_syscall = syscall_get_nr(current, task_pt_regs(current)); if (this_syscall < 0) -- cgit v1.3-8-gc7d7 From cd81a9170e69e018bbaba547c1fd85a585f5697a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:38 +0200 Subject: mm: introduce get_task_exe_file For more convenient access if one has a pointer to the task. As a minor nit take advantage of the fact that only task lock + rcu are needed to safely grab ->exe_file. This saves mm refcount dance. Use the helper in proc_exe_link. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Cc: # 4.3.x Signed-off-by: Paul Moore --- fs/proc/base.c | 7 +------ include/linux/mm.h | 1 + kernel/fork.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0d163a84082d..da8b1943ba04 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1552,18 +1552,13 @@ static const struct file_operations proc_pid_set_comm_operations = { static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; - struct mm_struct *mm; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; - mm = get_task_mm(task); + exe_file = get_task_exe_file(task); put_task_struct(task); - if (!mm) - return -ENOENT; - exe_file = get_mm_exe_file(mm); - mmput(mm); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f468e0d2534..004c73a988b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1987,6 +1987,7 @@ extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); +extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..42451aeb245f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -773,6 +773,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) } EXPORT_SYMBOL(get_mm_exe_file); +/** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). + */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + /** * get_task_mm - acquire a reference to the task's mm * -- cgit v1.3-8-gc7d7 From 5efc244346f9f338765da3d592f7947b0afdc4b5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:39 +0200 Subject: audit: fix exe_file access in audit_exe_compare Prior to the change the function would blindly deference mm, exe_file and exe_file->f_inode, each of which could have been NULL or freed. Use get_task_exe_file to safely obtain stable exe_file. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Cc: # 4.3.x Signed-off-by: Paul Moore --- kernel/audit_watch.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3cf1c5978d39..4846691957da 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } -- cgit v1.3-8-gc7d7 From 070c43eea5043e950daa423707ae3c77e2f48edb Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 1 Sep 2016 16:14:44 -0700 Subject: kexec: fix double-free when failing to relocate the purgatory If kexec_apply_relocations fails, kexec_load_purgatory frees pi->sechdrs and pi->purgatory_buf. This is redundant, because in case of error kimage_file_prepare_segments calls kimage_file_post_load_cleanup, which will also free those buffers. This causes two warnings like the following, one for pi->sechdrs and the other for pi->purgatory_buf: kexec-bzImage64: Loading purgatory failed ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2119 at mm/vmalloc.c:1490 __vunmap+0xc1/0xd0 Trying to vfree() nonexistent vm area (ffffc90000e91000) Modules linked in: CPU: 1 PID: 2119 Comm: kexec Not tainted 4.8.0-rc3+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x4d/0x65 __warn+0xcb/0xf0 warn_slowpath_fmt+0x4f/0x60 ? find_vmap_area+0x19/0x70 ? kimage_file_post_load_cleanup+0x47/0xb0 __vunmap+0xc1/0xd0 vfree+0x2e/0x70 kimage_file_post_load_cleanup+0x5e/0xb0 SyS_kexec_file_load+0x448/0x680 ? putname+0x54/0x60 ? do_sys_open+0x190/0x1f0 entry_SYSCALL_64_fastpath+0x13/0x8f ---[ end trace 158bb74f5950ca2b ]--- Fix by setting pi->sechdrs an pi->purgatory_buf to NULL, since vfree won't try to free a NULL pointer. Link: http://lkml.kernel.org/r/1472083546-23683-1-git-send-email-bauerman@linux.vnet.ibm.com Signed-off-by: Thiago Jung Bauermann Acked-by: Baoquan He Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Dave Young Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 503bc2d348e5..037c321c5618 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } -- cgit v1.3-8-gc7d7 From 236dec051078a8691950f56949612b4b74107e48 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 1 Sep 2016 16:14:47 -0700 Subject: kconfig: tinyconfig: provide whole choice blocks to avoid warnings Using "make tinyconfig" produces a couple of annoying warnings that show up for build test machines all the time: .config:966:warning: override: NOHIGHMEM changes choice state .config:965:warning: override: SLOB changes choice state .config:963:warning: override: KERNEL_XZ changes choice state .config:962:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:933:warning: override: SLOB changes choice state .config:930:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:870:warning: override: SLOB changes choice state .config:868:warning: override: KERNEL_XZ changes choice state .config:867:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state I've made a previous attempt at fixing them and we discussed a number of alternatives. I tried changing the Makefile to use "merge_config.sh -n $(fragment-list)" but couldn't get that to work properly. This is yet another approach, based on the observation that we do want to see a warning for conflicting 'choice' options, and that we can simply make them non-conflicting by listing all other options as disabled. This is a trivial patch that we can apply independent of plans for other changes. Link: http://lkml.kernel.org/r/20160829214952.1334674-2-arnd@arndb.de Link: https://storage.kernelci.org/mainline/v4.7-rc6/x86-tinyconfig/build.log https://patchwork.kernel.org/patch/9212749/ Signed-off-by: Arnd Bergmann Reviewed-by: Josh Triplett Reviewed-by: Masahiro Yamada Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/configs/tiny.config | 2 ++ kernel/configs/tiny.config | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4e2ecfa23c15..4b429df40d7a 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1 +1,3 @@ CONFIG_NOHIGHMEM=y +# CONFIG_HIGHMEM4G is not set +# CONFIG_HIGHMEM64G is not set diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y -- cgit v1.3-8-gc7d7 From 19feeff18bbfde659baa58c2346f15a24d7c405e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 1 Sep 2016 16:15:04 -0700 Subject: printk/nmi: avoid direct printk()-s from __printk_nmi_flush() __printk_nmi_flush() can be called from nmi_panic(), therefore it has to test whether it's executed in NMI context and thus must route the messages through deferred printk() or via direct printk(). This is to avoid potential deadlocks, as described in commit cf9b1106c81c ("printk/nmi: flush NMI messages on the system panic"). However there remain two places where __printk_nmi_flush() does unconditional direct printk() calls: - pr_err("printk_nmi_flush: internal error ...") - pr_cont("\n") Factor out print_nmi_seq_line() parts into a new printk_nmi_flush_line() function, which takes care of in_nmi(), and use it in __printk_nmi_flush() for printing and error-reporting. Link: http://lkml.kernel.org/r/20160830161354.581-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Cc: Petr Mladek Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index b69eb8a2876f..16bab471c7e2 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -99,26 +99,32 @@ again: return add; } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +static void printk_nmi_flush_line(const char *text, int len) { - const char *buf = s->buffer + start; - /* * The buffers are flushed in NMI only on panic. The messages must * go only into the ring buffer at this stage. Consoles will get * explicitly called later when a crashdump is not generated. */ if (in_nmi()) - printk_deferred("%.*s", (end - start) + 1, buf); + printk_deferred("%.*s", len, text); else - printk("%.*s", (end - start) + 1, buf); + printk("%.*s", len, text); } +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, + int start, int end) +{ + const char *buf = s->buffer + start; + + printk_nmi_flush_line(buf, (end - start) + 1); +} + /* * Flush data from the associated per_CPU buffer. The function * can be called either via IRQ work or independently. @@ -150,9 +156,11 @@ more: * the buffer an unexpected way. If we printed something then * @len must only increase. */ - if (i && i >= len) - pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", - i, len); + if (i && i >= len) { + const char *msg = "printk_nmi_flush: internal error\n"; + + printk_nmi_flush_line(msg, strlen(msg)); + } if (!len) goto out; /* Someone else has already flushed the buffer. */ @@ -166,14 +174,14 @@ more: /* Print line by line. */ for (; i < size; i++) { if (s->buffer[i] == '\n') { - print_nmi_seq_line(s, last_i, i); + printk_nmi_flush_seq_line(s, last_i, i); last_i = i + 1; } } /* Check if there was a partial line. */ if (last_i < size) { - print_nmi_seq_line(s, last_i, size - 1); - pr_cont("\n"); + printk_nmi_flush_seq_line(s, last_i, size - 1); + printk_nmi_flush_line("\n", strlen("\n")); } /* -- cgit v1.3-8-gc7d7 From c11600e4fed67ae4cd6a8096936afd445410e8ed Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 1 Sep 2016 16:15:07 -0700 Subject: mm, mempolicy: task->mempolicy must be NULL before dropping final reference KASAN allocates memory from the page allocator as part of kmem_cache_free(), and that can reference current->mempolicy through any number of allocation functions. It needs to be NULL'd out before the final reference is dropped to prevent a use-after-free bug: BUG: KASAN: use-after-free in alloc_pages_current+0x363/0x370 at addr ffff88010b48102c CPU: 0 PID: 15425 Comm: trinity-c2 Not tainted 4.8.0-rc2+ #140 ... Call Trace: dump_stack kasan_object_err kasan_report_error __asan_report_load2_noabort alloc_pages_current <-- use after free depot_save_stack save_stack kasan_slab_free kmem_cache_free __mpol_put <-- free do_exit This patch sets current->mempolicy to NULL before dropping the final reference. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1608301442180.63329@chino.kir.corp.google.com Fixes: cd11016e5f52 ("mm, kasan: stackdepot implementation. Enable stackdepot for SLAB") Signed-off-by: David Rientjes Reported-by: Vegard Nossum Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ++++ kernel/exit.c | 7 +------ mm/mempolicy.c | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4429d255c8ab..5e5b2969d931 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -195,6 +195,7 @@ static inline bool vma_migratable(struct vm_area_struct *vma) } extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); +extern void mpol_put_task_policy(struct task_struct *); #else @@ -297,5 +298,8 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, return -1; /* no node preference */ } +static inline void mpol_put_task_policy(struct task_struct *task) +{ +} #endif /* CONFIG_NUMA */ #endif diff --git a/kernel/exit.c b/kernel/exit.c index 2f974ae042a6..091a78be3b09 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -848,12 +848,7 @@ void do_exit(long code) TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); -#ifdef CONFIG_NUMA - task_lock(tsk); - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; - task_unlock(tsk); -#endif + mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8c4e38fb5f4..2da72a5b6ecc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2336,6 +2336,23 @@ out: return ret; } +/* + * Drop the (possibly final) reference to task->mempolicy. It needs to be + * dropped after task->mempolicy is set to NULL so that any allocation done as + * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed + * policy. + */ +void mpol_put_task_policy(struct task_struct *task) +{ + struct mempolicy *pol; + + task_lock(task); + pol = task->mempolicy; + task->mempolicy = NULL; + task_unlock(task); + mpol_put(pol); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); -- cgit v1.3-8-gc7d7 From 735f2770a770156100f534646158cb58cb8b2939 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 1 Sep 2016 16:15:13 -0700 Subject: kernel/fork: fix CLONE_CHILD_CLEARTID regression in nscd Commit fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit") has caused a subtle regression in nscd which uses CLONE_CHILD_CLEARTID to clear the nscd_certainly_running flag in the shared databases, so that the clients are notified when nscd is restarted. Now, when nscd uses a non-persistent database, clients that have it mapped keep thinking the database is being updated by nscd, when in fact nscd has created a new (anonymous) one (for non-persistent databases it uses an unlinked file as backend). The original proposal for the CLONE_CHILD_CLEARTID change claimed (https://lkml.org/lkml/2006/10/25/233): : The NPTL library uses the CLONE_CHILD_CLEARTID flag on clone() syscalls : on behalf of pthread_create() library calls. This feature is used to : request that the kernel clear the thread-id in user space (at an address : provided in the syscall) when the thread disassociates itself from the : address space, which is done in mm_release(). : : Unfortunately, when a multi-threaded process incurs a core dump (such as : from a SIGSEGV), the core-dumping thread sends SIGKILL signals to all of : the other threads, which then proceed to clear their user-space tids : before synchronizing in exit_mm() with the start of core dumping. This : misrepresents the state of process's address space at the time of the : SIGSEGV and makes it more difficult for someone to debug NPTL and glibc : problems (misleading him/her to conclude that the threads had gone away : before the fault). : : The fix below is to simply avoid the CLONE_CHILD_CLEARTID action if a : core dump has been initiated. The resulting patch from Roland (https://lkml.org/lkml/2006/10/26/269) seems to have a larger scope than the original patch asked for. It seems that limitting the scope of the check to core dumping should work for SIGSEGV issue describe above. [Changelog partly based on Andreas' description] Fixes: fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit") Link: http://lkml.kernel.org/r/1471968749-26173-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: William Preston Acked-by: Oleg Nesterov Cc: Roland McGrath Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index aaf782327bf3..93bdba13d7d9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -913,14 +913,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) deactivate_mm(tsk, mm); /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. + * Signal userspace if we're not exiting with a core dump + * because we want to leave the value intact for debugging + * purposes. */ if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && + if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has -- cgit v1.3-8-gc7d7 From 08d072599234c959b0b82b63fa252c129225a899 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 2 Sep 2016 14:38:23 +0800 Subject: tick/nohz: Fix softlockup on scheduler stalls in kvm guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tick_nohz_start_idle() is prevented to be called if the idle tick can't be stopped since commit 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter"). As a result, after suspend/resume the host machine, full dynticks kvm guest will softlockup: NMI watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [swapper/0:0] Call Trace: default_idle+0x31/0x1a0 arch_cpu_idle+0xf/0x20 default_idle_call+0x2a/0x50 cpu_startup_entry+0x39b/0x4d0 rest_init+0x138/0x140 ? rest_init+0x5/0x140 start_kernel+0x4c1/0x4ce ? set_init_arg+0x55/0x55 ? early_idt_handler_array+0x120/0x120 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x142/0x14f In addition, cat /proc/stat | grep cpu in guest or host: cpu 398 16 5049 15754 5490 0 1 46 0 0 cpu0 206 5 450 0 0 0 1 14 0 0 cpu1 81 0 3937 3149 1514 0 0 9 0 0 cpu2 45 6 332 6052 2243 0 0 11 0 0 cpu3 65 2 328 6552 1732 0 0 11 0 0 The idle and iowait states are weird 0 for cpu0(housekeeping). The bug is present in both guest and host kernels, and they both have cpu0's idle and iowait states issue, however, host kernel's suspend/resume path etc will touch watchdog to avoid the softlockup. - The watchdog will not be touched in tick_nohz_stop_idle path (need be touched since the scheduler stall is expected) if idle_active flags are not detected. - The idle and iowait states will not be accounted when exit idle loop (resched or interrupt) if idle start time and idle_active flags are not set. This patch fixes it by reverting commit 1f3b0f8243cb934 since can't stop idle tick doesn't mean can't be idle. Fixes: 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter") Signed-off-by: Wanpeng Li Cc: Sanjeev Yadav Cc: Gaurav Jindal Cc: stable@vger.kernel.org Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Peter Zijlstra Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/1472798303-4154-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 204fdc86863d..2ec7c00228f3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -908,10 +908,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); + now = tick_nohz_start_idle(ts); + if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; - now = tick_nohz_start_idle(ts); ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); -- cgit v1.3-8-gc7d7 From 58763148758057ffc447bf990321d3ea86d199a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Aug 2016 10:15:03 +0200 Subject: perf/core: Remove WARN from perf_event_read() This effectively reverts commit: 71e7bc2bab77 ("perf/core: Check return value of the perf_event_read() IPI") ... and puts in a comment explaining why we ignore the return value. Reported-by: Vegard Nossum Signed-off-by: Peter Zijlstra (Intel) Cc: David Carrillo-Cisneros Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 71e7bc2bab77 ("perf/core: Check return value of the perf_event_read() IPI") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cfabdf7b942..07ac8596a728 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3549,10 +3549,18 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); - /* The event must have been read from an online CPU: */ - WARN_ON_ONCE(ret); - ret = ret ? : data.ret; + /* + * Purposely ignore the smp_call_function_single() return + * value. + * + * If event->oncpu isn't a valid CPU it means the event got + * scheduled out and that will have updated the event count. + * + * Therefore, either way, we'll have an up-to-date event count + * after this. + */ + (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; -- cgit v1.3-8-gc7d7 From 135e8c9250dd5c8c9aae5984fde6f230d0cbfeaf Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 5 Sep 2016 13:16:40 +1000 Subject: sched/core: Fix a race between try_to_wake_up() and a woken up task The origin of the issue I've seen is related to a missing memory barrier between check for task->state and the check for task->on_rq. The task being woken up is already awake from a schedule() and is doing the following: do { schedule() set_current_state(TASK_(UN)INTERRUPTIBLE); } while (!cond); The waker, actually gets stuck doing the following in try_to_wake_up(): while (p->on_cpu) cpu_relax(); Analysis: The instance I've seen involves the following race: CPU1 CPU2 while () { if (cond) break; do { schedule(); set_current_state(TASK_UN..) } while (!cond); wakeup_routine() spin_lock_irqsave(wait_lock) raw_spin_lock_irqsave(wait_lock) wake_up_process() } try_to_wake_up() set_current_state(TASK_RUNNING); .. list_del(&waiter.list); CPU2 wakes up CPU1, but before it can get the wait_lock and set current state to TASK_RUNNING the following occurs: CPU3 wakeup_routine() raw_spin_lock_irqsave(wait_lock) if (!list_empty) wake_up_process() try_to_wake_up() raw_spin_lock_irqsave(p->pi_lock) .. if (p->on_rq && ttwu_wakeup()) .. while (p->on_cpu) cpu_relax() .. CPU3 tries to wake up the task on CPU1 again since it finds it on the wait_queue, CPU1 is spinning on wait_lock, but immediately after CPU2, CPU3 got it. CPU3 checks the state of p on CPU1, it is TASK_UNINTERRUPTIBLE and the task is spinning on the wait_lock. Interestingly since p->on_rq is checked under pi_lock, I've noticed that try_to_wake_up() finds p->on_rq to be 0. This was the most confusing bit of the analysis, but p->on_rq is changed under runqueue lock, rq_lock, the p->on_rq check is not reliable without this fix IMHO. The race is visible (based on the analysis) only when ttwu_queue() does a remote wakeup via ttwu_queue_remote. In which case the p->on_rq change is not done uder the pi_lock. The result is that after a while the entire system locks up on the raw_spin_irqlock_save(wait_lock) and the holder spins infintely Reproduction of the issue: The issue can be reproduced after a long run on my system with 80 threads and having to tweak available memory to very low and running memory stress-ng mmapfork test. It usually takes a long time to reproduce. I am trying to work on a test case that can reproduce the issue faster, but thats work in progress. I am still testing the changes on my still in a loop and the tests seem OK thus far. Big thanks to Benjamin and Nick for helping debug this as well. Ben helped catch the missing barrier, Nick caught every missing bit in my theory. Signed-off-by: Balbir Singh [ Updated comment to clarify matching barriers. Many architectures do not have a full barrier in switch_to() so that cannot be relied upon. ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Benjamin Herrenschmidt Cc: Alexey Kardashevskiy Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/e02cce7b-d9ca-1ad0-7a61-ea97c7582b37@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..44817c640e99 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2016,6 +2016,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; -- cgit v1.3-8-gc7d7 From c86d06ba2818c5126078cb0cf4e0175ec381045b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Sep 2016 08:38:13 -0400 Subject: PM / QoS: avoid calling cancel_delayed_work_sync() during early boot of_clk_init() ends up calling into pm_qos_update_request() very early during boot where irq is expected to stay disabled. pm_qos_update_request() uses cancel_delayed_work_sync() which correctly assumes that irq is enabled on invocation and unconditionally disables and re-enables it. Gate cancel_delayed_work_sync() invocation with kevented_up() to avoid enabling irq unexpectedly during early boot. Signed-off-by: Tejun Heo Reported-and-tested-by: Qiao Zhou Link: http://lkml.kernel.org/r/d2501c4c-8e7b-bea3-1b01-000b36b5dfe9@asrmicro.com Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - cancel_delayed_work_sync(&req->work); + /* + * This function may be called very early during boot, for example, + * from of_clk_init(), where irq needs to stay disabled. + * cancel_delayed_work_sync() assumes that irq is enabled on + * invocation and re-enables it on return. Avoid calling it until + * workqueue is initialized. + */ + if (keventd_up()) + cancel_delayed_work_sync(&req->work); + __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); -- cgit v1.3-8-gc7d7 From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Sep 2016 08:51:21 -0700 Subject: mm: fix cache mode of dax pmd mappings track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as uncacheable rendering them impractical for application usage. DAX-pte mappings are cached and the goal of establishing DAX-pmd mappings is to attain more performance, not dramatically less (3 orders of magnitude). track_pfn_insert() relies on a previous call to reserve_memtype() to establish the expected page_cache_mode for the range. While memremap() arranges for reserve_memtype() to be called, devm_memremap_pages() does not. So, teach track_pfn_insert() and untrack_pfn() how to handle tracking without a vma, and arrange for devm_memremap_pages() to establish the write-back-cache reservation in the memtype tree. Cc: Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Nilesh Choudhury Cc: Kirill A. Shutemov Reported-by: Toshi Kani Reported-by: Kai Zhang Acked-by: Andrew Morton Signed-off-by: Dan Williams --- arch/x86/mm/pat.c | 17 ++++++++++------- kernel/memremap.c | 9 +++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ecb1b69c1651..170cc4ff057b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma) } /* - * prot is passed in as a parameter for the new mapping. If the vma has a - * linear pfn mapping for the entire range reserve the entire vma range with - * single reserve_pfn_range call. + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) @@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ - if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); - if (!ret) + if (ret == 0 && vma) vma->vm_flags |= VM_PAT; return ret; } @@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!(vma->vm_flags & VM_PAT)) + if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - vma->vm_flags &= ~VM_PAT; + if (vma) + vma->vm_flags &= ~VM_PAT; } /* diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); -- cgit v1.3-8-gc7d7 From 767ae08678c2c796bcd7f582ee457aee20a28a1e Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:49 +0300 Subject: perf/core: Fix a race between mmap_close() and set_output() of AUX events In the mmap_close() path we need to stop all the AUX events that are writing data to the AUX area that we are unmapping, before we can safely free the pages. To determine if an event needs to be stopped, we're comparing its ->rb against the one that's getting unmapped. However, a SET_OUTPUT ioctl may turn up inside an AUX transaction and swizzle event::rb to some other ring buffer, but the transaction will keep writing data to the old ring buffer until the event gets scheduled out. At this point, mmap_close() will skip over such an event and will proceed to free the AUX area, while it's still being used by this event, which will set off a warning in the mmap_close() path and cause a memory corruption. To avoid this, always stop an AUX event before its ->rb is updated; this will release the (potentially) last reference on the AUX area of the buffer. If the event gets restarted, its new ring buffer will be used. If another SET_OUTPUT comes and switches it back to the old ring buffer that's getting unmapped, it's also fine: this ring buffer's aux_mmap_count will be zero and AUX transactions won't start any more. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 07ac8596a728..a54f2c2cdb20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2496,11 +2496,11 @@ static int __perf_event_stop(void *info) return 0; } -static int perf_event_restart(struct perf_event *event) +static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, - .restart = 1, + .restart = restart, }; int ret = 0; @@ -4845,6 +4845,19 @@ static void ring_buffer_attach(struct perf_event *event, spin_unlock_irqrestore(&rb->event_lock, flags); } + /* + * Avoid racing with perf_mmap_close(AUX): stop the event + * before swizzling the event::rb pointer; if it's getting + * unmapped, its aux_mmap_count will be 0 and it won't + * restart. See the comment in __perf_pmu_output_stop(). + * + * Data will inevitably be lost when set_output is done in + * mid-air, but then again, whoever does it like this is + * not in for the data anyway. + */ + if (has_aux(event)) + perf_event_stop(event, 0); + rcu_assign_pointer(event->rb, rb); if (old_rb) { @@ -6120,7 +6133,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } void perf_event_exec(void) @@ -6164,7 +6177,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) /* * In case of inheritance, it will be the parent that links to the - * ring-buffer, but it will be the child that's actually using it: + * ring-buffer, but it will be the child that's actually using it. + * + * We are using event::rb to determine if the event should be stopped, + * however this may race with ring_buffer_attach() (through set_output), + * which will make us skip the event that actually needs to be stopped. + * So ring_buffer_attach() has to stop an aux event before re-assigning + * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); @@ -6678,7 +6697,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -7867,7 +7886,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) mmput(mm); restart: - perf_event_restart(event); + perf_event_stop(event, 1); } /* -- cgit v1.3-8-gc7d7 From b79ccadd6bb10e72cf784a298ca6dc1398eb9a24 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:50 +0300 Subject: perf/core: Fix aux_mmap_count vs aux_refcount order The order of accesses to ring buffer's aux_mmap_count and aux_refcount has to be preserved across the users, namely perf_mmap_close() and perf_aux_output_begin(), otherwise the inversion can result in the latter holding the last reference to the aux buffer and subsequently free'ing it in atomic context, triggering a warning. > ------------[ cut here ]------------ > WARNING: CPU: 0 PID: 257 at kernel/events/ring_buffer.c:541 __rb_free_aux+0x11a/0x130 > CPU: 0 PID: 257 Comm: stopbug Not tainted 4.8.0-rc1+ #2596 > Call Trace: > [] __warn+0xcb/0xf0 > [] warn_slowpath_null+0x1d/0x20 > [] __rb_free_aux+0x11a/0x130 > [] rb_free_aux+0x18/0x20 > [] perf_aux_output_begin+0x163/0x1e0 > [] bts_event_start+0x3a/0xd0 > [] bts_event_add+0x5d/0x80 > [] event_sched_in.isra.104+0xf6/0x2f0 > [] group_sched_in+0x6e/0x190 > [] ctx_sched_in+0x2fe/0x5f0 > [] perf_event_sched_in+0x60/0x80 > [] ctx_resched+0x5b/0x90 > [] __perf_event_enable+0x1e1/0x240 > [] event_function+0xa9/0x180 > [] ? perf_cgroup_attach+0x70/0x70 > [] remote_function+0x3f/0x50 > [] flush_smp_call_function_queue+0x83/0x150 > [] generic_smp_call_function_single_interrupt+0x13/0x60 > [] smp_call_function_single_interrupt+0x27/0x40 > [] call_function_single_interrupt+0x89/0x90 > [] finish_task_switch+0xa6/0x210 > [] ? finish_task_switch+0x67/0x210 > [] __schedule+0x3dd/0xb50 > [] schedule+0x35/0x80 > [] sys_sched_yield+0x61/0x70 > [] entry_SYSCALL_64_fastpath+0x18/0xa8 > ---[ end trace 6235f556f5ea83a9 ]--- This patch puts the checks in perf_aux_output_begin() in the same order as that of perf_mmap_close(). Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ae9b90dc9a5a..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!rb) return NULL; - if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + if (!rb_has_aux(rb)) goto err; /* - * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), - * the aux buffer is in perf_mmap_close(), about to get freed. + * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), + * about to get freed, so we leave immediately. + * + * Checking rb::aux_mmap_count and rb::refcount has to be done in + * the same order, see perf_mmap_close. Otherwise we end up freeing + * aux pages in this path, which is a bug, because in_atomic(). */ if (!atomic_read(&rb->aux_mmap_count)) - goto err_put; + goto err; + + if (!atomic_inc_not_zero(&rb->aux_refcount)) + goto err; /* * Nesting is not supported for AUX area, make sure nested -- cgit v1.3-8-gc7d7 From 28b89b9e6f7b6c8fef7b3af39828722bca20cfee Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Sun, 11 Sep 2016 21:14:58 -0700 Subject: cpuset: handle race between CPU hotplug and cpuset_hotplug_work A discrepancy between cpu_online_mask and cpuset's effective_cpus mask is inevitable during hotplug since cpuset defers updating of effective_cpus mask using a workqueue, during which time nothing prevents the system from more hotplug operations. For that reason guarantee_online_cpus() walks up the cpuset hierarchy until it finds an intersection under the assumption that top cpuset's effective_cpus mask intersects with cpu_online_mask even with such a race occurring. However a sequence of CPU hotplugs can open a time window, during which none of the effective CPUs in the top cpuset intersect with cpu_online_mask. For example when there are 4 possible CPUs 0-3 and only CPU0 is online: ======================== =========================== cpu_online_mask top_cpuset.effective_cpus ======================== =========================== echo 1 > cpu2/online. CPU hotplug notifier woke up hotplug work but not yet scheduled. [0,2] [0] echo 0 > cpu0/online. The workqueue is still runnable. [2] [0] ======================== =========================== Now there is no intersection between cpu_online_mask and top_cpuset.effective_cpus. Thus invoking sys_sched_setaffinity() at this moment can cause following: Unable to handle kernel NULL pointer dereference at virtual address 000000d0 ------------[ cut here ]------------ Kernel BUG at ffffffc0001389b0 [verbose debug info unavailable] Internal error: Oops - BUG: 96000005 [#1] PREEMPT SMP Modules linked in: CPU: 2 PID: 1420 Comm: taskset Tainted: G W 4.4.8+ #98 task: ffffffc06a5c4880 ti: ffffffc06e124000 task.ti: ffffffc06e124000 PC is at guarantee_online_cpus+0x2c/0x58 LR is at cpuset_cpus_allowed+0x4c/0x6c Process taskset (pid: 1420, stack limit = 0xffffffc06e124020) Call trace: [] guarantee_online_cpus+0x2c/0x58 [] cpuset_cpus_allowed+0x4c/0x6c [] sched_setaffinity+0xc0/0x1ac [] SyS_sched_setaffinity+0x98/0xac [] el0_svc_naked+0x24/0x28 The top cpuset's effective_cpus are guaranteed to be identical to cpu_online_mask eventually. Hence fall back to cpu_online_mask when there is no intersection between top cpuset's effective_cpus and cpu_online_mask. Signed-off-by: Joonwoo Park Acked-by: Li Zefan Cc: Tejun Heo Cc: cgroups@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: # 3.17+ Signed-off-by: Tejun Heo --- kernel/cpuset.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c27e53326bef..c08585ad996b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. The top - * cpuset always has some cpus online. + * until we find one that does have some online cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. @@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { cs = parent_cs(cs); + if (unlikely(!cs)) { + /* + * The top cpuset doesn't have any online cpu as a + * consequence of a race between cpuset_hotplug_work + * and cpu hotplug notifier. But we know the top + * cpuset's effective_cpus is on its way to to be + * identical to cpu_online_mask. + */ + cpumask_copy(pmask, cpu_online_mask); + return; + } + } cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } -- cgit v1.3-8-gc7d7 From 8a15b81741879fa89601b03c0e50b0d780d65bc0 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 16 Sep 2016 13:02:37 +0000 Subject: cpuset: fix non static symbol warning Fixes the following sparse warning: kernel/cpuset.c:2088:6: warning: symbol 'cpuset_fork' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Tejun Heo --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c08585ad996b..2b4c20ab5bbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2085,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. */ -void cpuset_fork(struct task_struct *task) +static void cpuset_fork(struct task_struct *task) { if (task_css_is_root(task, cpuset_cgrp_id)) return; -- cgit v1.3-8-gc7d7 From 1984e075915cbae65336a99b1879865080d8e55e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Sep 2016 09:49:27 +0100 Subject: genirq: Skip chained interrupt trigger setup if type is IRQ_TYPE_NONE There is no point in trying to configure the trigger of a chained interrupt if no trigger information has been configured. At best this is ignored, and at the worse this confuses the underlying irqchip (which is likely not to handle such a thing), and unnecessarily alarms the user. Only apply the configuration if type is not IRQ_TYPE_NONE. Fixes: 1e12c4a9393b ("genirq: Correctly configure the trigger on chained interrupts") Reported-and-tested-by: Geert Uytterhoeven Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/CAMuHMdVW1eTn20=EtYcJ8hkVwohaSuH_yQXrY2MGBEvZ8fpFOg@mail.gmail.com Link: http://lkml.kernel.org/r/1474274967-15984-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 637389088b3f..26ba5654d9d5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -820,6 +820,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, desc->name = name; if (handle != handle_bad_irq && is_chained) { + unsigned int type = irqd_get_trigger_type(&desc->irq_data); + /* * We're about to start this interrupt immediately, * hence the need to set the trigger configuration. @@ -828,8 +830,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, * chained interrupt. Reset it immediately because we * do know better. */ - __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data)); - desc->handle_irq = handle; + if (type != IRQ_TYPE_NONE) { + __irq_set_trigger(desc, type); + desc->handle_irq = handle; + } irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); -- cgit v1.3-8-gc7d7 From d979a39d7242e0601bf9b60e89628fb8ac577179 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 19 Sep 2016 14:44:38 -0700 Subject: cgroup: duplicate cgroup reference when cloning sockets When a socket is cloned, the associated sock_cgroup_data is duplicated but not its reference on the cgroup. As a result, the cgroup reference count will underflow when both sockets are destroyed later on. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Link: http://lkml.kernel.org/r/20160914194846.11153-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++++ net/core/sock.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..5e8dab5bf9ad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6270,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { diff --git a/net/core/sock.c b/net/core/sock.c index 25dab8b60223..fd7b41edf1ce 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,7 +1362,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); - cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1422,6 +1421,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); } @@ -1566,6 +1566,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + + cgroup_sk_alloc(&newsk->sk_cgrp_data); + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) -- cgit v1.3-8-gc7d7 From 3bf6215a1b30db7df6083c708caab3fe1a8e8abe Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 20 Sep 2016 18:48:11 +0300 Subject: perf/core: Limit matching exclusive events to one PMU An "exclusive" PMU is the one that can only have one event scheduled in at any given time. There may be more than one of such PMUs in a system, though, like Intel PT and BTS. It should be allowed to have one event for either of those inside the same context (there may be other constraints that may prevent this, but those would be hardware-specific). However, the exclusivity code is written so that only one event from any of the "exclusive" PMUs is allowed in a context. Fix this by making the exclusive event filter explicitly match two events' PMUs. Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160920154811.3255-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a54f2c2cdb20..fc9bb2225291 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3929,7 +3929,7 @@ static void exclusive_event_destroy(struct perf_event *event) static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { - if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) -- cgit v1.3-8-gc7d7 From 9157056da8f8c4a6305f15619e269f164b63a6de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 23 Sep 2016 16:55:49 -0400 Subject: cgroup: fix invalid controller enable rejections with cgroup namespace On the v2 hierarchy, "cgroup.subtree_control" rejects controller enables if the cgroup has processes in it. The enforcement of this logic assumes that the cgroup wouldn't have any css_sets associated with it if there are no tasks in the cgroup, which is no longer true since a79a908fd2b0 ("cgroup: introduce cgroup namespaces"). When a cgroup namespace is created, it pins the css_set of the creating task to use it as the root css_set of the namespace. This extra reference stays as long as the namespace is around and makes "cgroup.subtree_control" think that the namespace root cgroup is not empty even when it is and thus reject controller enables. Fix it by making cgroup_subtree_control() walk and test emptiness of each css_set instead of testing whether the list_head is empty. While at it, update the comment of cgroup_task_count() to indicate that the returned value may be higher than the number of tasks, which has always been true due to temporary references and doesn't break anything. Signed-off-by: Tejun Heo Reported-by: Evgeny Vereshchagin Cc: Serge E. Hallyn Cc: Aditya Kali Cc: Eric W. Biederman Cc: stable@vger.kernel.org # v4.6+ Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Link: https://github.com/systemd/systemd/pull/3589#issuecomment-249089541 --- kernel/cgroup.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..0d4ee1ea5c31 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3446,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { - ret = -EBUSY; - goto out_unlock; + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; + + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. + */ + spin_lock_irq(&css_set_lock); + + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } + } + + spin_unlock_irq(&css_set_lock); + + if (ret) + goto out_unlock; } /* save and update control masks and prepare csses */ @@ -3899,7 +3918,9 @@ void cgroup_file_notify(struct cgroup_file *cfile) * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * - * Return the number of tasks in the cgroup. + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. */ static int cgroup_task_count(const struct cgroup *cgrp) { -- cgit v1.3-8-gc7d7 From 1245800c0f96eb6ebb368593e251d66c01e61022 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 23 Sep 2016 22:57:13 -0400 Subject: tracing: Move mutex to protect against resetting of seq data The iter->seq can be reset outside the protection of the mutex. So can reading of user data. Move the mutex up to the beginning of the function. Fixes: d7350c3f45694 ("tracing/core: make the read callbacks reentrants") Cc: stable@vger.kernel.org # 2.6.30+ Reported-by: Al Viro Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a4bd6b68a0b..8fb4847b0450 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4890,19 +4890,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; ssize_t sret; - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_init(&iter->seq); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ mutex_lock(&iter->mutex); + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + goto out; + + trace_seq_init(&iter->seq); + if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) -- cgit v1.3-8-gc7d7 From 1ae2293dd6d2f5c823cf97e60b70d03631cd622f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 17 Sep 2016 18:31:46 -0400 Subject: fix memory leaks in tracing_buffers_splice_read() Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- kernel/trace/trace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8fb4847b0450..77eeab2776ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5930,9 +5930,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) return -EINVAL; @@ -5942,6 +5939,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); @@ -5999,19 +5999,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { if (ret) - return ret; + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) - return -EAGAIN; + goto out; ret = wait_on_pipe(iter, true); if (ret) - return ret; + goto out; goto again; } ret = splice_to_pipe(pipe, &spd); +out: splice_shrink_spd(&spd); return ret; -- cgit v1.3-8-gc7d7