From 97c79a38cd454602645f0470ffb444b3b75ce574 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 Apr 2016 13:16:33 -0300
Subject: perf core: Per event callchain limit

Additionally to being able to control the system wide maximum depth via
/proc/sys/kernel/perf_event_max_stack, now we are able to ask for
different depths per event, using perf_event_attr.sample_max_stack for
that.

This uses an u16 hole at the end of perf_event_attr, that, when
perf_event_attr.sample_type has the PERF_SAMPLE_CALLCHAIN, if
sample_max_stack is zero, means use perf_event_max_stack, otherwise
it'll be bounds checked under callchain_mutex.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/bpf/stackmap.c     |  2 +-
 kernel/events/callchain.c | 14 ++++++++++++--
 kernel/events/core.c      |  5 ++++-
 3 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a82d7605db3f..f1de5c1a2af6 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_smap;
 
-	err = get_callchain_buffers();
+	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
 		goto free_smap;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 179ef4640964..e9fdb5203de5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -104,7 +104,7 @@ fail:
 	return -ENOMEM;
 }
 
-int get_callchain_buffers(void)
+int get_callchain_buffers(int event_max_stack)
 {
 	int err = 0;
 	int count;
@@ -121,6 +121,15 @@ int get_callchain_buffers(void)
 		/* If the allocation failed, give up */
 		if (!callchain_cpus_entries)
 			err = -ENOMEM;
+		/*
+		 * If requesting per event more than the global cap,
+		 * return a different error to help userspace figure
+		 * this out.
+		 *
+		 * And also do it here so that we have &callchain_mutex held.
+		 */
+		if (event_max_stack > sysctl_perf_event_max_stack)
+			err = -EOVERFLOW;
 		goto exit;
 	}
 
@@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	bool user   = !event->attr.exclude_callchain_user;
 	/* Disallow cross-task user callchains. */
 	bool crosstask = event->ctx->task && event->ctx->task != current;
+	const u32 max_stack = event->attr.sample_max_stack;
 
 	if (!kernel && !user)
 		return NULL;
 
-	return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
+	return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
 }
 
 struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 050a290c72c7..79363f298445 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8843,7 +8843,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	if (!event->parent) {
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
-			err = get_callchain_buffers();
+			err = get_callchain_buffers(attr->sample_max_stack);
 			if (err)
 				goto err_addr_filters;
 		}
@@ -9165,6 +9165,9 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	if (!attr.sample_max_stack)
+		attr.sample_max_stack = sysctl_perf_event_max_stack;
+
 	/*
 	 * In cgroup mode, the pid argument is used to pass the fd
 	 * opened to the cgroup directory in cgroupfs. The cpu argument
-- 
cgit v1.3-8-gc7d7


From 2b4c7afe79a8a0a0e05edeaded5653c190153f9b Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Sun, 15 May 2016 22:47:39 -0400
Subject: audit: fixup: log on errors from filter user rules

In commit 724e4fcc the intention was to pass any errors back from
audit_filter_user_rules() to audit_filter_user().  Add that code.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditfilter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 94ca7b1e5e7e..8a8aa3fbc8d8 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1339,8 +1339,8 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type,
 			break;
 		}
 
-		if (!result)
-			return 0;
+		if (result <= 0)
+			return result;
 	}
 	switch (rule->action) {
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-- 
cgit v1.3-8-gc7d7


From e788892ba3cc71d385b75895f7a375fbc659ce86 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 2 Jun 2016 23:24:15 +0200
Subject: cpufreq: governor: Get rid of governor events

The design of the cpufreq governor API is not very straightforward,
as struct cpufreq_governor provides only one callback to be invoked
from different code paths for different purposes.  The purpose it is
invoked for is determined by its second "event" argument, causing it
to act as a "callback multiplexer" of sorts.

Unfortunately, that leads to extra complexity in governors, some of
which implement the ->governor() callback as a switch statement
that simply checks the event argument and invokes a separate function
to handle that specific event.

That extra complexity can be eliminated by replacing the all-purpose
->governor() callback with a family of callbacks to carry out specific
governor operations: initialization and exit, start and stop and policy
limits updates.  That also turns out to reduce the code size too, so
do it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/powerpc/platforms/cell/cpufreq_spudemand.c |  72 ++++++++--------
 drivers/cpufreq/cpufreq.c                       |  31 ++++---
 drivers/cpufreq/cpufreq_conservative.c          |   7 +-
 drivers/cpufreq/cpufreq_governor.c              |  39 +++------
 drivers/cpufreq/cpufreq_governor.h              |  20 ++++-
 drivers/cpufreq/cpufreq_ondemand.c              |   7 +-
 drivers/cpufreq/cpufreq_performance.c           |  17 +---
 drivers/cpufreq/cpufreq_powersave.c             |  17 +---
 drivers/cpufreq/cpufreq_userspace.c             | 104 ++++++++++++------------
 include/linux/cpufreq.h                         |  14 ++--
 kernel/sched/cpufreq_schedutil.c                |  34 ++------
 11 files changed, 158 insertions(+), 204 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
index 82607d621aca..88301e53f085 100644
--- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c
+++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
@@ -85,61 +85,57 @@ static void spu_gov_cancel_work(struct spu_gov_info_struct *info)
 	cancel_delayed_work_sync(&info->work);
 }
 
-static int spu_gov_govern(struct cpufreq_policy *policy, unsigned int event)
+static int spu_gov_start(struct cpufreq_policy *policy)
 {
 	unsigned int cpu = policy->cpu;
-	struct spu_gov_info_struct *info, *affected_info;
+	struct spu_gov_info_struct *info = &per_cpu(spu_gov_info, cpu);
+	struct spu_gov_info_struct *affected_info;
 	int i;
-	int ret = 0;
 
-	info = &per_cpu(spu_gov_info, cpu);
-
-	switch (event) {
-	case CPUFREQ_GOV_START:
-		if (!cpu_online(cpu)) {
-			printk(KERN_ERR "cpu %d is not online\n", cpu);
-			ret = -EINVAL;
-			break;
-		}
+	if (!cpu_online(cpu)) {
+		printk(KERN_ERR "cpu %d is not online\n", cpu);
+		return -EINVAL;
+	}
 
-		if (!policy->cur) {
-			printk(KERN_ERR "no cpu specified in policy\n");
-			ret = -EINVAL;
-			break;
-		}
+	if (!policy->cur) {
+		printk(KERN_ERR "no cpu specified in policy\n");
+		return -EINVAL;
+	}
 
-		/* initialize spu_gov_info for all affected cpus */
-		for_each_cpu(i, policy->cpus) {
-			affected_info = &per_cpu(spu_gov_info, i);
-			affected_info->policy = policy;
-		}
+	/* initialize spu_gov_info for all affected cpus */
+	for_each_cpu(i, policy->cpus) {
+		affected_info = &per_cpu(spu_gov_info, i);
+		affected_info->policy = policy;
+	}
 
-		info->poll_int = POLL_TIME;
+	info->poll_int = POLL_TIME;
 
-		/* setup timer */
-		spu_gov_init_work(info);
+	/* setup timer */
+	spu_gov_init_work(info);
 
-		break;
+	return 0;
+}
 
-	case CPUFREQ_GOV_STOP:
-		/* cancel timer */
-		spu_gov_cancel_work(info);
+static void spu_gov_stop(struct cpufreq_policy *policy)
+{
+	unsigned int cpu = policy->cpu;
+	struct spu_gov_info_struct *info = &per_cpu(spu_gov_info, cpu);
+	int i;
 
-		/* clean spu_gov_info for all affected cpus */
-		for_each_cpu (i, policy->cpus) {
-			info = &per_cpu(spu_gov_info, i);
-			info->policy = NULL;
-		}
+	/* cancel timer */
+	spu_gov_cancel_work(info);
 
-		break;
+	/* clean spu_gov_info for all affected cpus */
+	for_each_cpu (i, policy->cpus) {
+		info = &per_cpu(spu_gov_info, i);
+		info->policy = NULL;
 	}
-
-	return ret;
 }
 
 static struct cpufreq_governor spu_governor = {
 	.name = "spudemand",
-	.governor = spu_gov_govern,
+	.start = spu_gov_start,
+	.stop = spu_gov_stop,
 	.owner = THIS_MODULE,
 };
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 268566e40684..d98ff688b2f1 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2023,10 +2023,12 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy)
 
 	pr_debug("%s: for CPU %u\n", __func__, policy->cpu);
 
-	ret = policy->governor->governor(policy, CPUFREQ_GOV_POLICY_INIT);
-	if (ret) {
-		module_put(policy->governor->owner);
-		return ret;
+	if (policy->governor->init) {
+		ret = policy->governor->init(policy);
+		if (ret) {
+			module_put(policy->governor->owner);
+			return ret;
+		}
 	}
 
 	policy->governor->initialized++;
@@ -2040,7 +2042,8 @@ static void cpufreq_exit_governor(struct cpufreq_policy *policy)
 
 	pr_debug("%s: for CPU %u\n", __func__, policy->cpu);
 
-	policy->governor->governor(policy, CPUFREQ_GOV_POLICY_EXIT);
+	if (policy->governor->exit)
+		policy->governor->exit(policy);
 
 	policy->governor->initialized--;
 	module_put(policy->governor->owner);
@@ -2061,11 +2064,15 @@ static int cpufreq_start_governor(struct cpufreq_policy *policy)
 	if (cpufreq_driver->get && !cpufreq_driver->setpolicy)
 		cpufreq_update_current_freq(policy);
 
-	ret = policy->governor->governor(policy, CPUFREQ_GOV_START);
-	if (ret)
-		return ret;
+	if (policy->governor->start) {
+		ret = policy->governor->start(policy);
+		if (ret)
+			return ret;
+	}
+
+	if (policy->governor->limits)
+		policy->governor->limits(policy);
 
-	policy->governor->governor(policy, CPUFREQ_GOV_LIMITS);
 	return 0;
 }
 
@@ -2076,7 +2083,8 @@ static void cpufreq_stop_governor(struct cpufreq_policy *policy)
 
 	pr_debug("%s: for CPU %u\n", __func__, policy->cpu);
 
-	policy->governor->governor(policy, CPUFREQ_GOV_STOP);
+	if (policy->governor->stop)
+		policy->governor->stop(policy);
 }
 
 static void cpufreq_governor_limits(struct cpufreq_policy *policy)
@@ -2086,7 +2094,8 @@ static void cpufreq_governor_limits(struct cpufreq_policy *policy)
 
 	pr_debug("%s: for CPU %u\n", __func__, policy->cpu);
 
-	policy->governor->governor(policy, CPUFREQ_GOV_LIMITS);
+	if (policy->governor->limits)
+		policy->governor->limits(policy);
 }
 
 int cpufreq_register_governor(struct cpufreq_governor *governor)
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 316df247e00d..2568508fca38 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -313,12 +313,7 @@ static void cs_start(struct cpufreq_policy *policy)
 }
 
 static struct dbs_governor cs_dbs_gov = {
-	.gov = {
-		.name = "conservative",
-		.governor = cpufreq_governor_dbs,
-		.max_transition_latency = TRANSITION_LATENCY_LIMIT,
-		.owner = THIS_MODULE,
-	},
+	.gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"),
 	.kobj_type = { .default_attrs = cs_attributes },
 	.gov_dbs_timer = cs_dbs_timer,
 	.alloc = cs_alloc,
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index be498d56dd69..ca9927c15a71 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -389,7 +389,7 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs,
 	gov->free(policy_dbs);
 }
 
-static int cpufreq_governor_init(struct cpufreq_policy *policy)
+int cpufreq_dbs_governor_init(struct cpufreq_policy *policy)
 {
 	struct dbs_governor *gov = dbs_governor_of(policy);
 	struct dbs_data *dbs_data;
@@ -474,8 +474,9 @@ out:
 	mutex_unlock(&gov_dbs_data_mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_init);
 
-static int cpufreq_governor_exit(struct cpufreq_policy *policy)
+void cpufreq_dbs_governor_exit(struct cpufreq_policy *policy)
 {
 	struct dbs_governor *gov = dbs_governor_of(policy);
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
@@ -500,10 +501,10 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy)
 	free_policy_dbs_info(policy_dbs, gov);
 
 	mutex_unlock(&gov_dbs_data_mutex);
-	return 0;
 }
+EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_exit);
 
-static int cpufreq_governor_start(struct cpufreq_policy *policy)
+int cpufreq_dbs_governor_start(struct cpufreq_policy *policy)
 {
 	struct dbs_governor *gov = dbs_governor_of(policy);
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
@@ -539,14 +540,15 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy)
 	gov_set_update_util(policy_dbs, sampling_rate);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_start);
 
-static int cpufreq_governor_stop(struct cpufreq_policy *policy)
+void cpufreq_dbs_governor_stop(struct cpufreq_policy *policy)
 {
 	gov_cancel_work(policy);
-	return 0;
 }
+EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_stop);
 
-static int cpufreq_governor_limits(struct cpufreq_policy *policy)
+void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy)
 {
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
 
@@ -560,26 +562,5 @@ static int cpufreq_governor_limits(struct cpufreq_policy *policy)
 	gov_update_sample_delay(policy_dbs, 0);
 
 	mutex_unlock(&policy_dbs->timer_mutex);
-
-	return 0;
-}
-
-int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event)
-{
-	if (event == CPUFREQ_GOV_POLICY_INIT) {
-		return cpufreq_governor_init(policy);
-	} else if (policy->governor_data) {
-		switch (event) {
-		case CPUFREQ_GOV_POLICY_EXIT:
-			return cpufreq_governor_exit(policy);
-		case CPUFREQ_GOV_START:
-			return cpufreq_governor_start(policy);
-		case CPUFREQ_GOV_STOP:
-			return cpufreq_governor_stop(policy);
-		case CPUFREQ_GOV_LIMITS:
-			return cpufreq_governor_limits(policy);
-		}
-	}
-	return -EINVAL;
 }
-EXPORT_SYMBOL_GPL(cpufreq_governor_dbs);
+EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits);
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 34eb214b6d57..36f0d19dd869 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -148,6 +148,25 @@ static inline struct dbs_governor *dbs_governor_of(struct cpufreq_policy *policy
 	return container_of(policy->governor, struct dbs_governor, gov);
 }
 
+/* Governor callback routines */
+int cpufreq_dbs_governor_init(struct cpufreq_policy *policy);
+void cpufreq_dbs_governor_exit(struct cpufreq_policy *policy);
+int cpufreq_dbs_governor_start(struct cpufreq_policy *policy);
+void cpufreq_dbs_governor_stop(struct cpufreq_policy *policy);
+void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy);
+
+#define CPUFREQ_DBS_GOVERNOR_INITIALIZER(_name_)			\
+	{								\
+		.name = _name_,						\
+		.max_transition_latency	= TRANSITION_LATENCY_LIMIT,	\
+		.owner = THIS_MODULE,					\
+		.init = cpufreq_dbs_governor_init,			\
+		.exit = cpufreq_dbs_governor_exit,			\
+		.start = cpufreq_dbs_governor_start,			\
+		.stop = cpufreq_dbs_governor_stop,			\
+		.limits = cpufreq_dbs_governor_limits,			\
+	}
+
 /* Governor specific operations */
 struct od_ops {
 	unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy,
@@ -155,7 +174,6 @@ struct od_ops {
 };
 
 unsigned int dbs_update(struct cpufreq_policy *policy);
-int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event);
 void od_register_powersave_bias_handler(unsigned int (*f)
 		(struct cpufreq_policy *, unsigned int, unsigned int),
 		unsigned int powersave_bias);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 300163430516..09d6c0c405e4 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -420,12 +420,7 @@ static struct od_ops od_ops = {
 };
 
 static struct dbs_governor od_dbs_gov = {
-	.gov = {
-		.name = "ondemand",
-		.governor = cpufreq_governor_dbs,
-		.max_transition_latency	= TRANSITION_LATENCY_LIMIT,
-		.owner = THIS_MODULE,
-	},
+	.gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"),
 	.kobj_type = { .default_attrs = od_attributes },
 	.gov_dbs_timer = od_dbs_timer,
 	.alloc = od_alloc,
diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c
index dd4dca3ab844..dafb679adc58 100644
--- a/drivers/cpufreq/cpufreq_performance.c
+++ b/drivers/cpufreq/cpufreq_performance.c
@@ -16,25 +16,16 @@
 #include <linux/init.h>
 #include <linux/module.h>
 
-static int cpufreq_governor_performance(struct cpufreq_policy *policy,
-					unsigned int event)
+static void cpufreq_gov_performance_limits(struct cpufreq_policy *policy)
 {
-	switch (event) {
-	case CPUFREQ_GOV_LIMITS:
-		pr_debug("setting to %u kHz\n", policy->max);
-		__cpufreq_driver_target(policy, policy->max,
-						CPUFREQ_RELATION_H);
-		break;
-	default:
-		break;
-	}
-	return 0;
+	pr_debug("setting to %u kHz\n", policy->max);
+	__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
 }
 
 static struct cpufreq_governor cpufreq_gov_performance = {
 	.name		= "performance",
-	.governor	= cpufreq_governor_performance,
 	.owner		= THIS_MODULE,
+	.limits		= cpufreq_gov_performance_limits,
 };
 
 static int __init cpufreq_gov_performance_init(void)
diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c
index 691e573e46ee..78a651038faf 100644
--- a/drivers/cpufreq/cpufreq_powersave.c
+++ b/drivers/cpufreq/cpufreq_powersave.c
@@ -16,24 +16,15 @@
 #include <linux/init.h>
 #include <linux/module.h>
 
-static int cpufreq_governor_powersave(struct cpufreq_policy *policy,
-					unsigned int event)
+static void cpufreq_gov_powersave_limits(struct cpufreq_policy *policy)
 {
-	switch (event) {
-	case CPUFREQ_GOV_LIMITS:
-		pr_debug("setting to %u kHz\n", policy->min);
-		__cpufreq_driver_target(policy, policy->min,
-						CPUFREQ_RELATION_L);
-		break;
-	default:
-		break;
-	}
-	return 0;
+	pr_debug("setting to %u kHz\n", policy->min);
+	__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
 }
 
 static struct cpufreq_governor cpufreq_gov_powersave = {
 	.name		= "powersave",
-	.governor	= cpufreq_governor_powersave,
+	.limits		= cpufreq_gov_powersave_limits,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index 9f3dec9a3f36..bd897e3e134d 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -65,66 +65,66 @@ static int cpufreq_userspace_policy_init(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static int cpufreq_governor_userspace(struct cpufreq_policy *policy,
-				   unsigned int event)
+static void cpufreq_userspace_policy_exit(struct cpufreq_policy *policy)
+{
+	mutex_lock(&userspace_mutex);
+	kfree(policy->governor_data);
+	policy->governor_data = NULL;
+	mutex_unlock(&userspace_mutex);
+}
+
+static int cpufreq_userspace_policy_start(struct cpufreq_policy *policy)
 {
 	unsigned int *setspeed = policy->governor_data;
-	unsigned int cpu = policy->cpu;
-	int rc = 0;
 
-	if (event == CPUFREQ_GOV_POLICY_INIT)
-		return cpufreq_userspace_policy_init(policy);
+	BUG_ON(!policy->cur);
+	pr_debug("started managing cpu %u\n", policy->cpu);
 
-	if (!setspeed)
-		return -EINVAL;
-
-	switch (event) {
-	case CPUFREQ_GOV_POLICY_EXIT:
-		mutex_lock(&userspace_mutex);
-		policy->governor_data = NULL;
-		kfree(setspeed);
-		mutex_unlock(&userspace_mutex);
-		break;
-	case CPUFREQ_GOV_START:
-		BUG_ON(!policy->cur);
-		pr_debug("started managing cpu %u\n", cpu);
-
-		mutex_lock(&userspace_mutex);
-		per_cpu(cpu_is_managed, cpu) = 1;
-		*setspeed = policy->cur;
-		mutex_unlock(&userspace_mutex);
-		break;
-	case CPUFREQ_GOV_STOP:
-		pr_debug("managing cpu %u stopped\n", cpu);
-
-		mutex_lock(&userspace_mutex);
-		per_cpu(cpu_is_managed, cpu) = 0;
-		*setspeed = 0;
-		mutex_unlock(&userspace_mutex);
-		break;
-	case CPUFREQ_GOV_LIMITS:
-		mutex_lock(&userspace_mutex);
-		pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz, last set to %u kHz\n",
-			cpu, policy->min, policy->max, policy->cur, *setspeed);
-
-		if (policy->max < *setspeed)
-			__cpufreq_driver_target(policy, policy->max,
-						CPUFREQ_RELATION_H);
-		else if (policy->min > *setspeed)
-			__cpufreq_driver_target(policy, policy->min,
-						CPUFREQ_RELATION_L);
-		else
-			__cpufreq_driver_target(policy, *setspeed,
-						CPUFREQ_RELATION_L);
-		mutex_unlock(&userspace_mutex);
-		break;
-	}
-	return rc;
+	mutex_lock(&userspace_mutex);
+	per_cpu(cpu_is_managed, policy->cpu) = 1;
+	*setspeed = policy->cur;
+	mutex_unlock(&userspace_mutex);
+	return 0;
+}
+
+static void cpufreq_userspace_policy_stop(struct cpufreq_policy *policy)
+{
+	unsigned int *setspeed = policy->governor_data;
+
+	pr_debug("managing cpu %u stopped\n", policy->cpu);
+
+	mutex_lock(&userspace_mutex);
+	per_cpu(cpu_is_managed, policy->cpu) = 0;
+	*setspeed = 0;
+	mutex_unlock(&userspace_mutex);
+}
+
+static void cpufreq_userspace_policy_limits(struct cpufreq_policy *policy)
+{
+	unsigned int *setspeed = policy->governor_data;
+
+	mutex_lock(&userspace_mutex);
+
+	pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz, last set to %u kHz\n",
+		 policy->cpu, policy->min, policy->max, policy->cur, *setspeed);
+
+	if (policy->max < *setspeed)
+		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
+	else if (policy->min > *setspeed)
+		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
+	else
+		__cpufreq_driver_target(policy, *setspeed, CPUFREQ_RELATION_L);
+
+	mutex_unlock(&userspace_mutex);
 }
 
 static struct cpufreq_governor cpufreq_gov_userspace = {
 	.name		= "userspace",
-	.governor	= cpufreq_governor_userspace,
+	.init		= cpufreq_userspace_policy_init,
+	.exit		= cpufreq_userspace_policy_exit,
+	.start		= cpufreq_userspace_policy_start,
+	.stop		= cpufreq_userspace_policy_stop,
+	.limits		= cpufreq_userspace_policy_limits,
 	.store_setspeed	= cpufreq_set,
 	.show_setspeed	= show_speed,
 	.owner		= THIS_MODULE,
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 4e81e08db752..5bcda3e6aa9e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -455,18 +455,14 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div,
 #define MIN_LATENCY_MULTIPLIER		(20)
 #define TRANSITION_LATENCY_LIMIT	(10 * 1000 * 1000)
 
-/* Governor Events */
-#define CPUFREQ_GOV_START	1
-#define CPUFREQ_GOV_STOP	2
-#define CPUFREQ_GOV_LIMITS	3
-#define CPUFREQ_GOV_POLICY_INIT	4
-#define CPUFREQ_GOV_POLICY_EXIT	5
-
 struct cpufreq_governor {
 	char	name[CPUFREQ_NAME_LEN];
 	int	initialized;
-	int	(*governor)	(struct cpufreq_policy *policy,
-				 unsigned int event);
+	int	(*init)(struct cpufreq_policy *policy);
+	void	(*exit)(struct cpufreq_policy *policy);
+	int	(*start)(struct cpufreq_policy *policy);
+	void	(*stop)(struct cpufreq_policy *policy);
+	void	(*limits)(struct cpufreq_policy *policy);
 	ssize_t	(*show_setspeed)	(struct cpufreq_policy *policy,
 					 char *buf);
 	int	(*store_setspeed)	(struct cpufreq_policy *policy,
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 14c4aa25cc45..fdcee3cf38fc 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -394,7 +394,7 @@ static int sugov_init(struct cpufreq_policy *policy)
 	return ret;
 }
 
-static int sugov_exit(struct cpufreq_policy *policy)
+static void sugov_exit(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy = policy->governor_data;
 	struct sugov_tunables *tunables = sg_policy->tunables;
@@ -412,7 +412,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
-	return 0;
 }
 
 static int sugov_start(struct cpufreq_policy *policy)
@@ -444,7 +443,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static int sugov_stop(struct cpufreq_policy *policy)
+static void sugov_stop(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy = policy->governor_data;
 	unsigned int cpu;
@@ -456,10 +455,9 @@ static int sugov_stop(struct cpufreq_policy *policy)
 
 	irq_work_sync(&sg_policy->irq_work);
 	cancel_work_sync(&sg_policy->work);
-	return 0;
 }
 
-static int sugov_limits(struct cpufreq_policy *policy)
+static void sugov_limits(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy = policy->governor_data;
 
@@ -477,32 +475,16 @@ static int sugov_limits(struct cpufreq_policy *policy)
 	}
 
 	sg_policy->need_freq_update = true;
-	return 0;
-}
-
-int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
-{
-	if (event == CPUFREQ_GOV_POLICY_INIT) {
-		return sugov_init(policy);
-	} else if (policy->governor_data) {
-		switch (event) {
-		case CPUFREQ_GOV_POLICY_EXIT:
-			return sugov_exit(policy);
-		case CPUFREQ_GOV_START:
-			return sugov_start(policy);
-		case CPUFREQ_GOV_STOP:
-			return sugov_stop(policy);
-		case CPUFREQ_GOV_LIMITS:
-			return sugov_limits(policy);
-		}
-	}
-	return -EINVAL;
 }
 
 static struct cpufreq_governor schedutil_gov = {
 	.name = "schedutil",
-	.governor = sugov_governor,
 	.owner = THIS_MODULE,
+	.init = sugov_init,
+	.exit = sugov_exit,
+	.start = sugov_start,
+	.stop = sugov_stop,
+	.limits = sugov_limits,
 };
 
 static int __init sugov_module_init(void)
-- 
cgit v1.3-8-gc7d7


From bf2be2de8493dd5f86d6e0f0d4eecb5810ad035b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 18 May 2016 17:55:31 +0530
Subject: cpufreq: governor: Create cpufreq_policy_apply_limits()

Create a new helper to avoid code duplication across governors.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_governor.c | 7 +------
 include/linux/cpufreq.h            | 8 ++++++++
 kernel/sched/cpufreq_schedutil.c   | 9 +--------
 3 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 18eab7f4ba4c..4b88b5bec4ef 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -553,12 +553,7 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy)
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
 
 	mutex_lock(&policy_dbs->timer_mutex);
-
-	if (policy->max < policy->cur)
-		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
-	else if (policy->min > policy->cur)
-		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
-
+	cpufreq_policy_apply_limits(policy);
 	gov_update_sample_delay(policy_dbs, 0);
 
 	mutex_unlock(&policy_dbs->timer_mutex);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 5bcda3e6aa9e..3be54a0b4373 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -489,6 +489,14 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 struct cpufreq_governor *cpufreq_default_governor(void);
 struct cpufreq_governor *cpufreq_fallback_governor(void);
 
+static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy)
+{
+	if (policy->max < policy->cur)
+		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
+	else if (policy->min > policy->cur)
+		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
+}
+
 /* Governor attribute set */
 struct gov_attr_set {
 	struct kobject kobj;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index fdcee3cf38fc..758efd7f3abe 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -463,14 +463,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
 
 	if (!policy->fast_switch_enabled) {
 		mutex_lock(&sg_policy->work_lock);
-
-		if (policy->max < policy->cur)
-			__cpufreq_driver_target(policy, policy->max,
-						CPUFREQ_RELATION_H);
-		else if (policy->min > policy->cur)
-			__cpufreq_driver_target(policy, policy->min,
-						CPUFREQ_RELATION_L);
-
+		cpufreq_policy_apply_limits(policy);
 		mutex_unlock(&sg_policy->work_lock);
 	}
 
-- 
cgit v1.3-8-gc7d7


From 1a99ae3f00d3c7c7885ee529ac9a874b19caa0cf Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Tue, 10 May 2016 21:03:18 +0800
Subject: sched/fair: Fix the wrong throttled clock time for
 cfs_rq_clock_task()

Two minor fixes for cfs_rq_clock_task():

 1) If cfs_rq is currently being throttled, we need to subtract the cfs
    throttled clock time.

 2) Make "throttled_clock_task_time" update SMP unrelated. Now UP cases
    need it as well.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1462885398-14724-1-git-send-email-xlpang@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..1e87bb633d43 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3688,7 +3688,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 {
 	if (unlikely(cfs_rq->throttle_count))
-		return cfs_rq->throttled_clock_task;
+		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
 
 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
 }
@@ -3826,13 +3826,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 	cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
 	if (!cfs_rq->throttle_count) {
 		/* adjust cfs_rq_clock_task() */
 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
 					     cfs_rq->throttled_clock_task;
 	}
-#endif
 
 	return 0;
 }
-- 
cgit v1.3-8-gc7d7


From df55f462b905f3b2d40ec3fb865891382a6ebfb1 Mon Sep 17 00:00:00 2001
From: "Gaurav Jindal (Gaurav Jindal)" <Gaurav.Jindal@spreadtrum.com>
Date: Thu, 12 May 2016 10:13:33 +0000
Subject: sched/idle: Optimize the generic idle loop

Currently, smp_processor_id() is used to fetch the current CPU in
cpu_idle_loop(). Every time the idle thread runs, it fetches the
current CPU using smp_processor_id().

Since the idle thread is per CPU, the current CPU is constant, so we
can lift the load out of the loop, saving execution cycles/time in the
loop.

x86-64:

Before patch (execution in loop):
	148:    0f ae e8                lfence
	14b:    65 8b 04 25 00 00 00 00 mov %gs:0x0,%eax
	152:    00
	153:    89 c0                   mov %eax,%eax
	155:    49 0f a3 04 24          bt %rax,(%r12)

After patch (execution in loop):
	150:    0f ae e8                lfence
	153:    4d 0f a3 34 24          bt %r14,(%r12)

ARM64:

Before patch (execution in loop):
	168:    d5033d9f        dsb     ld
	16c:    b9405661        ldr     w1,[x19,#84]
	170:    1100fc20        add     w0,w1,#0x3f
	174:    6b1f003f        cmp     w1,wzr
	178:    1a81b000        csel    w0,w0,w1,lt
	17c:    130c7000        asr     w0,w0,#6
	180:    937d7c00        sbfiz   x0,x0,#3,#32
	184:    f8606aa0        ldr     x0,[x21,x0]
	188:    9ac12401        lsr     x1,x0,x1
	18c:    36000e61        tbz     w1,#0,358

After patch (execution in loop):
	1a8:    d50339df        dsb     ld
	1ac:    f8776ac0        ldr     x0,[x22,x23]
	ab0:    ea18001f        tst     x0,x24
	1b4:    54000ea0        b.eq    388

Further observance on ARM64 for 4 seconds shows that cpu_idle_loop is
called 8672 times. Shifting the code will save instructions executed
in loop and eventually time as well.

Signed-off-by: Gaurav Jindal <gaurav.jindal@spreadtrum.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sanjeev Yadav <sanjeev.yadav@spreadtrum.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160512101330.GA488@gauravjindalubtnb.del.spreadtrum.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/idle.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index bd12c6c714ec..db4ff7c100b9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -201,6 +201,8 @@ exit_idle:
  */
 static void cpu_idle_loop(void)
 {
+	int cpu = smp_processor_id();
+
 	while (1) {
 		/*
 		 * If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
 			check_pgt_cache();
 			rmb();
 
-			if (cpu_is_offline(smp_processor_id())) {
+			if (cpu_is_offline(cpu)) {
 				cpuhp_report_idle_dead();
 				arch_cpu_idle_dead();
 			}
-- 
cgit v1.3-8-gc7d7


From 150593bf869393d10a79f6bd3df2585ecc20a9bb Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 18 May 2016 19:02:18 +0200
Subject: sched/api: Introduce task_rcu_dereference() and try_get_task_struct()

Generally task_struct is only protected by RCU if it was found on a
RCU protected list (say, for_each_process() or find_task_by_vpid()).

As Kirill pointed out rq->curr isn't protected by RCU, the scheduler
drops the (potentially) last reference without RCU gp, this means
that we need to fix the code which uses foreign_rq->curr under
rcu_read_lock().

Add a new helper which can be used to dereference rq->curr or any
other pointer to task_struct assuming that it should be cleared or
updated before the final put_task_struct(). It returns non-NULL
only if this task can't go away before rcu_read_unlock().

( Also add try_get_task_struct() to make it easier to use this API
  correctly. )

Suggested-by: Kirill Tkhai <ktkhai@parallels.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
[ Updated comments; added try_get_task_struct()]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Link: http://lkml.kernel.org/r/20160518170218.GY3192@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  3 ++
 kernel/exit.c         | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada26345..dee41bf59e6b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
+struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+struct task_struct *try_get_task_struct(struct task_struct **ptask);
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void task_cputime(struct task_struct *t,
 			 cputime_t *utime, cputime_t *stime);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e6e1356e6bb..2fb4d44c51b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -210,6 +210,82 @@ repeat:
 		goto repeat;
 }
 
+/*
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+	struct sighand_struct *sighand;
+	struct task_struct *task;
+
+	/*
+	 * We need to verify that release_task() was not called and thus
+	 * delayed_put_task_struct() can't run and drop the last reference
+	 * before rcu_read_unlock(). We check task->sighand != NULL,
+	 * but we can read the already freed and reused memory.
+	 */
+retry:
+	task = rcu_dereference(*ptask);
+	if (!task)
+		return NULL;
+
+	probe_kernel_address(&task->sighand, sighand);
+
+	/*
+	 * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+	 * was already freed we can not miss the preceding update of this
+	 * pointer.
+	 */
+	smp_rmb();
+	if (unlikely(task != READ_ONCE(*ptask)))
+		goto retry;
+
+	/*
+	 * We've re-checked that "task == *ptask", now we have two different
+	 * cases:
+	 *
+	 * 1. This is actually the same task/task_struct. In this case
+	 *    sighand != NULL tells us it is still alive.
+	 *
+	 * 2. This is another task which got the same memory for task_struct.
+	 *    We can't know this of course, and we can not trust
+	 *    sighand != NULL.
+	 *
+	 *    In this case we actually return a random value, but this is
+	 *    correct.
+	 *
+	 *    If we return NULL - we can pretend that we actually noticed that
+	 *    *ptask was updated when the previous task has exited. Or pretend
+	 *    that probe_slab_address(&sighand) reads NULL.
+	 *
+	 *    If we return the new task (because sighand is not NULL for any
+	 *    reason) - this is fine too. This (new) task can't go away before
+	 *    another gp pass.
+	 *
+	 *    And note: We could even eliminate the false positive if re-read
+	 *    task->sighand once again to avoid the falsely NULL. But this case
+	 *    is very unlikely so we don't care.
+	 */
+	if (!sighand)
+		return NULL;
+
+	return task;
+}
+
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+	struct task_struct *task;
+
+	rcu_read_lock();
+	task = task_rcu_dereference(ptask);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	return task;
+}
+
 /*
  * Determine if a process group is "orphaned", according to the POSIX
  * definition in 2.2.2.52.  Orphaned process groups are not to be affected
-- 
cgit v1.3-8-gc7d7


From bac7857319bcf7fed329a10bb760053e761115c0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 18 May 2016 21:57:33 +0200
Subject: sched/fair: Use task_rcu_dereference()

Simplify task_numa_compare()'s task reference magic by using
task_rcu_dereference().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kirill Tkhai <ktkhai@parallels.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Link: http://lkml.kernel.org/r/20160518195733.GA15914@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 34 ++++------------------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1e87bb633d43..c6dd8bab010c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1305,6 +1305,8 @@ static void task_numa_assign(struct task_numa_env *env,
 {
 	if (env->best_task)
 		put_task_struct(env->best_task);
+	if (p)
+		get_task_struct(p);
 
 	env->best_task = p;
 	env->best_imp = imp;
@@ -1372,31 +1374,11 @@ static void task_numa_compare(struct task_numa_env *env,
 	long imp = env->p->numa_group ? groupimp : taskimp;
 	long moveimp = imp;
 	int dist = env->dist;
-	bool assigned = false;
 
 	rcu_read_lock();
-
-	raw_spin_lock_irq(&dst_rq->lock);
-	cur = dst_rq->curr;
-	/*
-	 * No need to move the exiting task or idle task.
-	 */
-	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+	cur = task_rcu_dereference(&dst_rq->curr);
+	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
 		cur = NULL;
-	else {
-		/*
-		 * The task_struct must be protected here to protect the
-		 * p->numa_faults access in the task_weight since the
-		 * numa_faults could already be freed in the following path:
-		 * finish_task_switch()
-		 *     --> put_task_struct()
-		 *         --> __put_task_struct()
-		 *             --> task_numa_free()
-		 */
-		get_task_struct(cur);
-	}
-
-	raw_spin_unlock_irq(&dst_rq->lock);
 
 	/*
 	 * Because we have preemption enabled we can get migrated around and
@@ -1479,7 +1461,6 @@ balance:
 		 */
 		if (!load_too_imbalanced(src_load, dst_load, env)) {
 			imp = moveimp - 1;
-			put_task_struct(cur);
 			cur = NULL;
 			goto assign;
 		}
@@ -1505,16 +1486,9 @@ balance:
 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
 
 assign:
-	assigned = true;
 	task_numa_assign(env, cur, imp);
 unlock:
 	rcu_read_unlock();
-	/*
-	 * The dst_rq->curr isn't assigned. The protection for task_struct is
-	 * finished.
-	 */
-	if (cur && !assigned)
-		put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
-- 
cgit v1.3-8-gc7d7


From f2fb6bef92514432398a653df1c2f1041d79ac46 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Wed, 23 Mar 2016 11:24:37 -0700
Subject: perf/core: Optimize side-band event delivery

The perf_event_aux() function iterates all PMUs and all events in
their respective per-CPU contexts to find the events to deliver
side-band records to.

For example, the brk test case in lkp triggers many mmap() operations,
which, if we're also running perf, results in many perf_event_aux()
invocations.

If we enable uncore PMU support (even when uncore events are not used),
dozens of uncore PMUs will be iterated, which can significantly
decrease brk_test's throughput.

For example, the brk throughput:

  without uncore PMUs: 2647573 ops_per_sec
  with    uncore PMUs: 1768444 ops_per_sec

... a 33% reduction.

To get at the per-CPU events that need side-band records, this patch
puts these events on a per-CPU list, this avoids iterating the PMUs
and any events that do not need side-band records.

Per task events are unchanged to avoid extra overhead on the context
switch paths.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: Huang, Ying <ying.huang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1458757477-3781-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  6 ++++
 kernel/events/core.c       | 85 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 79 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0e43355c7aad..92e9ce737432 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -517,6 +517,11 @@ struct swevent_hlist {
 struct perf_cgroup;
 struct ring_buffer;
 
+struct pmu_event_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -675,6 +680,7 @@ struct perf_event {
 	int				cgrp_defer_enabled;
 #endif
 
+	struct list_head		sb_list;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 79363f298445..6615c8922ee3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -335,6 +335,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -3665,6 +3666,26 @@ static void free_event_rcu(struct rcu_head *head)
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_del_rcu(&event->sb_list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	detach_sb_event(event);
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -3728,6 +3749,8 @@ static void unaccount_event(struct perf_event *event)
 	}
 
 	unaccount_event_cpu(event, event->cpu);
+
+	unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -5888,13 +5911,25 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
 	rcu_read_unlock();
 }
 
+static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data)
+{
+	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+	struct perf_event *event;
+
+	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		if (event->state < PERF_EVENT_STATE_INACTIVE)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+		output(event, data);
+	}
+}
+
 static void
 perf_event_aux(perf_event_aux_output_cb output, void *data,
 	       struct perf_event_context *task_ctx)
 {
-	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct pmu *pmu;
 	int ctxn;
 
 	/*
@@ -5909,20 +5944,15 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
 	}
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
+	preempt_disable();
+	perf_event_sb_iterate(output, data);
+
+	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
 			perf_event_aux_ctx(ctx, output, data, false);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
+	preempt_enable();
 	rcu_read_unlock();
 }
 
@@ -8615,6 +8645,32 @@ unlock:
 	return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_add_rcu(&event->sb_list, &pel->list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static void account_pmu_sb_event(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+
+	if (event->parent)
+		return;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return;
+
+	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+	    attr->comm || attr->comm_exec ||
+	    attr->task ||
+	    attr->context_switch)
+		attach_sb_event(event);
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -8695,6 +8751,8 @@ static void account_event(struct perf_event *event)
 enabled:
 
 	account_event_cpu(event, event->cpu);
+
+	account_pmu_sb_event(event);
 }
 
 /*
@@ -10203,6 +10261,9 @@ static void __init perf_event_init_all_cpus(void)
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 	}
 }
 
-- 
cgit v1.3-8-gc7d7


From aab5b71ef2b5c62323b9abe397e2db57b18e1f78 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 12 May 2016 17:26:46 +0200
Subject: perf/core: Rename the perf_event_aux*() APIs to perf_event_sb*(), to
 separate them from AUX ring-buffer records

There are now two different things called AUX in perf, the
infrastructure to deliver the mmap/comm/task records and the
AUX part in the mmap buffer (with associated AUX_RECORD).

Since the former is internal, rename it to side-band to reduce
the confusion factor.

No change in functionality.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 69 +++++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6615c8922ee3..f54454ea5f31 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5879,11 +5879,11 @@ perf_event_read_event(struct perf_event *event,
 	perf_output_end(&handle);
 }
 
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
 
 static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
-		   perf_event_aux_output_cb output,
+perf_iterate_ctx(struct perf_event_context *ctx,
+		   perf_iterate_f output,
 		   void *data, bool all)
 {
 	struct perf_event *event;
@@ -5900,18 +5900,7 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
 	}
 }
 
-static void
-perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
-			struct perf_event_context *task_ctx)
-{
-	rcu_read_lock();
-	preempt_disable();
-	perf_event_aux_ctx(task_ctx, output, data, false);
-	preempt_enable();
-	rcu_read_unlock();
-}
-
-static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data)
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 {
 	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
 	struct perf_event *event;
@@ -5925,33 +5914,40 @@ static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data)
 	}
 }
 
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers; ensure that account_pmu_sb_event() includes
+ * your event, otherwise it might not get delivered.
+ */
 static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
+perf_iterate_sb(perf_iterate_f output, void *data,
 	       struct perf_event_context *task_ctx)
 {
 	struct perf_event_context *ctx;
 	int ctxn;
 
+	rcu_read_lock();
+	preempt_disable();
+
 	/*
-	 * If we have task_ctx != NULL we only notify
-	 * the task context itself. The task_ctx is set
-	 * only for EXIT events before releasing task
+	 * If we have task_ctx != NULL we only notify the task context itself.
+	 * The task_ctx is set only for EXIT events before releasing task
 	 * context.
 	 */
 	if (task_ctx) {
-		perf_event_aux_task_ctx(output, data, task_ctx);
-		return;
+		perf_iterate_ctx(task_ctx, output, data, false);
+		goto done;
 	}
 
-	rcu_read_lock();
-	preempt_disable();
-	perf_event_sb_iterate(output, data);
+	perf_iterate_sb_cpu(output, data);
 
 	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
-			perf_event_aux_ctx(ctx, output, data, false);
+			perf_iterate_ctx(ctx, output, data, false);
 	}
+done:
 	preempt_enable();
 	rcu_read_unlock();
 }
@@ -6001,7 +5997,7 @@ void perf_event_exec(void)
 
 		perf_event_enable_on_exec(ctxn);
 
-		perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+		perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
 				   true);
 	}
 	rcu_read_unlock();
@@ -6045,9 +6041,9 @@ static int __perf_pmu_output_stop(void *info)
 	};
 
 	rcu_read_lock();
-	perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+	perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
 	if (cpuctx->task_ctx)
-		perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+		perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
 				   &ro, false);
 	rcu_read_unlock();
 
@@ -6176,7 +6172,7 @@ static void perf_event_task(struct task_struct *task,
 		},
 	};
 
-	perf_event_aux(perf_event_task_output,
+	perf_iterate_sb(perf_event_task_output,
 		       &task_event,
 		       task_ctx);
 }
@@ -6255,7 +6251,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
-	perf_event_aux(perf_event_comm_output,
+	perf_iterate_sb(perf_event_comm_output,
 		       comm_event,
 		       NULL);
 }
@@ -6486,7 +6482,7 @@ got_name:
 
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
-	perf_event_aux(perf_event_mmap_output,
+	perf_iterate_sb(perf_event_mmap_output,
 		       mmap_event,
 		       NULL);
 
@@ -6569,7 +6565,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 		if (!ctx)
 			continue;
 
-		perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+		perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
 	}
 	rcu_read_unlock();
 }
@@ -6756,7 +6752,7 @@ static void perf_event_switch(struct task_struct *task,
 		},
 	};
 
-	perf_event_aux(perf_event_switch_output,
+	perf_iterate_sb(perf_event_switch_output,
 		       &switch_event,
 		       NULL);
 }
@@ -8654,6 +8650,13 @@ static void attach_sb_event(struct perf_event *event)
 	raw_spin_unlock(&pel->lock);
 }
 
+/*
+ * We keep a list of all !task (and therefore per-cpu) events
+ * that need to receive side-band records.
+ *
+ * This avoids having to scan all the various PMU per-cpu contexts
+ * looking for them.
+ */
 static void account_pmu_sb_event(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
-- 
cgit v1.3-8-gc7d7


From ab7fdefba68f66c8523571c3b3a940635d781824 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Tue, 3 May 2016 00:26:06 -0700
Subject: perf/core: Fix implicitly enable dynamic interrupt throttle

This patch fixes an issue which was introduced by commit:

  91a612eea9a3 ("perf/core: Fix dynamic interrupt throttle")

... which commit unconditionally sets the perf_sample_allowed_ns value
to !0. But that could trigger a bug in the following corner case:

The user can disable the dynamic interrupt throttle mechanism by setting
perf_cpu_time_max_percent to 0. Then they change perf_event_max_sample_rate.
For this case, the mechanism will be enabled implicitly, because
perf_sample_allowed_ns becomes !0 - which is not what we want.

This patch only updates perf_sample_allowed_ns when the dynamic
interrupt throttle mechanism is enabled.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Link: http://lkml.kernel.org/r/1462260366-3160-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f54454ea5f31..f94f164b5054 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -397,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 	if (ret || !write)
 		return ret;
 
+	/*
+	 * If throttling is disabled don't allow the write:
+	 */
+	if (sysctl_perf_cpu_time_max_percent == 100 ||
+	    sysctl_perf_cpu_time_max_percent == 0)
+		return -EINVAL;
+
 	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 	update_perf_cpu_limits();
-- 
cgit v1.3-8-gc7d7


From a1396555abff9ff9b74c2e4da13e27e81fd094b2 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <Vineet.Gupta1@synopsys.com>
Date: Mon, 9 May 2016 15:07:40 +0530
Subject: perf/abi: Change the errno for sampling event not supported in
 hardware

Change the return code for sampling event not supported from -ENOTSUPP
to -EOPNOTSUPP.

This allows userspace to identify this case specifically, instead of
printing the catch-all error message it did previously.

Technically this is an ABI change, but we think we can get away
with it.

Old behavior:
 -------
 | # perf record ls
 | Error:
 | The sys_perf_event_open() syscall returned with 524 (Unknown error 524)
 | for event (cycles:ppp).
 | /bin/dmesg may provide additional information.
 | No CONFIG_PERF_EVENTS=y kernel support configured?

New behavior:
 -------
 | # perf record ls
 | Error:
 | PMU Hardware doesn't support sampling/overflow-interrupts.

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <acme@redhat.com>
Cc: <linux-snps-arc@lists.infradead.org>
Cc: <vincent.weaver@maine.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Vineet Gupta <Vineet.Gupta1@synopsys.com>
Link: http://lkml.kernel.org/r/1462786660-2900-3-git-send-email-vgupta@synopsys.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f94f164b5054..5d48306879d5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9309,7 +9309,7 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
-			err = -ENOTSUPP;
+			err = -EOPNOTSUPP;
 			goto err_alloc;
 		}
 	}
-- 
cgit v1.3-8-gc7d7


From 133e89ef5ef338e1358b16246521ba17d935c396 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Fri, 13 May 2016 11:56:26 -0700
Subject: locking/rwsem: Enable lockless waiter wakeup(s)

As wake_qs gain users, we can teach rwsems about them such that
waiters can be awoken without the wait_lock. This is for both
readers and writer, the former being the most ideal candidate
as we can batch the wakeups shortening the critical region that
much more -- ie writer task blocking a bunch of tasks waiting to
service page-faults (mmap_sem readers).

In general applying wake_qs to rwsem (xadd) is not difficult as
the wait_lock is intended to be released soon _anyways_, with
the exception of when a writer slowpath will proactively wakeup
any queued readers if it sees that the lock is owned by a reader,
in which we simply do the wakeups with the lock held (see comment
in __rwsem_down_write_failed_common()).

Similar to other locking primitives, delaying the waiter being
awoken does allow, at least in theory, the lock to be stolen in
the case of writers, however no harm was seen in this (in fact
lock stealing tends to be a _good_ thing in most workloads), and
this is a tiny window anyways.

Some page-fault (pft) and mmap_sem intensive benchmarks show some
pretty constant reduction in systime (by up to ~8 and ~10%) on a
2-socket, 12 core AMD box. In addition, on an 8-core Westmere doing
page allocations (page_test)

aim9:
	 4.6-rc6				4.6-rc6
						rwsemv2
Min      page_test   378167.89 (  0.00%)   382613.33 (  1.18%)
Min      exec_test      499.00 (  0.00%)      502.67 (  0.74%)
Min      fork_test     3395.47 (  0.00%)     3537.64 (  4.19%)
Hmean    page_test   395433.06 (  0.00%)   414693.68 (  4.87%)
Hmean    exec_test      499.67 (  0.00%)      505.30 (  1.13%)
Hmean    fork_test     3504.22 (  0.00%)     3594.95 (  2.59%)
Stddev   page_test    17426.57 (  0.00%)    26649.92 (-52.93%)
Stddev   exec_test        0.47 (  0.00%)        1.41 (-199.05%)
Stddev   fork_test       63.74 (  0.00%)       32.59 ( 48.86%)
Max      page_test   429873.33 (  0.00%)   456960.00 (  6.30%)
Max      exec_test      500.33 (  0.00%)      507.66 (  1.47%)
Max      fork_test     3653.33 (  0.00%)     3650.90 ( -0.07%)

	     4.6-rc6     4.6-rc6
			 rwsemv2
User            1.12        0.04
System          0.23        0.04
Elapsed       727.27      721.98

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman.Long@hpe.com
Cc: dave@stgolabs.net
Cc: jason.low2@hp.com
Cc: peter@hurleysoftware.com
Link: http://lkml.kernel.org/r/1463165787-25937-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 58 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09e30c6225e5..80b05ac0f015 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -114,12 +114,16 @@ enum rwsem_wake_type {
  *   - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
  *   - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
  * - there must be someone on the queue
- * - the spinlock must be held by the caller
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ *   to actually wakeup the blocked task(s) and drop the reference count,
+ *   preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if downgrading is false
+ * - writers are only marked woken if downgrading is false
  */
 static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
+__rwsem_mark_wake(struct rw_semaphore *sem,
+		  enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
 {
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
@@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
 	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
-		if (wake_type == RWSEM_WAKE_ANY)
-			/* Wake writer at the front of the queue, but do not
-			 * grant it the lock yet as we want other writers
-			 * to be able to steal it.  Readers, on the other hand,
-			 * will block as they will notice the queued writer.
+		if (wake_type == RWSEM_WAKE_ANY) {
+			/*
+			 * Mark writer at the front of the queue for wakeup.
+			 * Until the task is actually later awoken later by
+			 * the caller, other writers are able to steal it.
+			 * Readers, on the other hand, will block as they
+			 * will notice the queued writer.
 			 */
-			wake_up_process(waiter->task);
+			wake_q_add(wake_q, waiter->task);
+		}
 		goto out;
 	}
 
@@ -196,7 +203,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 		 */
 		smp_mb();
 		waiter->task = NULL;
-		wake_up_process(tsk);
+		wake_q_add(wake_q, tsk);
 		put_task_struct(tsk);
 	} while (--loop);
 
@@ -216,6 +223,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
 	struct rwsem_waiter waiter;
 	struct task_struct *tsk = current;
+	WAKE_Q(wake_q);
 
 	/* set up my own style of waitqueue */
 	waiter.task = tsk;
@@ -238,9 +246,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 	if (count == RWSEM_WAITING_BIAS ||
 	    (count > RWSEM_WAITING_BIAS &&
 	     adjustment != -RWSEM_ACTIVE_READ_BIAS))
-		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+		sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 
 	raw_spin_unlock_irq(&sem->wait_lock);
+	wake_up_q(&wake_q);
 
 	/* wait to be given the lock */
 	while (true) {
@@ -440,6 +449,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 	bool waiting = true; /* any queued threads before us */
 	struct rwsem_waiter waiter;
 	struct rw_semaphore *ret = sem;
+	WAKE_Q(wake_q);
 
 	/* undo write bias from down_write operation, stop active locking */
 	count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
@@ -472,8 +482,19 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 		 * no active writers, the lock must be read owned; so we try to
 		 * wake any read locks that were queued ahead of us.
 		 */
-		if (count > RWSEM_WAITING_BIAS)
-			sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+		if (count > RWSEM_WAITING_BIAS) {
+			WAKE_Q(wake_q);
+
+			sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+			/*
+			 * The wakeup is normally called _after_ the wait_lock
+			 * is released, but given that we are proactively waking
+			 * readers we can deal with the wake_q overhead as it is
+			 * similar to releasing and taking the wait_lock again
+			 * for attempting rwsem_try_write_lock().
+			 */
+			wake_up_q(&wake_q);
+		}
 
 	} else
 		count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -509,8 +530,9 @@ out_nolock:
 	if (list_empty(&sem->wait_list))
 		rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
 	else
-		__rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
+	wake_up_q(&wake_q);
 
 	return ERR_PTR(-EINTR);
 }
@@ -537,6 +559,7 @@ __visible
 struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
+	WAKE_Q(wake_q);
 
 	/*
 	 * If a spinner is present, it is not necessary to do the wakeup.
@@ -573,9 +596,10 @@ locked:
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
-		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+		sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+	wake_up_q(&wake_q);
 
 	return sem;
 }
@@ -590,14 +614,16 @@ __visible
 struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
+	WAKE_Q(wake_q);
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
-		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+		sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+	wake_up_q(&wake_q);
 
 	return sem;
 }
-- 
cgit v1.3-8-gc7d7


From e38513905eeaae59056eac2c9ac55a43b1fc41b2 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Fri, 13 May 2016 11:56:27 -0700
Subject: locking/rwsem: Rework zeroing reader waiter->task

Readers that are awoken will expect a nil ->task indicating
that a wakeup has occurred. Because of the way readers are
implemented, there's a small chance that the waiter will never
block in the slowpath (rwsem_down_read_failed), and therefore
requires some form of reference counting to avoid the following
scenario:

rwsem_down_read_failed()		rwsem_wake()
  get_task_struct();
  spin_lock_irq(&wait_lock);
  list_add_tail(&waiter.list)
  spin_unlock_irq(&wait_lock);
					  raw_spin_lock_irqsave(&wait_lock)
					  __rwsem_do_wake()
  while (1) {
    set_task_state(TASK_UNINTERRUPTIBLE);
					    waiter->task = NULL
    if (!waiter.task) // true
      break;
    schedule() // never reached

   __set_task_state(TASK_RUNNING);
 do_exit();
					    wake_up_process(tsk); // boom

... and therefore race with do_exit() when the caller returns.

There is also a mismatch between the smp_mb() and its documentation,
in that the serialization is done between reading the task and the
nil store. Furthermore, in addition to having the overlapping of
loads and stores to waiter->task guaranteed to be ordered within
that CPU, both wake_up_process() originally and now wake_q_add()
already imply barriers upon successful calls, which serves the
comment.

Now, as an alternative to perhaps inverting the checks in the blocker
side (which has its own penalty in that schedule is unavoidable),
with lockless wakeups this situation is naturally addressed and we
can just use the refcount held by wake_q_add(), instead doing so
explicitly. Of course, we must guarantee that the nil store is done
as the _last_ operation in that the task must already be marked for
deletion to not fall into the race above. Spurious wakeups are also
handled transparently in that the task's reference is only removed
when wake_up_q() is actually called _after_ the nil store.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman.Long@hpe.com
Cc: dave@stgolabs.net
Cc: jason.low2@hp.com
Cc: peter@hurleysoftware.com
Link: http://lkml.kernel.org/r/1463165787-25937-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 80b05ac0f015..fcbf75ac3dcb 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -194,17 +194,15 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
 		waiter = list_entry(next, struct rwsem_waiter, list);
 		next = waiter->list.next;
 		tsk = waiter->task;
+
+		wake_q_add(wake_q, tsk);
 		/*
-		 * Make sure we do not wakeup the next reader before
-		 * setting the nil condition to grant the next reader;
-		 * otherwise we could miss the wakeup on the other
-		 * side and end up sleeping again. See the pairing
-		 * in rwsem_down_read_failed().
+		 * Ensure that the last operation is setting the reader
+		 * waiter to nil such that rwsem_down_read_failed() cannot
+		 * race with do_exit() by always holding a reference count
+		 * to the task to wakeup.
 		 */
-		smp_mb();
-		waiter->task = NULL;
-		wake_q_add(wake_q, tsk);
-		put_task_struct(tsk);
+		smp_store_release(&waiter->task, NULL);
 	} while (--loop);
 
 	sem->wait_list.next = next;
@@ -228,7 +226,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 	/* set up my own style of waitqueue */
 	waiter.task = tsk;
 	waiter.type = RWSEM_WAITING_FOR_READ;
-	get_task_struct(tsk);
 
 	raw_spin_lock_irq(&sem->wait_lock);
 	if (list_empty(&sem->wait_list))
-- 
cgit v1.3-8-gc7d7


From c0fcb6c2d332041256dc55d8a1ec3c0a2d0befb8 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hpe.com>
Date: Mon, 16 May 2016 17:38:00 -0700
Subject: locking/rwsem: Optimize write lock by reducing operations in slowpath

When acquiring the rwsem write lock in the slowpath, we first try
to set count to RWSEM_WAITING_BIAS. When that is successful,
we then atomically add the RWSEM_WAITING_BIAS in cases where
there are other tasks on the wait list. This causes write lock
operations to often issue multiple atomic operations.

We can instead make the list_is_singular() check first, and then
set the count accordingly, so that we issue at most 1 atomic
operation when acquiring the write lock and reduce unnecessary
cacheline contention.

Signed-off-by: Jason Low <jason.low2@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Waiman Long<Waiman.Long@hpe.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Jason Low <jason.low2@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Terry Rudd <terry.rudd@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1463445486-16078-2-git-send-email-jason.low2@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index fcbf75ac3dcb..b957da7fcb19 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -261,17 +261,28 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 }
 EXPORT_SYMBOL(rwsem_down_read_failed);
 
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ */
 static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 {
 	/*
-	 * Try acquiring the write lock. Check count first in order
-	 * to reduce unnecessary expensive cmpxchg() operations.
+	 * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
 	 */
-	if (count == RWSEM_WAITING_BIAS &&
-	    cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
-		    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
-		if (!list_is_singular(&sem->wait_list))
-			rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+	if (count != RWSEM_WAITING_BIAS)
+		return false;
+
+	/*
+	 * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
+	 * are other tasks on the wait list, we need to add on WAITING_BIAS.
+	 */
+	count = list_is_singular(&sem->wait_list) ?
+			RWSEM_ACTIVE_WRITE_BIAS :
+			RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
+
+	if (cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) {
 		rwsem_set_owner(sem);
 		return true;
 	}
-- 
cgit v1.3-8-gc7d7


From 6e2814745c67ab422b86262b05e6f23a56f28aa3 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hpe.com>
Date: Fri, 20 May 2016 15:19:36 -0700
Subject: locking/mutex: Set and clear owner using WRITE_ONCE()

The mutex owner can get read and written to locklessly.
Use WRITE_ONCE when setting and clearing the owner field
in order to avoid optimizations such as store tearing. This
avoids situations where the owner field gets written to with
multiple stores and another thread could concurrently read
and use a partially written owner value.

Signed-off-by: Jason Low <jason.low2@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Acked-by: Waiman Long <Waiman.Long@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Terry Rudd <terry.rudd@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1463782776.2479.9.camel@j-VirtualBox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/mutex-debug.h |  4 ++--
 kernel/locking/mutex.h       | 10 ++++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..372e6530180d 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
 
 static inline void mutex_set_owner(struct mutex *lock)
 {
-	lock->owner = current;
+	WRITE_ONCE(lock->owner, current);
 }
 
 static inline void mutex_clear_owner(struct mutex *lock)
 {
-	lock->owner = NULL;
+	WRITE_ONCE(lock->owner, NULL);
 }
 
 #define spin_lock_mutex(lock, flags)			\
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 5cda397607f2..12f96199441c 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -17,14 +17,20 @@
 		__list_del((waiter)->list.prev, (waiter)->list.next)
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * The mutex owner can get read and written to locklessly.
+ * We should use WRITE_ONCE when writing the owner value to
+ * avoid store tearing, otherwise, a thread could potentially
+ * read a partially written and incomplete owner value.
+ */
 static inline void mutex_set_owner(struct mutex *lock)
 {
-	lock->owner = current;
+	WRITE_ONCE(lock->owner, current);
 }
 
 static inline void mutex_clear_owner(struct mutex *lock)
 {
-	lock->owner = NULL;
+	WRITE_ONCE(lock->owner, NULL);
 }
 #else
 static inline void mutex_set_owner(struct mutex *lock)
-- 
cgit v1.3-8-gc7d7


From 98f368e9e2630a3ce3e80fb10fb2e02038cf9578 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Thu, 2 Jun 2016 23:43:21 -0500
Subject: kernel: Add noaudit variant of ns_capable()

When checking the current cred for a capability in a specific user
namespace, it isn't always desirable to have the LSMs audit the check.
This patch adds a noaudit variant of ns_capable() for when those
situations arise.

The common logic between ns_capable() and the new ns_capable_noaudit()
is moved into a single, shared function to keep duplicated code to a
minimum and ease maintainability.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/capability.h |  5 +++++
 kernel/capability.c        | 46 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 41 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 00690ff92edf..5f3c63dde2d5 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -206,6 +206,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
 				      struct user_namespace *ns, int cap);
 extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
+extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
 #else
 static inline bool has_capability(struct task_struct *t, int cap)
 {
@@ -233,6 +234,10 @@ static inline bool ns_capable(struct user_namespace *ns, int cap)
 {
 	return true;
 }
+static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
+{
+	return true;
+}
 #endif /* CONFIG_MULTIUSER */
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b54d5c6..00411c82dac5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
 	return has_ns_capability_noaudit(t, &init_user_ns, cap);
 }
 
+static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+{
+	int capable;
+
+	if (unlikely(!cap_valid(cap))) {
+		pr_crit("capable() called with invalid cap=%u\n", cap);
+		BUG();
+	}
+
+	capable = audit ? security_capable(current_cred(), ns, cap) :
+			  security_capable_noaudit(current_cred(), ns, cap);
+	if (capable == 0) {
+		current->flags |= PF_SUPERPRIV;
+		return true;
+	}
+	return false;
+}
+
 /**
  * ns_capable - Determine if the current task has a superior capability in effect
  * @ns:  The usernamespace we want the capability in
@@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
  */
 bool ns_capable(struct user_namespace *ns, int cap)
 {
-	if (unlikely(!cap_valid(cap))) {
-		pr_crit("capable() called with invalid cap=%u\n", cap);
-		BUG();
-	}
-
-	if (security_capable(current_cred(), ns, cap) == 0) {
-		current->flags |= PF_SUPERPRIV;
-		return true;
-	}
-	return false;
+	return ns_capable_common(ns, cap, true);
 }
 EXPORT_SYMBOL(ns_capable);
 
+/**
+ * ns_capable_noaudit - Determine if the current task has a superior capability
+ * (unaudited) in effect
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_noaudit(struct user_namespace *ns, int cap)
+{
+	return ns_capable_common(ns, cap, false);
+}
+EXPORT_SYMBOL(ns_capable_noaudit);
 
 /**
  * capable - Determine if the current task has a superior capability in effect
-- 
cgit v1.3-8-gc7d7


From 4e49ea4a3d276365bf7396c9b77b4d1d5923835a Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:31:41 -0500
Subject: block/fs/drivers: remove rw argument from submit_bio

This has callers of submit_bio/submit_bio_wait set the bio->bi_rw
instead of passing it in. This makes that use the same as
generic_make_request and how we set the other bio fields.

Signed-off-by: Mike Christie <mchristi@redhat.com>

Fixed up fs/ext4/crypto.c

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c                         |  7 +++----
 block/blk-core.c                    | 11 ++++-------
 block/blk-flush.c                   |  3 ++-
 block/blk-lib.c                     | 20 +++++++++++---------
 drivers/block/drbd/drbd_actlog.c    |  2 +-
 drivers/block/drbd/drbd_bitmap.c    |  4 ++--
 drivers/block/floppy.c              |  3 ++-
 drivers/block/xen-blkback/blkback.c |  4 +++-
 drivers/block/xen-blkfront.c        |  4 ++--
 drivers/md/bcache/debug.c           |  6 ++++--
 drivers/md/bcache/journal.c         |  2 +-
 drivers/md/bcache/super.c           |  4 ++--
 drivers/md/dm-bufio.c               |  3 ++-
 drivers/md/dm-io.c                  |  3 ++-
 drivers/md/dm-log-writes.c          |  9 ++++++---
 drivers/md/dm-thin.c                |  3 ++-
 drivers/md/md.c                     | 10 +++++++---
 drivers/md/raid1.c                  |  3 ++-
 drivers/md/raid10.c                 |  4 +++-
 drivers/md/raid5-cache.c            |  7 ++++---
 drivers/target/target_core_iblock.c | 24 +++++++++++++-----------
 fs/btrfs/check-integrity.c          | 18 ++++++++++--------
 fs/btrfs/check-integrity.h          |  4 ++--
 fs/btrfs/disk-io.c                  |  3 ++-
 fs/btrfs/extent_io.c                |  7 ++++---
 fs/btrfs/raid56.c                   | 17 ++++++++++++-----
 fs/btrfs/scrub.c                    | 15 ++++++++++-----
 fs/btrfs/volumes.c                  | 14 +++++++-------
 fs/buffer.c                         |  3 ++-
 fs/crypto/crypto.c                  |  3 ++-
 fs/direct-io.c                      |  3 ++-
 fs/ext4/crypto.c                    |  3 ++-
 fs/ext4/page-io.c                   |  3 ++-
 fs/ext4/readpage.c                  |  9 +++++----
 fs/f2fs/data.c                      |  4 +++-
 fs/f2fs/segment.c                   |  6 ++++--
 fs/gfs2/lops.c                      |  3 ++-
 fs/gfs2/meta_io.c                   |  3 ++-
 fs/gfs2/ops_fstype.c                |  3 ++-
 fs/hfsplus/wrapper.c                |  3 ++-
 fs/jfs/jfs_logmgr.c                 |  6 ++++--
 fs/jfs/jfs_metapage.c               | 10 ++++++----
 fs/logfs/dev_bdev.c                 | 15 ++++++++++-----
 fs/mpage.c                          |  3 ++-
 fs/nfs/blocklayout/blocklayout.c    | 22 ++++++++++++----------
 fs/nilfs2/segbuf.c                  |  3 ++-
 fs/ocfs2/cluster/heartbeat.c        | 12 +++++++-----
 fs/xfs/xfs_aops.c                   | 15 ++++++++++-----
 fs/xfs/xfs_buf.c                    |  4 ++--
 include/linux/bio.h                 |  2 +-
 include/linux/fs.h                  |  2 +-
 kernel/power/swap.c                 |  5 +++--
 mm/page_io.c                        | 10 ++++++----
 53 files changed, 221 insertions(+), 148 deletions(-)

(limited to 'kernel')

diff --git a/block/bio.c b/block/bio.c
index 0e4aa42bc30d..fc779eba0b95 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -854,21 +854,20 @@ static void submit_bio_wait_endio(struct bio *bio)
 
 /**
  * submit_bio_wait - submit a bio, and wait until it completes
- * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
  * @bio: The &struct bio which describes the I/O
  *
  * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
  * bio_endio() on failure.
  */
-int submit_bio_wait(int rw, struct bio *bio)
+int submit_bio_wait(struct bio *bio)
 {
 	struct submit_bio_ret ret;
 
-	rw |= REQ_SYNC;
 	init_completion(&ret.event);
 	bio->bi_private = &ret;
 	bio->bi_end_io = submit_bio_wait_endio;
-	submit_bio(rw, bio);
+	bio->bi_rw |= REQ_SYNC;
+	submit_bio(bio);
 	wait_for_completion_io(&ret.event);
 
 	return ret.error;
diff --git a/block/blk-core.c b/block/blk-core.c
index 2475b1c72773..e95340789592 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2094,7 +2094,6 @@ EXPORT_SYMBOL(generic_make_request);
 
 /**
  * submit_bio - submit a bio to the block device layer for I/O
- * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
  * @bio: The &struct bio which describes the I/O
  *
  * submit_bio() is very similar in purpose to generic_make_request(), and
@@ -2102,10 +2101,8 @@ EXPORT_SYMBOL(generic_make_request);
  * interfaces; @bio must be presetup and ready for I/O.
  *
  */
-blk_qc_t submit_bio(int rw, struct bio *bio)
+blk_qc_t submit_bio(struct bio *bio)
 {
-	bio->bi_rw |= rw;
-
 	/*
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
@@ -2113,12 +2110,12 @@ blk_qc_t submit_bio(int rw, struct bio *bio)
 	if (bio_has_data(bio)) {
 		unsigned int count;
 
-		if (unlikely(rw & REQ_WRITE_SAME))
+		if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
 			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
 		else
 			count = bio_sectors(bio);
 
-		if (rw & WRITE) {
+		if (bio->bi_rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
 		} else {
 			task_io_account_read(bio->bi_iter.bi_size);
@@ -2129,7 +2126,7 @@ blk_qc_t submit_bio(int rw, struct bio *bio)
 			char b[BDEVNAME_SIZE];
 			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
 			current->comm, task_pid_nr(current),
-				(rw & WRITE) ? "WRITE" : "READ",
+				(bio->bi_rw & WRITE) ? "WRITE" : "READ",
 				(unsigned long long)bio->bi_iter.bi_sector,
 				bdevname(bio->bi_bdev, b),
 				count);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b1c91d229e5e..3af4a5ad46f5 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -485,8 +485,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 
 	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_bdev = bdev;
+	bio->bi_rw = WRITE_FLUSH;
 
-	ret = submit_bio_wait(WRITE_FLUSH, bio);
+	ret = submit_bio_wait(bio);
 
 	/*
 	 * The driver must store the error location in ->bi_sector, if
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 23d7f301a196..1f6dec5e0975 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,14 +9,14 @@
 
 #include "blk.h"
 
-static struct bio *next_bio(struct bio *bio, int rw, unsigned int nr_pages,
+static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
 		gfp_t gfp)
 {
 	struct bio *new = bio_alloc(gfp, nr_pages);
 
 	if (bio) {
 		bio_chain(bio, new);
-		submit_bio(rw, bio);
+		submit_bio(bio);
 	}
 
 	return new;
@@ -62,9 +62,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			req_sects = end_sect - sector;
 		}
 
-		bio = next_bio(bio, type, 1, gfp_mask);
+		bio = next_bio(bio, 1, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev = bdev;
+		bio->bi_rw = type;
 
 		bio->bi_iter.bi_size = req_sects << 9;
 		nr_sects -= req_sects;
@@ -110,7 +111,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, type,
 			&bio);
 	if (!ret && bio) {
-		ret = submit_bio_wait(type, bio);
+		ret = submit_bio_wait(bio);
 		if (ret == -EOPNOTSUPP)
 			ret = 0;
 	}
@@ -147,13 +148,14 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	max_write_same_sectors = UINT_MAX >> 9;
 
 	while (nr_sects) {
-		bio = next_bio(bio, REQ_WRITE | REQ_WRITE_SAME, 1, gfp_mask);
+		bio = next_bio(bio, 1, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev = bdev;
 		bio->bi_vcnt = 1;
 		bio->bi_io_vec->bv_page = page;
 		bio->bi_io_vec->bv_offset = 0;
 		bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
+		bio->bi_rw = REQ_WRITE | REQ_WRITE_SAME;
 
 		if (nr_sects > max_write_same_sectors) {
 			bio->bi_iter.bi_size = max_write_same_sectors << 9;
@@ -166,7 +168,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	}
 
 	if (bio)
-		ret = submit_bio_wait(REQ_WRITE | REQ_WRITE_SAME, bio);
+		ret = submit_bio_wait(bio);
 	return ret != -EOPNOTSUPP ? ret : 0;
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
@@ -190,11 +192,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	unsigned int sz;
 
 	while (nr_sects != 0) {
-		bio = next_bio(bio, WRITE,
-				min(nr_sects, (sector_t)BIO_MAX_PAGES),
+		bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES),
 				gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev   = bdev;
+		bio->bi_rw = REQ_WRITE;
 
 		while (nr_sects != 0) {
 			sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
@@ -207,7 +209,7 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	}
 
 	if (bio)
-		return submit_bio_wait(WRITE, bio);
+		return submit_bio_wait(bio);
 	return 0;
 }
 
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 10459a145062..6069e152c32f 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -177,7 +177,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_io_error(bio);
 	else
-		submit_bio(rw, bio);
+		submit_bio(bio);
 	wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
 	if (!bio->bi_error)
 		err = device->md_io.error;
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 92d6fc020a65..e8959fee1fa3 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1011,12 +1011,12 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
 	bio_add_page(bio, page, len, 0);
 	bio->bi_private = ctx;
 	bio->bi_end_io = drbd_bm_endio;
+	bio->bi_rw = rw;
 
 	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
-		bio->bi_rw |= rw;
 		bio_io_error(bio);
 	} else {
-		submit_bio(rw, bio);
+		submit_bio(bio);
 		/* this should not count as user activity and cause the
 		 * resync to throttle -- see drbd_rs_should_slow_down(). */
 		atomic_add(len >> 9, &device->rs_sect_ev);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 84708a5f8c52..73ded25b82c4 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3822,8 +3822,9 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
 	bio.bi_flags |= (1 << BIO_QUIET);
 	bio.bi_private = &cbdata;
 	bio.bi_end_io = floppy_rb0_cb;
+	bio.bi_rw = READ;
 
-	submit_bio(READ, &bio);
+	submit_bio(&bio);
 	process_fd_request();
 
 	init_completion(&cbdata.complete);
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 4809c1501d7e..79fe4934f18b 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1369,6 +1369,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 			bio->bi_private = pending_req;
 			bio->bi_end_io  = end_block_io_op;
 			bio->bi_iter.bi_sector  = preq.sector_number;
+			bio->bi_rw	= operation;
 		}
 
 		preq.sector_number += seg[i].nsec;
@@ -1386,13 +1387,14 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 		bio->bi_bdev    = preq.bdev;
 		bio->bi_private = pending_req;
 		bio->bi_end_io  = end_block_io_op;
+		bio->bi_rw	= operation;
 	}
 
 	atomic_set(&pending_req->pendcnt, nbio);
 	blk_start_plug(&plug);
 
 	for (i = 0; i < nbio; i++)
-		submit_bio(operation, biolist[i]);
+		submit_bio(biolist[i]);
 
 	/* Let the I/Os go.. */
 	blk_finish_plug(&plug);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index ca13df854639..52963a26660a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2114,7 +2114,7 @@ static int blkif_recover(struct blkfront_info *info)
 				bio_trim(cloned_bio, offset, size);
 				cloned_bio->bi_private = split_bio;
 				cloned_bio->bi_end_io = split_bio_end;
-				submit_bio(cloned_bio->bi_rw, cloned_bio);
+				submit_bio(cloned_bio);
 			}
 			/*
 			 * Now we have to wait for all those smaller bios to
@@ -2123,7 +2123,7 @@ static int blkif_recover(struct blkfront_info *info)
 			continue;
 		}
 		/* We don't need to split this bio */
-		submit_bio(bio->bi_rw, bio);
+		submit_bio(bio);
 	}
 
 	return 0;
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8b1f1d5c1819..52b6bcf6b6c8 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -52,9 +52,10 @@ void bch_btree_verify(struct btree *b)
 	bio->bi_bdev		= PTR_CACHE(b->c, &b->key, 0)->bdev;
 	bio->bi_iter.bi_sector	= PTR_OFFSET(&b->key, 0);
 	bio->bi_iter.bi_size	= KEY_SIZE(&v->key) << 9;
+	bio->bi_rw		= REQ_META|READ_SYNC;
 	bch_bio_map(bio, sorted);
 
-	submit_bio_wait(REQ_META|READ_SYNC, bio);
+	submit_bio_wait(bio);
 	bch_bbio_free(bio, b->c);
 
 	memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
@@ -113,11 +114,12 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 	check = bio_clone(bio, GFP_NOIO);
 	if (!check)
 		return;
+	check->bi_rw |= READ_SYNC;
 
 	if (bio_alloc_pages(check, GFP_NOIO))
 		goto out_put;
 
-	submit_bio_wait(READ_SYNC, check);
+	submit_bio_wait(check);
 
 	bio_for_each_segment(bv, bio, iter) {
 		void *p1 = kmap_atomic(bv.bv_page);
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 29eba7219b01..af3f9f7f20e2 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -418,7 +418,7 @@ static void journal_discard_work(struct work_struct *work)
 	struct journal_device *ja =
 		container_of(work, struct journal_device, discard_work);
 
-	submit_bio(0, &ja->discard_bio);
+	submit_bio(&ja->discard_bio);
 }
 
 static void do_journal_discard(struct cache *ca)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f5dbb4e884d8..1eb526a7b8b3 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -212,7 +212,7 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
 	unsigned i;
 
 	bio->bi_iter.bi_sector	= SB_SECTOR;
-	bio->bi_rw		= REQ_SYNC|REQ_META;
+	bio->bi_rw		= REQ_WRITE|REQ_SYNC|REQ_META;
 	bio->bi_iter.bi_size	= SB_SIZE;
 	bch_bio_map(bio, NULL);
 
@@ -238,7 +238,7 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
 	pr_debug("ver %llu, flags %llu, seq %llu",
 		 sb->version, sb->flags, sb->seq);
 
-	submit_bio(REQ_WRITE, bio);
+	submit_bio(bio);
 }
 
 static void bch_write_bdev_super_unlock(struct closure *cl)
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index cd77216beff1..9d3ee7f6c6c7 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -634,6 +634,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
 	 * the dm_buffer's inline bio is local to bufio.
 	 */
 	b->bio.bi_private = end_io;
+	b->bio.bi_rw = rw;
 
 	/*
 	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
@@ -660,7 +661,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
 		ptr += PAGE_SIZE;
 	} while (len > 0);
 
-	submit_bio(rw, &b->bio);
+	submit_bio(&b->bio);
 }
 
 static void submit_io(struct dm_buffer *b, int rw, sector_t block,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 06d426eb5a30..50f17e32951a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -322,6 +322,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 		bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
 		bio->bi_bdev = where->bdev;
 		bio->bi_end_io = endio;
+		bio->bi_rw = rw;
 		store_io_and_region_in_bio(bio, io, region);
 
 		if (rw & REQ_DISCARD) {
@@ -355,7 +356,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 		}
 
 		atomic_inc(&io->count);
-		submit_bio(rw, bio);
+		submit_bio(bio);
 	} while (remaining);
 }
 
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 608302e222af..addcc4be00b6 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -205,6 +205,7 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
 	bio->bi_bdev = lc->logdev->bdev;
 	bio->bi_end_io = log_end_io;
 	bio->bi_private = lc;
+	bio->bi_rw = WRITE;
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page) {
@@ -226,7 +227,7 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
 		DMERR("Couldn't add page to the log block");
 		goto error_bio;
 	}
-	submit_bio(WRITE, bio);
+	submit_bio(bio);
 	return 0;
 error_bio:
 	bio_put(bio);
@@ -269,6 +270,7 @@ static int log_one_block(struct log_writes_c *lc,
 	bio->bi_bdev = lc->logdev->bdev;
 	bio->bi_end_io = log_end_io;
 	bio->bi_private = lc;
+	bio->bi_rw = WRITE;
 
 	for (i = 0; i < block->vec_cnt; i++) {
 		/*
@@ -279,7 +281,7 @@ static int log_one_block(struct log_writes_c *lc,
 				   block->vecs[i].bv_len, 0);
 		if (ret != block->vecs[i].bv_len) {
 			atomic_inc(&lc->io_blocks);
-			submit_bio(WRITE, bio);
+			submit_bio(bio);
 			bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
 			if (!bio) {
 				DMERR("Couldn't alloc log bio");
@@ -290,6 +292,7 @@ static int log_one_block(struct log_writes_c *lc,
 			bio->bi_bdev = lc->logdev->bdev;
 			bio->bi_end_io = log_end_io;
 			bio->bi_private = lc;
+			bio->bi_rw = WRITE;
 
 			ret = bio_add_page(bio, block->vecs[i].bv_page,
 					   block->vecs[i].bv_len, 0);
@@ -301,7 +304,7 @@ static int log_one_block(struct log_writes_c *lc,
 		}
 		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
 	}
-	submit_bio(WRITE, bio);
+	submit_bio(bio);
 out:
 	kfree(block->data);
 	kfree(block);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index fc803d50f9f0..8c070eed982c 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -371,7 +371,8 @@ static void end_discard(struct discard_op *op, int r)
 		 * need to wait for the chain to complete.
 		 */
 		bio_chain(op->bio, op->parent_bio);
-		submit_bio(REQ_WRITE | REQ_DISCARD, op->bio);
+		op->bio->bi_rw = REQ_WRITE | REQ_DISCARD;
+		submit_bio(op->bio);
 	}
 
 	blk_finish_plug(&op->plug);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 866825f10b4c..fb3950be3070 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -394,8 +394,9 @@ static void submit_flushes(struct work_struct *ws)
 			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
+			bi->bi_rw = WRITE_FLUSH;
 			atomic_inc(&mddev->flush_pending);
-			submit_bio(WRITE_FLUSH, bi);
+			submit_bio(bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
@@ -742,9 +743,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
+	bio->bi_rw = WRITE_FLUSH_FUA;
 
 	atomic_inc(&mddev->pending_writes);
-	submit_bio(WRITE_FLUSH_FUA, bio);
+	submit_bio(bio);
 }
 
 void md_super_wait(struct mddev *mddev)
@@ -761,6 +763,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 
 	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
 		rdev->meta_bdev : rdev->bdev;
+	bio->bi_rw = rw;
 	if (metadata_op)
 		bio->bi_iter.bi_sector = sector + rdev->sb_start;
 	else if (rdev->mddev->reshape_position != MaxSector &&
@@ -770,7 +773,8 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 	else
 		bio->bi_iter.bi_sector = sector + rdev->data_offset;
 	bio_add_page(bio, page, size, 0);
-	submit_bio_wait(rw, bio);
+
+	submit_bio_wait(bio);
 
 	ret = !bio->bi_error;
 	bio_put(bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c7c8cde0ab21..bc95d4dd600b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2208,7 +2208,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
 		bio_trim(wbio, sector - r1_bio->sector, sectors);
 		wbio->bi_iter.bi_sector += rdev->data_offset;
 		wbio->bi_bdev = rdev->bdev;
-		if (submit_bio_wait(WRITE, wbio) < 0)
+
+		if (submit_bio_wait(wbio) < 0)
 			/* failure! */
 			ok = rdev_set_badblocks(rdev, sector,
 						sectors, 0)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c7de2a53e625..0be6497d8e34 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2474,7 +2474,9 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 				   choose_data_offset(r10_bio, rdev) +
 				   (sector - r10_bio->sector));
 		wbio->bi_bdev = rdev->bdev;
-		if (submit_bio_wait(WRITE, wbio) < 0)
+		wbio->bi_rw = WRITE;
+
+		if (submit_bio_wait(wbio) < 0)
 			/* Failure! */
 			ok = rdev_set_badblocks(rdev, sector,
 						sectors, 0)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index e889e2deb7b3..0b014535f5dd 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -254,7 +254,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
-	submit_bio(WRITE, io->current_bio);
+	submit_bio(io->current_bio);
 }
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
@@ -373,7 +373,7 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 		io->current_bio = r5l_bio_alloc(log);
 		bio_chain(io->current_bio, prev);
 
-		submit_bio(WRITE, prev);
+		submit_bio(prev);
 	}
 
 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
@@ -686,7 +686,8 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log)
 	bio_reset(&log->flush_bio);
 	log->flush_bio.bi_bdev = log->rdev->bdev;
 	log->flush_bio.bi_end_io = r5l_log_flush_endio;
-	submit_bio(WRITE_FLUSH, &log->flush_bio);
+	log->flush_bio.bi_rw = WRITE_FLUSH;
+	submit_bio(&log->flush_bio);
 }
 
 static void r5l_write_super(struct r5l_log *log, sector_t cp);
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 7c4efb4417b0..c25109cc0329 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -312,7 +312,7 @@ static void iblock_bio_done(struct bio *bio)
 }
 
 static struct bio *
-iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num)
+iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int rw)
 {
 	struct iblock_dev *ib_dev = IBLOCK_DEV(cmd->se_dev);
 	struct bio *bio;
@@ -334,18 +334,19 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num)
 	bio->bi_private = cmd;
 	bio->bi_end_io = &iblock_bio_done;
 	bio->bi_iter.bi_sector = lba;
+	bio->bi_rw = rw;
 
 	return bio;
 }
 
-static void iblock_submit_bios(struct bio_list *list, int rw)
+static void iblock_submit_bios(struct bio_list *list)
 {
 	struct blk_plug plug;
 	struct bio *bio;
 
 	blk_start_plug(&plug);
 	while ((bio = bio_list_pop(list)))
-		submit_bio(rw, bio);
+		submit_bio(bio);
 	blk_finish_plug(&plug);
 }
 
@@ -387,9 +388,10 @@ iblock_execute_sync_cache(struct se_cmd *cmd)
 	bio = bio_alloc(GFP_KERNEL, 0);
 	bio->bi_end_io = iblock_end_io_flush;
 	bio->bi_bdev = ib_dev->ibd_bd;
+	bio->bi_rw = WRITE_FLUSH;
 	if (!immed)
 		bio->bi_private = cmd;
-	submit_bio(WRITE_FLUSH, bio);
+	submit_bio(bio);
 	return 0;
 }
 
@@ -478,7 +480,7 @@ iblock_execute_write_same(struct se_cmd *cmd)
 		goto fail;
 	cmd->priv = ibr;
 
-	bio = iblock_get_bio(cmd, block_lba, 1);
+	bio = iblock_get_bio(cmd, block_lba, 1, WRITE);
 	if (!bio)
 		goto fail_free_ibr;
 
@@ -491,7 +493,7 @@ iblock_execute_write_same(struct se_cmd *cmd)
 		while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
 				!= sg->length) {
 
-			bio = iblock_get_bio(cmd, block_lba, 1);
+			bio = iblock_get_bio(cmd, block_lba, 1, WRITE);
 			if (!bio)
 				goto fail_put_bios;
 
@@ -504,7 +506,7 @@ iblock_execute_write_same(struct se_cmd *cmd)
 		sectors -= 1;
 	}
 
-	iblock_submit_bios(&list, WRITE);
+	iblock_submit_bios(&list);
 	return 0;
 
 fail_put_bios:
@@ -712,7 +714,7 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 		return 0;
 	}
 
-	bio = iblock_get_bio(cmd, block_lba, sgl_nents);
+	bio = iblock_get_bio(cmd, block_lba, sgl_nents, rw);
 	if (!bio)
 		goto fail_free_ibr;
 
@@ -732,11 +734,11 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 		while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
 				!= sg->length) {
 			if (bio_cnt >= IBLOCK_MAX_BIO_PER_TASK) {
-				iblock_submit_bios(&list, rw);
+				iblock_submit_bios(&list);
 				bio_cnt = 0;
 			}
 
-			bio = iblock_get_bio(cmd, block_lba, sg_num);
+			bio = iblock_get_bio(cmd, block_lba, sg_num, rw);
 			if (!bio)
 				goto fail_put_bios;
 
@@ -756,7 +758,7 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 			goto fail_put_bios;
 	}
 
-	iblock_submit_bios(&list, rw);
+	iblock_submit_bios(&list);
 	iblock_complete_cmd(cmd);
 	return 0;
 
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b677a6ea6001..50f81916663b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1673,6 +1673,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		}
 		bio->bi_bdev = block_ctx->dev->bdev;
 		bio->bi_iter.bi_sector = dev_bytenr >> 9;
+		bio->bi_rw = READ;
 
 		for (j = i; j < num_pages; j++) {
 			ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1685,7 +1686,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 			       "btrfsic: error, failed to add a single page!\n");
 			return -1;
 		}
-		if (submit_bio_wait(READ, bio)) {
+		if (submit_bio_wait(bio)) {
 			printk(KERN_INFO
 			       "btrfsic: read error at logical %llu dev %s!\n",
 			       block_ctx->start, block_ctx->dev->name);
@@ -2918,9 +2919,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
 	return submit_bh(rw, bh);
 }
 
-static void __btrfsic_submit_bio(int rw, struct bio *bio)
+static void __btrfsic_submit_bio(struct bio *bio)
 {
 	struct btrfsic_dev_state *dev_state;
+	int rw = bio->bi_rw;
 
 	if (!btrfsic_is_initialized)
 		return;
@@ -3016,16 +3018,16 @@ leave:
 	mutex_unlock(&btrfsic_mutex);
 }
 
-void btrfsic_submit_bio(int rw, struct bio *bio)
+void btrfsic_submit_bio(struct bio *bio)
 {
-	__btrfsic_submit_bio(rw, bio);
-	submit_bio(rw, bio);
+	__btrfsic_submit_bio(bio);
+	submit_bio(bio);
 }
 
-int btrfsic_submit_bio_wait(int rw, struct bio *bio)
+int btrfsic_submit_bio_wait(struct bio *bio)
 {
-	__btrfsic_submit_bio(rw, bio);
-	return submit_bio_wait(rw, bio);
+	__btrfsic_submit_bio(bio);
+	return submit_bio_wait(bio);
 }
 
 int btrfsic_mount(struct btrfs_root *root,
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 13b8566c97ab..c04e249058c3 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -21,8 +21,8 @@
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 int btrfsic_submit_bh(int rw, struct buffer_head *bh);
-void btrfsic_submit_bio(int rw, struct bio *bio);
-int btrfsic_submit_bio_wait(int rw, struct bio *bio);
+void btrfsic_submit_bio(struct bio *bio);
+int btrfsic_submit_bio_wait(struct bio *bio);
 #else
 #define btrfsic_submit_bh submit_bh
 #define btrfsic_submit_bio submit_bio
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6628fca9f4ed..3666558a2375 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3486,12 +3486,13 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 	bio->bi_end_io = btrfs_end_empty_barrier;
 	bio->bi_bdev = device->bdev;
+	bio->bi_rw = WRITE_FLUSH;
 	init_completion(&device->flush_wait);
 	bio->bi_private = &device->flush_wait;
 	device->flush_bio = bio;
 
 	bio_get(bio);
-	btrfsic_submit_bio(WRITE_FLUSH, bio);
+	btrfsic_submit_bio(bio);
 
 	return 0;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6e953de83f08..40e8dcc319d4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2049,9 +2049,10 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 		return -EIO;
 	}
 	bio->bi_bdev = dev->bdev;
+	bio->bi_rw = WRITE_SYNC;
 	bio_add_page(bio, page, length, pg_offset);
 
-	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
+	if (btrfsic_submit_bio_wait(bio)) {
 		/* try to remap that extent elsewhere? */
 		btrfs_bio_counter_dec(fs_info);
 		bio_put(bio);
@@ -2735,14 +2736,14 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
 	start = page_offset(page) + bvec->bv_offset;
 
 	bio->bi_private = NULL;
-
+	bio->bi_rw = rw;
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
 		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
 					   mirror_num, bio_flags, start);
 	else
-		btrfsic_submit_bio(rw, bio);
+		btrfsic_submit_bio(bio);
 
 	bio_put(bio);
 	return ret;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f8b6d411a034..36d5bf7bbb59 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1320,7 +1320,9 @@ write_data:
 
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_write_end_io;
-		submit_bio(WRITE, bio);
+		bio->bi_rw = WRITE;
+
+		submit_bio(bio);
 	}
 	return;
 
@@ -1573,11 +1575,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_rmw_end_io;
+		bio->bi_rw = READ;
 
 		btrfs_bio_wq_end_io(rbio->fs_info, bio,
 				    BTRFS_WQ_ENDIO_RAID56);
 
-		submit_bio(READ, bio);
+		submit_bio(bio);
 	}
 	/* the actual write will happen once the reads are done */
 	return 0;
@@ -2097,11 +2100,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_recover_end_io;
+		bio->bi_rw = READ;
 
 		btrfs_bio_wq_end_io(rbio->fs_info, bio,
 				    BTRFS_WQ_ENDIO_RAID56);
 
-		submit_bio(READ, bio);
+		submit_bio(bio);
 	}
 out:
 	return 0;
@@ -2433,7 +2437,9 @@ submit_write:
 
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_write_end_io;
-		submit_bio(WRITE, bio);
+		bio->bi_rw = WRITE;
+
+		submit_bio(bio);
 	}
 	return;
 
@@ -2610,11 +2616,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
 
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid56_parity_scrub_end_io;
+		bio->bi_rw = READ;
 
 		btrfs_bio_wq_end_io(rbio->fs_info, bio,
 				    BTRFS_WQ_ENDIO_RAID56);
 
-		submit_bio(READ, bio);
+		submit_bio(bio);
 	}
 	/* the actual write will happen once the reads are done */
 	return;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 70427ef66b04..9899f3e44a56 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1504,8 +1504,9 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				sblock->no_io_error_seen = 0;
 		} else {
 			bio->bi_iter.bi_sector = page->physical >> 9;
+			bio->bi_rw = READ;
 
-			if (btrfsic_submit_bio_wait(READ, bio))
+			if (btrfsic_submit_bio_wait(bio))
 				sblock->no_io_error_seen = 0;
 		}
 
@@ -1583,6 +1584,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 			return -EIO;
 		bio->bi_bdev = page_bad->dev->bdev;
 		bio->bi_iter.bi_sector = page_bad->physical >> 9;
+		bio->bi_rw = WRITE;
 
 		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
 		if (PAGE_SIZE != ret) {
@@ -1590,7 +1592,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 			return -EIO;
 		}
 
-		if (btrfsic_submit_bio_wait(WRITE, bio)) {
+		if (btrfsic_submit_bio_wait(bio)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
 			btrfs_dev_replace_stats_inc(
@@ -1684,6 +1686,7 @@ again:
 		bio->bi_end_io = scrub_wr_bio_end_io;
 		bio->bi_bdev = sbio->dev->bdev;
 		bio->bi_iter.bi_sector = sbio->physical >> 9;
+		bio->bi_rw = WRITE;
 		sbio->err = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical_for_dev_replace ||
@@ -1731,7 +1734,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
 	 * orders the requests before sending them to the driver which
 	 * doubled the write performance on spinning disks when measured
 	 * with Linux 3.5 */
-	btrfsic_submit_bio(WRITE, sbio->bio);
+	btrfsic_submit_bio(sbio->bio);
 }
 
 static void scrub_wr_bio_end_io(struct bio *bio)
@@ -2041,7 +2044,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
 	sbio = sctx->bios[sctx->curr];
 	sctx->curr = -1;
 	scrub_pending_bio_inc(sctx);
-	btrfsic_submit_bio(READ, sbio->bio);
+	btrfsic_submit_bio(sbio->bio);
 }
 
 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
@@ -2088,6 +2091,7 @@ again:
 		bio->bi_end_io = scrub_bio_end_io;
 		bio->bi_bdev = sbio->dev->bdev;
 		bio->bi_iter.bi_sector = sbio->physical >> 9;
+		bio->bi_rw = READ;
 		sbio->err = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical ||
@@ -4436,6 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 	bio->bi_iter.bi_size = 0;
 	bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
 	bio->bi_bdev = dev->bdev;
+	bio->bi_rw = WRITE_SYNC;
 	ret = bio_add_page(bio, page, PAGE_SIZE, 0);
 	if (ret != PAGE_SIZE) {
 leave_with_eio:
@@ -4444,7 +4449,7 @@ leave_with_eio:
 		return -EIO;
 	}
 
-	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
+	if (btrfsic_submit_bio_wait(bio))
 		goto leave_with_eio;
 
 	bio_put(bio);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index da9e0036a864..037f3ab841d8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -462,7 +462,7 @@ loop_lock:
 			sync_pending = 0;
 		}
 
-		btrfsic_submit_bio(cur->bi_rw, cur);
+		btrfsic_submit_bio(cur);
 		num_run++;
 		batch_run++;
 
@@ -5996,7 +5996,7 @@ static void btrfs_end_bio(struct bio *bio)
  */
 static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 					struct btrfs_device *device,
-					int rw, struct bio *bio)
+					struct bio *bio)
 {
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
@@ -6007,9 +6007,9 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 	}
 
 	/* don't bother with additional async steps for reads, right now */
-	if (!(rw & REQ_WRITE)) {
+	if (!(bio->bi_rw & REQ_WRITE)) {
 		bio_get(bio);
-		btrfsic_submit_bio(rw, bio);
+		btrfsic_submit_bio(bio);
 		bio_put(bio);
 		return;
 	}
@@ -6023,7 +6023,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 	atomic_inc(&root->fs_info->nr_async_bios);
 	WARN_ON(bio->bi_next);
 	bio->bi_next = NULL;
-	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
 	if (bio->bi_rw & REQ_SYNC)
@@ -6057,6 +6056,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 	btrfs_io_bio(bio)->stripe_index = dev_nr;
 	bio->bi_end_io = btrfs_end_bio;
 	bio->bi_iter.bi_sector = physical >> 9;
+	bio->bi_rw |= rw;
 #ifdef DEBUG
 	{
 		struct rcu_string *name;
@@ -6075,9 +6075,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 	btrfs_bio_counter_inc_noblocked(root->fs_info);
 
 	if (async)
-		btrfs_schedule_bio(root, dev, rw, bio);
+		btrfs_schedule_bio(root, dev, bio);
 	else
-		btrfsic_submit_bio(rw, bio);
+		btrfsic_submit_bio(bio);
 }
 
 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
diff --git a/fs/buffer.c b/fs/buffer.c
index 754813a6962b..9a55e7f8b25c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3030,8 +3030,9 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh,
 		rw |= REQ_META;
 	if (buffer_prio(bh))
 		rw |= REQ_PRIO;
+	bio->bi_rw = rw;
 
-	submit_bio(rw, bio);
+	submit_bio(bio);
 	return 0;
 }
 
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 2fc8c43ce531..5b758566199e 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -318,6 +318,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
 		bio->bi_bdev = inode->i_sb->s_bdev;
 		bio->bi_iter.bi_sector =
 			pblk << (inode->i_sb->s_blocksize_bits - 9);
+		bio->bi_rw = WRITE;
 		ret = bio_add_page(bio, ciphertext_page,
 					inode->i_sb->s_blocksize, 0);
 		if (ret != inode->i_sb->s_blocksize) {
@@ -327,7 +328,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
 			err = -EIO;
 			goto errout;
 		}
-		err = submit_bio_wait(WRITE, bio);
+		err = submit_bio_wait(bio);
 		if ((err == 0) && bio->bi_error)
 			err = -EIO;
 		bio_put(bio);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f3b4408be590..1bcdd5dde00d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -375,6 +375,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 
 	bio->bi_bdev = bdev;
 	bio->bi_iter.bi_sector = first_sector;
+	bio->bi_rw = dio->rw;
 	if (dio->is_async)
 		bio->bi_end_io = dio_bio_end_aio;
 	else
@@ -412,7 +413,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 			       sdio->logical_offset_in_bio);
 		dio->bio_cookie = BLK_QC_T_NONE;
 	} else
-		dio->bio_cookie = submit_bio(dio->rw, bio);
+		dio->bio_cookie = submit_bio(bio);
 
 	sdio->bio = NULL;
 	sdio->boundary = 0;
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 6a6c27373b54..811bd5de8099 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -428,6 +428,7 @@ int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
 		bio->bi_bdev = inode->i_sb->s_bdev;
 		bio->bi_iter.bi_sector =
 			pblk << (inode->i_sb->s_blocksize_bits - 9);
+		bio->bi_rw = WRITE;
 		ret = bio_add_page(bio, ciphertext_page,
 				   inode->i_sb->s_blocksize, 0);
 		if (ret != inode->i_sb->s_blocksize) {
@@ -439,7 +440,7 @@ int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
 			err = -EIO;
 			goto errout;
 		}
-		err = submit_bio_wait(WRITE, bio);
+		err = submit_bio_wait(bio);
 		if ((err == 0) && bio->bi_error)
 			err = -EIO;
 		bio_put(bio);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 2a01df9cc1c3..a72dbcc7a43e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -342,7 +342,8 @@ void ext4_io_submit(struct ext4_io_submit *io)
 	if (bio) {
 		int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
 			    WRITE_SYNC : WRITE;
-		submit_bio(io_op, io->io_bio);
+		io->io_bio->bi_rw = io_op;
+		submit_bio(io->io_bio);
 	}
 	io->io_bio = NULL;
 }
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index dc54a4b60eba..130bd45f4a99 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -271,7 +271,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 		 */
 		if (bio && (last_block_in_bio != blocks[0] - 1)) {
 		submit_and_realloc:
-			submit_bio(READ, bio);
+			submit_bio(bio);
 			bio = NULL;
 		}
 		if (bio == NULL) {
@@ -294,6 +294,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 			bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
 			bio->bi_end_io = mpage_end_io;
 			bio->bi_private = ctx;
+			bio->bi_rw = READ;
 		}
 
 		length = first_hole << blkbits;
@@ -303,14 +304,14 @@ int ext4_mpage_readpages(struct address_space *mapping,
 		if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
 		     (relative_block == map.m_len)) ||
 		    (first_hole != blocks_per_page)) {
-			submit_bio(READ, bio);
+			submit_bio(bio);
 			bio = NULL;
 		} else
 			last_block_in_bio = blocks[blocks_per_page - 1];
 		goto next_page;
 	confused:
 		if (bio) {
-			submit_bio(READ, bio);
+			submit_bio(bio);
 			bio = NULL;
 		}
 		if (!PageUptodate(page))
@@ -323,6 +324,6 @@ int ext4_mpage_readpages(struct address_space *mapping,
 	}
 	BUG_ON(pages && !list_empty(pages));
 	if (bio)
-		submit_bio(READ, bio);
+		submit_bio(bio);
 	return 0;
 }
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9a8bbc1fb1fa..c595c8f84c48 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -102,7 +102,8 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw,
 {
 	if (!is_read_io(rw))
 		atomic_inc(&sbi->nr_wb_bios);
-	submit_bio(rw, bio);
+	bio->bi_rw = rw;
+	submit_bio(bio);
 }
 
 static void __submit_merged_bio(struct f2fs_bio_info *io)
@@ -1080,6 +1081,7 @@ submit_and_realloc:
 			bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr);
 			bio->bi_end_io = f2fs_read_end_io;
 			bio->bi_private = ctx;
+			bio->bi_rw = READ;
 		}
 
 		if (bio_add_page(bio, page, blocksize, 0) < blocksize)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 2e6f537a0e7d..3fe2faba842a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -406,7 +406,8 @@ repeat:
 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
 
 		bio->bi_bdev = sbi->sb->s_bdev;
-		ret = submit_bio_wait(WRITE_FLUSH, bio);
+		bio->bi_rw = WRITE_FLUSH;
+		ret = submit_bio_wait(bio);
 
 		llist_for_each_entry_safe(cmd, next,
 					  fcc->dispatch_list, llnode) {
@@ -438,7 +439,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 		int ret;
 
 		bio->bi_bdev = sbi->sb->s_bdev;
-		ret = submit_bio_wait(WRITE_FLUSH, bio);
+		bio->bi_rw = WRITE_FLUSH;
+		ret = submit_bio_wait(bio);
 		bio_put(bio);
 		return ret;
 	}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index d5369a109781..ce282423a853 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -240,7 +240,8 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw)
 {
 	if (sdp->sd_log_bio) {
 		atomic_inc(&sdp->sd_log_in_flight);
-		submit_bio(rw, sdp->sd_log_bio);
+		sdp->sd_log_bio->bi_rw = rw;
+		submit_bio(sdp->sd_log_bio);
 		sdp->sd_log_bio = NULL;
 	}
 }
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8eaadabbc771..66670e14f654 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -230,7 +230,8 @@ static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
 		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 	}
 	bio->bi_end_io = gfs2_meta_read_endio;
-	submit_bio(rw, bio);
+	bio->bi_rw = rw;
+	submit_bio(bio);
 }
 
 /**
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 45463600fb81..d9e19f61077b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -246,7 +246,8 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
 
 	bio->bi_end_io = end_bio_io_page;
 	bio->bi_private = page;
-	submit_bio(READ_SYNC | REQ_META, bio);
+	bio->bi_rw = READ_SYNC | REQ_META;
+	submit_bio(bio);
 	wait_on_page_locked(page);
 	bio_put(bio);
 	if (!PageUptodate(page)) {
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index cc6235671437..d026bb3fb36f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,6 +65,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
 	bio = bio_alloc(GFP_NOIO, 1);
 	bio->bi_iter.bi_sector = sector;
 	bio->bi_bdev = sb->s_bdev;
+	bio->bi_rw = rw;
 
 	if (!(rw & WRITE) && data)
 		*data = (u8 *)buf + offset;
@@ -83,7 +84,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
 		buf = (u8 *)buf + len;
 	}
 
-	ret = submit_bio_wait(rw, bio);
+	ret = submit_bio_wait(bio);
 out:
 	bio_put(bio);
 	return ret < 0 ? ret : 0;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 63759d723920..3ee3f32562f9 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2002,12 +2002,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
+	bio->bi_rw = READ_SYNC;
 	/*check if journaling to disk has been disabled*/
 	if (log->no_integrity) {
 		bio->bi_iter.bi_size = 0;
 		lbmIODone(bio);
 	} else {
-		submit_bio(READ_SYNC, bio);
+		submit_bio(bio);
 	}
 
 	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
@@ -2145,13 +2146,14 @@ static void lbmStartIO(struct lbuf * bp)
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
+	bio->bi_rw = WRITE_SYNC;
 
 	/* check if journaling to disk has been disabled */
 	if (log->no_integrity) {
 		bio->bi_iter.bi_size = 0;
 		lbmIODone(bio);
 	} else {
-		submit_bio(WRITE_SYNC, bio);
+		submit_bio(bio);
 		INCREMENT(lmStat.submitted);
 	}
 }
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index b60e015cc757..97254436a4ed 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -411,7 +411,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 			inc_io(page);
 			if (!bio->bi_iter.bi_size)
 				goto dump_bio;
-			submit_bio(WRITE, bio);
+			submit_bio(bio);
 			nr_underway++;
 			bio = NULL;
 		} else
@@ -434,6 +434,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 		bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
 		bio->bi_end_io = metapage_write_end_io;
 		bio->bi_private = page;
+		bio->bi_rw = WRITE;
 
 		/* Don't call bio_add_page yet, we may add to this vec */
 		bio_offset = offset;
@@ -448,7 +449,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 		if (!bio->bi_iter.bi_size)
 			goto dump_bio;
 
-		submit_bio(WRITE, bio);
+		submit_bio(bio);
 		nr_underway++;
 	}
 	if (redirty)
@@ -506,7 +507,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
 				insert_metapage(page, NULL);
 			inc_io(page);
 			if (bio)
-				submit_bio(READ, bio);
+				submit_bio(bio);
 
 			bio = bio_alloc(GFP_NOFS, 1);
 			bio->bi_bdev = inode->i_sb->s_bdev;
@@ -514,6 +515,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
 				pblock << (inode->i_blkbits - 9);
 			bio->bi_end_io = metapage_read_end_io;
 			bio->bi_private = page;
+			bio->bi_rw = READ;
 			len = xlen << inode->i_blkbits;
 			offset = block_offset << inode->i_blkbits;
 			if (bio_add_page(bio, page, len, offset) < len)
@@ -523,7 +525,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
 			block_offset++;
 	}
 	if (bio)
-		submit_bio(READ, bio);
+		submit_bio(bio);
 	else
 		unlock_page(page);
 
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index cc26f8f215f5..29704bda6394 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -29,8 +29,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
 	bio.bi_bdev = bdev;
 	bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
 	bio.bi_iter.bi_size = PAGE_SIZE;
+	bio.bi_rw = rw;
 
-	return submit_bio_wait(rw, &bio);
+	return submit_bio_wait(&bio);
 }
 
 static int bdev_readpage(void *_sb, struct page *page)
@@ -95,8 +96,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 			bio->bi_iter.bi_sector = ofs >> 9;
 			bio->bi_private = sb;
 			bio->bi_end_io = writeseg_end_io;
+			bio->bi_rw = WRITE;
 			atomic_inc(&super->s_pending_writes);
-			submit_bio(WRITE, bio);
+			submit_bio(bio);
 
 			ofs += i * PAGE_SIZE;
 			index += i;
@@ -122,8 +124,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 	bio->bi_iter.bi_sector = ofs >> 9;
 	bio->bi_private = sb;
 	bio->bi_end_io = writeseg_end_io;
+	bio->bi_rw = WRITE;
 	atomic_inc(&super->s_pending_writes);
-	submit_bio(WRITE, bio);
+	submit_bio(bio);
 	return 0;
 }
 
@@ -185,8 +188,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 			bio->bi_iter.bi_sector = ofs >> 9;
 			bio->bi_private = sb;
 			bio->bi_end_io = erase_end_io;
+			bio->bi_rw = WRITE;
 			atomic_inc(&super->s_pending_writes);
-			submit_bio(WRITE, bio);
+			submit_bio(bio);
 
 			ofs += i * PAGE_SIZE;
 			index += i;
@@ -206,8 +210,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 	bio->bi_iter.bi_sector = ofs >> 9;
 	bio->bi_private = sb;
 	bio->bi_end_io = erase_end_io;
+	bio->bi_rw = WRITE;
 	atomic_inc(&super->s_pending_writes);
-	submit_bio(WRITE, bio);
+	submit_bio(bio);
 	return 0;
 }
 
diff --git a/fs/mpage.c b/fs/mpage.c
index eedc644b78d7..2c251ecfeeea 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -59,8 +59,9 @@ static void mpage_end_io(struct bio *bio)
 static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
 	bio->bi_end_io = mpage_end_io;
+	bio->bi_rw = rw;
 	guard_bio_eod(rw, bio);
-	submit_bio(rw, bio);
+	submit_bio(bio);
 	return NULL;
 }
 
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 17a42e4eb872..4c79f4ddb052 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -102,14 +102,15 @@ static inline void put_parallel(struct parallel_io *p)
 }
 
 static struct bio *
-bl_submit_bio(int rw, struct bio *bio)
+bl_submit_bio(struct bio *bio)
 {
 	if (bio) {
 		get_parallel(bio->bi_private);
 		dprintk("%s submitting %s bio %u@%llu\n", __func__,
-			rw == READ ? "read" : "write", bio->bi_iter.bi_size,
+			bio->bi_rw == READ ? "read" : "write",
+			bio->bi_iter.bi_size,
 			(unsigned long long)bio->bi_iter.bi_sector);
-		submit_bio(rw, bio);
+		submit_bio(bio);
 	}
 	return NULL;
 }
@@ -158,7 +159,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
 	if (disk_addr < map->start || disk_addr >= map->start + map->len) {
 		if (!dev->map(dev, disk_addr, map))
 			return ERR_PTR(-EIO);
-		bio = bl_submit_bio(rw, bio);
+		bio = bl_submit_bio(bio);
 	}
 	disk_addr += map->disk_offset;
 	disk_addr -= map->start;
@@ -174,9 +175,10 @@ retry:
 				disk_addr >> SECTOR_SHIFT, end_io, par);
 		if (!bio)
 			return ERR_PTR(-ENOMEM);
+		bio->bi_rw = rw;
 	}
 	if (bio_add_page(bio, page, *len, offset) < *len) {
-		bio = bl_submit_bio(rw, bio);
+		bio = bl_submit_bio(bio);
 		goto retry;
 	}
 	return bio;
@@ -252,7 +254,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (extent_length <= 0) {
 			/* We've used up the previous extent */
-			bio = bl_submit_bio(READ, bio);
+			bio = bl_submit_bio(bio);
 
 			/* Get the next one */
 			if (!ext_tree_lookup(bl, isect, &be, false)) {
@@ -273,7 +275,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 		}
 
 		if (is_hole(&be)) {
-			bio = bl_submit_bio(READ, bio);
+			bio = bl_submit_bio(bio);
 			/* Fill hole w/ zeroes w/o accessing device */
 			dprintk("%s Zeroing page for hole\n", __func__);
 			zero_user_segment(pages[i], pg_offset, pg_len);
@@ -306,7 +308,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 		header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
 	}
 out:
-	bl_submit_bio(READ, bio);
+	bl_submit_bio(bio);
 	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
@@ -398,7 +400,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (extent_length <= 0) {
 			/* We've used up the previous extent */
-			bio = bl_submit_bio(WRITE, bio);
+			bio = bl_submit_bio(bio);
 			/* Get the next one */
 			if (!ext_tree_lookup(bl, isect, &be, true)) {
 				header->pnfs_error = -EINVAL;
@@ -427,7 +429,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 
 	header->res.count = header->args.count;
 out:
-	bl_submit_bio(WRITE, bio);
+	bl_submit_bio(bio);
 	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index bf36df10540b..0f62909557e7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -364,7 +364,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
 
 	bio->bi_end_io = nilfs_end_bio_write;
 	bio->bi_private = segbuf;
-	submit_bio(mode, bio);
+	bio->bi_rw = mode;
+	submit_bio(bio);
 	segbuf->sb_nbio++;
 
 	wi->bio = NULL;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6aaf3e351391..8b1d86ebd8d1 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -530,7 +530,7 @@ static void o2hb_bio_end_io(struct bio *bio)
 static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 				      struct o2hb_bio_wait_ctxt *wc,
 				      unsigned int *current_slot,
-				      unsigned int max_slots)
+				      unsigned int max_slots, int rw)
 {
 	int len, current_page;
 	unsigned int vec_len, vec_start;
@@ -556,6 +556,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 	bio->bi_bdev = reg->hr_bdev;
 	bio->bi_private = wc;
 	bio->bi_end_io = o2hb_bio_end_io;
+	bio->bi_rw = rw;
 
 	vec_start = (cs << bits) % PAGE_SIZE;
 	while(cs < max_slots) {
@@ -591,7 +592,8 @@ static int o2hb_read_slots(struct o2hb_region *reg,
 	o2hb_bio_wait_init(&wc);
 
 	while(current_slot < max_slots) {
-		bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
+		bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots,
+					 READ);
 		if (IS_ERR(bio)) {
 			status = PTR_ERR(bio);
 			mlog_errno(status);
@@ -599,7 +601,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
 		}
 
 		atomic_inc(&wc.wc_num_reqs);
-		submit_bio(READ, bio);
+		submit_bio(bio);
 	}
 
 	status = 0;
@@ -623,7 +625,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
 
 	slot = o2nm_this_node();
 
-	bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
+	bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, WRITE_SYNC);
 	if (IS_ERR(bio)) {
 		status = PTR_ERR(bio);
 		mlog_errno(status);
@@ -631,7 +633,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
 	}
 
 	atomic_inc(&write_wc->wc_num_reqs);
-	submit_bio(WRITE_SYNC, bio);
+	submit_bio(bio);
 
 	status = 0;
 bail:
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4c463b99fe57..0cd1603856b4 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -438,7 +438,10 @@ xfs_submit_ioend(
 
 	ioend->io_bio->bi_private = ioend;
 	ioend->io_bio->bi_end_io = xfs_end_bio;
-
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ioend->io_bio->bi_rw = WRITE_SYNC;
+	else
+		ioend->io_bio->bi_rw = WRITE;
 	/*
 	 * If we are failing the IO now, just mark the ioend with an
 	 * error and finish it. This will run IO completion immediately
@@ -451,8 +454,7 @@ xfs_submit_ioend(
 		return status;
 	}
 
-	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
-		   ioend->io_bio);
+	submit_bio(ioend->io_bio);
 	return 0;
 }
 
@@ -510,8 +512,11 @@ xfs_chain_bio(
 
 	bio_chain(ioend->io_bio, new);
 	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
-	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
-		   ioend->io_bio);
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ioend->io_bio->bi_rw = WRITE_SYNC;
+	else
+		ioend->io_bio->bi_rw = WRITE;
+	submit_bio(ioend->io_bio);
 	ioend->io_bio = new;
 }
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e71cfbd5acb3..0777c67f169b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1166,7 +1166,7 @@ next_chunk:
 	bio->bi_iter.bi_sector = sector;
 	bio->bi_end_io = xfs_buf_bio_end_io;
 	bio->bi_private = bp;
-
+	bio->bi_rw = rw;
 
 	for (; size && nr_pages; nr_pages--, page_index++) {
 		int	rbytes, nbytes = PAGE_SIZE - offset;
@@ -1190,7 +1190,7 @@ next_chunk:
 			flush_kernel_vmap_range(bp->b_addr,
 						xfs_buf_vmap_len(bp));
 		}
-		submit_bio(rw, bio);
+		submit_bio(bio);
 		if (size)
 			goto next_chunk;
 	} else {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9faebf7f9a33..3bde94234eea 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -473,7 +473,7 @@ static inline void bio_io_error(struct bio *bio)
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
 
-extern int submit_bio_wait(int rw, struct bio *bio);
+extern int submit_bio_wait(struct bio *bio);
 extern void bio_advance(struct bio *, unsigned);
 
 extern void bio_init(struct bio *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd288148a6b1..65e4c51ecb3d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2747,7 +2747,7 @@ static inline void remove_inode_hash(struct inode *inode)
 extern void inode_sb_list_add(struct inode *inode);
 
 #ifdef CONFIG_BLOCK
-extern blk_qc_t submit_bio(int, struct bio *);
+extern blk_qc_t submit_bio(struct bio *);
 extern int bdev_read_only(struct block_device *);
 #endif
 extern int set_blocksize(struct block_device *, int);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e1006640d..be227f5aa9dc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
 	bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio->bi_bdev = hib_resume_bdev;
+	bio->bi_rw = rw;
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
 		bio->bi_end_io = hib_end_io;
 		bio->bi_private = hb;
 		atomic_inc(&hb->count);
-		submit_bio(rw, bio);
+		submit_bio(bio);
 	} else {
-		error = submit_bio_wait(rw, bio);
+		error = submit_bio_wait(bio);
 		bio_put(bio);
 	}
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 242dba07545b..5a5fd66d7bd5 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -259,7 +259,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		bio_end_io_t end_write_func)
 {
 	struct bio *bio;
-	int ret, rw = WRITE;
+	int ret;
 	struct swap_info_struct *sis = page_swap_info(page);
 
 	if (sis->flags & SWP_FILE) {
@@ -317,12 +317,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		ret = -ENOMEM;
 		goto out;
 	}
+	bio->bi_rw = WRITE;
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		rw |= REQ_SYNC;
+		bio->bi_rw |= REQ_SYNC;
 	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);
-	submit_bio(rw, bio);
+	submit_bio(bio);
 out:
 	return ret;
 }
@@ -369,8 +370,9 @@ int swap_readpage(struct page *page)
 		ret = -ENOMEM;
 		goto out;
 	}
+	bio->bi_rw = READ;
 	count_vm_event(PSWPIN);
-	submit_bio(READ, bio);
+	submit_bio(bio);
 out:
 	return ret;
 }
-- 
cgit v1.3-8-gc7d7


From 162b99e3119767cb6478c55a5aed70469389df88 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:32:02 -0500
Subject: pm: use bio op accessors

Separate the op from the rq_flag_bits and have the pm code
set/get the bio using bio_set_op_attrs/bio_op.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/power/swap.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index be227f5aa9dc..c1aaac431055 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio)
 	bio_put(bio);
 }
 
-static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 		struct hib_bio_batch *hb)
 {
 	struct page *page = virt_to_page(addr);
@@ -271,7 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
 	bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio->bi_bdev = hib_resume_bdev;
-	bio->bi_rw = rw;
+	bio_set_op_attrs(bio, op, op_flags);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -307,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 {
 	int error;
 
-	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+		      swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -316,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		swsusp_header->flags = flags;
 		if (flags & SF_CRC32_MODE)
 			swsusp_header->crc32 = handle->crc32;
-		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
-					swsusp_header, NULL);
+		error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+				      swsusp_resume_block, swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Swap header not found!\n");
 		error = -ENODEV;
@@ -390,7 +391,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 	} else {
 		src = buf;
 	}
-	return hib_submit_io(WRITE_SYNC, offset, src, hb);
+	return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
 }
 
 static void release_swap_writer(struct swap_map_handle *handle)
@@ -993,7 +994,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
 			return -ENOMEM;
 		}
 
-		error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
+		error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
+				      tmp->map, NULL);
 		if (error) {
 			release_swap_reader(handle);
 			return error;
@@ -1017,7 +1019,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = hib_submit_io(READ_SYNC, offset, buf, hb);
+	error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1526,7 +1528,8 @@ int swsusp_check(void)
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
-		error = hib_submit_io(READ_SYNC, swsusp_resume_block,
+		error = hib_submit_io(REQ_OP_READ, READ_SYNC,
+					swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
 			goto put;
@@ -1534,7 +1537,8 @@ int swsusp_check(void)
 		if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
 			/* Reset swap signature now */
-			error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+			error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+						swsusp_resume_block,
 						swsusp_header, NULL);
 		} else {
 			error = -EINVAL;
@@ -1578,10 +1582,12 @@ int swsusp_unmark(void)
 {
 	int error;
 
-	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+		      swsusp_header, NULL);
 	if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
-		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+		error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+					swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
-- 
cgit v1.3-8-gc7d7


From 1b9a9ab78b0ab79dc1f0ddd5fbed7833ec7de3a4 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:32:18 -0500
Subject: blktrace: use op accessors

Have blktrace use the req/bio op accessor to get the REQ_OP.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blktrace_api.h  |  2 +-
 include/trace/events/bcache.h | 12 ++++++---
 include/trace/events/block.h  | 31 ++++++++++++++--------
 kernel/trace/blktrace.c       | 62 +++++++++++++++++++++++++------------------
 4 files changed, 65 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 0f3172b8b225..cceb72f9e29f 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -118,7 +118,7 @@ static inline int blk_cmd_buf_len(struct request *rq)
 }
 
 extern void blk_dump_cmd(char *buf, struct request *rq);
-extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes);
 
 #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
 
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 981acf74b14f..65673d8b81ac 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -27,7 +27,8 @@ DECLARE_EVENT_CLASS(bcache_request,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->orig_sector	= bio->bi_iter.bi_sector - 16;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 	),
 
 	TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
@@ -101,7 +102,8 @@ DECLARE_EVENT_CLASS(bcache_bio,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 	),
 
 	TP_printk("%d,%d  %s %llu + %u",
@@ -136,7 +138,8 @@ TRACE_EVENT(bcache_read,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 		__entry->cache_hit = hit;
 		__entry->bypass = bypass;
 	),
@@ -167,7 +170,8 @@ TRACE_EVENT(bcache_write,
 		__entry->inode		= inode;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 		__entry->writeback = writeback;
 		__entry->bypass = bypass;
 	),
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index e8a5eca1dbe5..5a2a7592068f 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -84,7 +84,8 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
 					0 : blk_rq_sectors(rq);
 		__entry->errors    = rq->errors;
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags,
+			      blk_rq_bytes(rq));
 		blk_dump_cmd(__get_str(cmd), rq);
 	),
 
@@ -162,7 +163,7 @@ TRACE_EVENT(block_rq_complete,
 		__entry->nr_sector = nr_bytes >> 9;
 		__entry->errors    = rq->errors;
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
+		blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, nr_bytes);
 		blk_dump_cmd(__get_str(cmd), rq);
 	),
 
@@ -198,7 +199,8 @@ DECLARE_EVENT_CLASS(block_rq,
 		__entry->bytes     = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
 					blk_rq_bytes(rq) : 0;
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags,
+			      blk_rq_bytes(rq));
 		blk_dump_cmd(__get_str(cmd), rq);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -272,7 +274,8 @@ TRACE_EVENT(block_bio_bounce,
 					  bio->bi_bdev->bd_dev : 0;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -310,7 +313,8 @@ TRACE_EVENT(block_bio_complete,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->error		= error;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 	),
 
 	TP_printk("%d,%d %s %llu + %u [%d]",
@@ -337,7 +341,8 @@ DECLARE_EVENT_CLASS(block_bio_merge,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -404,7 +409,8 @@ TRACE_EVENT(block_bio_queue,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -432,7 +438,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
 		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
 		__entry->sector		= bio ? bio->bi_iter.bi_sector : 0;
 		__entry->nr_sector	= bio ? bio_sectors(bio) : 0;
-		blk_fill_rwbs(__entry->rwbs,
+		blk_fill_rwbs(__entry->rwbs, bio ? bio_op(bio) : 0,
 			      bio ? bio->bi_rw : 0, __entry->nr_sector);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
         ),
@@ -567,7 +573,8 @@ TRACE_EVENT(block_split,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->new_sector	= new_sector;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -610,7 +617,8 @@ TRACE_EVENT(block_bio_remap,
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+			      bio->bi_iter.bi_size);
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
@@ -656,7 +664,8 @@ TRACE_EVENT(block_rq_remap,
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
 		__entry->nr_bios	= blk_rq_count_bios(rq);
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags,
+			      blk_rq_bytes(rq));
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9aef8654e90d..2d16fad519b2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -199,7 +199,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
  * blk_io_trace structure and places it in a per-cpu subbuffer.
  */
 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
-		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+		     int op, int op_flags, u32 what, int error, int pdu_len,
+		     void *pdu_data)
 {
 	struct task_struct *tsk = current;
 	struct ring_buffer_event *event = NULL;
@@ -214,13 +215,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
 		return;
 
-	what |= ddir_act[rw & WRITE];
-	what |= MASK_TC_BIT(rw, SYNC);
-	what |= MASK_TC_BIT(rw, RAHEAD);
-	what |= MASK_TC_BIT(rw, META);
-	what |= MASK_TC_BIT(rw, DISCARD);
-	what |= MASK_TC_BIT(rw, FLUSH);
-	what |= MASK_TC_BIT(rw, FUA);
+	what |= ddir_act[op_is_write(op) ? WRITE : READ];
+	what |= MASK_TC_BIT(op_flags, SYNC);
+	what |= MASK_TC_BIT(op_flags, RAHEAD);
+	what |= MASK_TC_BIT(op_flags, META);
+	what |= MASK_TC_BIT(op_flags, FLUSH);
+	what |= MASK_TC_BIT(op_flags, FUA);
+	if (op == REQ_OP_DISCARD)
+		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 
 	pid = tsk->pid;
 	if (act_log_check(bt, what, sector, pid))
@@ -708,11 +710,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
+		__blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
 				what, rq->errors, rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
+		__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
 				rq->cmd_flags, what, rq->errors, 0, NULL);
 	}
 }
@@ -770,7 +772,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 		return;
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio->bi_rw, what, error, 0, NULL);
+			bio_op(bio), bio->bi_rw, what, error, 0, NULL);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
@@ -818,7 +820,8 @@ static void blk_add_trace_getrq(void *ignore,
 		struct blk_trace *bt = q->blk_trace;
 
 		if (bt)
-			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
+					NULL);
 	}
 }
 
@@ -833,7 +836,7 @@ static void blk_add_trace_sleeprq(void *ignore,
 		struct blk_trace *bt = q->blk_trace;
 
 		if (bt)
-			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
 					0, 0, NULL);
 	}
 }
@@ -843,7 +846,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 	struct blk_trace *bt = q->blk_trace;
 
 	if (bt)
-		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
 }
 
 static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -860,7 +863,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 		else
 			what = BLK_TA_UNPLUG_TIMER;
 
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
 	}
 }
 
@@ -874,8 +877,9 @@ static void blk_add_trace_split(void *ignore,
 		__be64 rpdu = cpu_to_be64(pdu);
 
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
-				bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
-				bio->bi_error, sizeof(rpdu), &rpdu);
+				bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
+				BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
+				&rpdu);
 	}
 }
 
@@ -907,7 +911,7 @@ static void blk_add_trace_bio_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+			bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
 			sizeof(r), &r);
 }
 
@@ -940,7 +944,7 @@ static void blk_add_trace_rq_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
-			rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+			rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
 			sizeof(r), &r);
 }
 
@@ -965,10 +969,10 @@ void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
-		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
 				BLK_TA_DRV_DATA, rq->errors, len, data);
 	else
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
 				BLK_TA_DRV_DATA, rq->errors, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1769,21 +1773,27 @@ void blk_dump_cmd(char *buf, struct request *rq)
 	}
 }
 
-void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 {
 	int i = 0;
 
 	if (rw & REQ_FLUSH)
 		rwbs[i++] = 'F';
 
-	if (rw & WRITE)
+	switch (op) {
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_SAME:
 		rwbs[i++] = 'W';
-	else if (rw & REQ_DISCARD)
+		break;
+	case REQ_OP_DISCARD:
 		rwbs[i++] = 'D';
-	else if (bytes)
+		break;
+	case REQ_OP_READ:
 		rwbs[i++] = 'R';
-	else
+		break;
+	default:
 		rwbs[i++] = 'N';
+	}
 
 	if (rw & REQ_FUA)
 		rwbs[i++] = 'F';
-- 
cgit v1.3-8-gc7d7


From 3a5e02ced11e22ecd9da3d6710afe15bcfee1d10 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:32:23 -0500
Subject: block, drivers: add REQ_OP_FLUSH operation

This adds a REQ_OP_FLUSH operation that is sent to request_fn
based drivers by the block layer's flush code, instead of
sending requests with the request->cmd_flags REQ_FLUSH bit set.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/writeback_cache_control.txt | 6 +++---
 arch/um/drivers/ubd_kern.c                      | 2 +-
 block/blk-flush.c                               | 4 ++--
 drivers/block/loop.c                            | 4 ++--
 drivers/block/nbd.c                             | 2 +-
 drivers/block/osdblk.c                          | 2 +-
 drivers/block/ps3disk.c                         | 4 ++--
 drivers/block/skd_main.c                        | 2 +-
 drivers/block/virtio_blk.c                      | 2 +-
 drivers/block/xen-blkfront.c                    | 8 ++++----
 drivers/ide/ide-disk.c                          | 2 +-
 drivers/md/dm.c                                 | 2 +-
 drivers/mmc/card/block.c                        | 6 +++---
 drivers/mmc/card/queue.h                        | 3 ++-
 drivers/mtd/mtd_blkdevs.c                       | 2 +-
 drivers/nvme/host/core.c                        | 2 +-
 drivers/scsi/sd.c                               | 7 +++----
 include/linux/blk_types.h                       | 3 ++-
 include/linux/blkdev.h                          | 3 +++
 kernel/trace/blktrace.c                         | 5 +++++
 20 files changed, 40 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt
index 59e0516cbf6b..da70bdacd503 100644
--- a/Documentation/block/writeback_cache_control.txt
+++ b/Documentation/block/writeback_cache_control.txt
@@ -73,9 +73,9 @@ doing:
 
 	blk_queue_write_cache(sdkp->disk->queue, true, false);
 
-and handle empty REQ_FLUSH requests in its prep_fn/request_fn.  Note that
+and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn.  Note that
 REQ_FLUSH requests with a payload are automatically turned into a sequence
-of an empty REQ_FLUSH request followed by the actual write by the block
+of an empty REQ_OP_FLUSH request followed by the actual write by the block
 layer.  For devices that also support the FUA bit the block layer needs
 to be told to pass through the REQ_FUA bit using:
 
@@ -83,4 +83,4 @@ to be told to pass through the REQ_FUA bit using:
 
 and the driver must handle write requests that have the REQ_FUA bit set
 in prep_fn/request_fn.  If the FUA bit is not natively supported the block
-layer turns it into an empty REQ_FLUSH request after the actual write.
+layer turns it into an empty REQ_OP_FLUSH request after the actual write.
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 17e96dc29596..ef6b4d960bad 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1286,7 +1286,7 @@ static void do_ubd_request(struct request_queue *q)
 
 		req = dev->request;
 
-		if (req->cmd_flags & REQ_FLUSH) {
+		if (req_op(req) == REQ_OP_FLUSH) {
 			io_req = kmalloc(sizeof(struct io_thread_req),
 					 GFP_ATOMIC);
 			if (io_req == NULL) {
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9fd1f63a6348..21f0d5b0d2ca 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -29,7 +29,7 @@
  * The actual execution of flush is double buffered.  Whenever a request
  * needs to execute PRE or POSTFLUSH, it queues at
  * fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
- * flush is issued and the pending_idx is toggled.  When the flush
+ * REQ_OP_FLUSH is issued and the pending_idx is toggled.  When the flush
  * completes, all the requests which were pending are proceeded to the next
  * step.  This allows arbitrary merging of different types of FLUSH/FUA
  * requests.
@@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	}
 
 	flush_rq->cmd_type = REQ_TYPE_FS;
-	flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+	req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH | REQ_FLUSH_SEQ);
 	flush_rq->rq_disk = first_rq->rq_disk;
 	flush_rq->end_io = flush_end_io;
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b9b737cafd5f..364d491d4bdd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -542,7 +542,7 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 	pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
 
 	if (op_is_write(req_op(rq))) {
-		if (rq->cmd_flags & REQ_FLUSH)
+		if (req_op(rq) == REQ_OP_FLUSH)
 			ret = lo_req_flush(lo, rq);
 		else if (req_op(rq) == REQ_OP_DISCARD)
 			ret = lo_discard(lo, rq, pos);
@@ -1659,7 +1659,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (lo->lo_state != Lo_bound)
 		return -EIO;
 
-	if (lo->use_dio && (!(cmd->rq->cmd_flags & REQ_FLUSH) ||
+	if (lo->use_dio && (req_op(cmd->rq) != REQ_OP_FLUSH ||
 	    req_op(cmd->rq) == REQ_OP_DISCARD))
 		cmd->use_aio = true;
 	else
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6c2c28d124d0..d6f3c9336f29 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -284,7 +284,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 		type = NBD_CMD_DISC;
 	else if (req_op(req) == REQ_OP_DISCARD)
 		type = NBD_CMD_TRIM;
-	else if (req->cmd_flags & REQ_FLUSH)
+	else if (req_op(req) == REQ_OP_FLUSH)
 		type = NBD_CMD_FLUSH;
 	else if (rq_data_dir(req) == WRITE)
 		type = NBD_CMD_WRITE;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index c2854a2bfdb0..92900f5f0b47 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -321,7 +321,7 @@ static void osdblk_rq_fn(struct request_queue *q)
 		 * driver-specific, etc.
 		 */
 
-		do_flush = rq->cmd_flags & REQ_FLUSH;
+		do_flush = (req_op(rq) == REQ_OP_FLUSH);
 		do_write = (rq_data_dir(rq) == WRITE);
 
 		if (!do_flush) { /* osd_flush does not use a bio */
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 4b7e405830d7..acb44529c05e 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -196,7 +196,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
 	while ((req = blk_fetch_request(q))) {
-		if (req->cmd_flags & REQ_FLUSH) {
+		if (req_op(req) == REQ_OP_FLUSH) {
 			if (ps3disk_submit_flush_request(dev, req))
 				break;
 		} else if (req->cmd_type == REQ_TYPE_FS) {
@@ -256,7 +256,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 		return IRQ_HANDLED;
 	}
 
-	if (req->cmd_flags & REQ_FLUSH) {
+	if (req_op(req) == REQ_OP_FLUSH) {
 		read = 0;
 		op = "flush";
 	} else {
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 910e065918af..5c07a23e2ada 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -597,7 +597,7 @@ static void skd_request_fn(struct request_queue *q)
 		data_dir = rq_data_dir(req);
 		io_flags = req->cmd_flags;
 
-		if (io_flags & REQ_FLUSH)
+		if (req_op(req) == REQ_OP_FLUSH)
 			flush++;
 
 		if (io_flags & REQ_FUA)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42758b52768c..18e4069dd24b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -172,7 +172,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
 	vbr->req = req;
-	if (req->cmd_flags & REQ_FLUSH) {
+	if (req_op(req) == REQ_OP_FLUSH) {
 		vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH);
 		vbr->out_hdr.sector = 0;
 		vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 6fd160197b7a..3aeb25bd5057 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -743,7 +743,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 		 * The indirect operation can only be a BLKIF_OP_READ or
 		 * BLKIF_OP_WRITE
 		 */
-		BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+		BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA);
 		ring_req->operation = BLKIF_OP_INDIRECT;
 		ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
 			BLKIF_OP_WRITE : BLKIF_OP_READ;
@@ -755,7 +755,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 		ring_req->u.rw.handle = info->handle;
 		ring_req->operation = rq_data_dir(req) ?
 			BLKIF_OP_WRITE : BLKIF_OP_READ;
-		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+		if (req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA) {
 			/*
 			 * Ideally we can do an unordered flush-to-disk.
 			 * In case the backend onlysupports barriers, use that.
@@ -865,7 +865,7 @@ static inline bool blkif_request_flush_invalid(struct request *req,
 					       struct blkfront_info *info)
 {
 	return ((req->cmd_type != REQ_TYPE_FS) ||
-		((req->cmd_flags & REQ_FLUSH) &&
+		((req_op(req) == REQ_OP_FLUSH) &&
 		 !(info->feature_flush & REQ_FLUSH)) ||
 		((req->cmd_flags & REQ_FUA) &&
 		 !(info->feature_flush & REQ_FUA)));
@@ -2055,7 +2055,7 @@ static int blkif_recover(struct blkfront_info *info)
 			/*
 			 * Get the bios in the request so we can re-queue them.
 			 */
-			if (copy[i].request->cmd_flags & REQ_FLUSH ||
+			if (req_op(copy[i].request) == REQ_OP_FLUSH ||
 			    req_op(copy[i].request) == REQ_OP_DISCARD ||
 			    copy[i].request->cmd_flags & (REQ_FUA | REQ_SECURE)) {
 				/*
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 05dbcce70b0e..e378ef70ed63 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -431,7 +431,7 @@ static int idedisk_prep_fn(struct request_queue *q, struct request *rq)
 	ide_drive_t *drive = q->queuedata;
 	struct ide_cmd *cmd;
 
-	if (!(rq->cmd_flags & REQ_FLUSH))
+	if (req_op(rq) != REQ_OP_FLUSH)
 		return BLKPREP_OK;
 
 	if (rq->special) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f6b104c77b6d..fcc68c8edba0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2171,7 +2171,7 @@ static void dm_request_fn(struct request_queue *q)
 
 		/* always use block 0 to find the target for flushes for now */
 		pos = 0;
-		if (!(rq->cmd_flags & REQ_FLUSH))
+		if (req_op(rq) != REQ_OP_FLUSH)
 			pos = blk_rq_pos(rq);
 
 		if ((dm_request_peeked_before_merge_deadline(md) &&
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 201a8719f6c4..bca20f88a8b2 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -1722,7 +1722,8 @@ static u8 mmc_blk_prep_packed_list(struct mmc_queue *mq, struct request *req)
 		    !IS_ALIGNED(blk_rq_sectors(next), 8))
 			break;
 
-		if (req_op(next) == REQ_OP_DISCARD || next->cmd_flags & REQ_FLUSH)
+		if (req_op(next) == REQ_OP_DISCARD ||
+		    req_op(next) == REQ_OP_FLUSH)
 			break;
 
 		if (rq_data_dir(cur) != rq_data_dir(next))
@@ -2147,7 +2148,6 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 	struct mmc_card *card = md->queue.card;
 	struct mmc_host *host = card->host;
 	unsigned long flags;
-	unsigned int cmd_flags = req ? req->cmd_flags : 0;
 
 	if (req && !mq->mqrq_prev->req)
 		/* claim host only for the first request */
@@ -2171,7 +2171,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 			ret = mmc_blk_issue_secdiscard_rq(mq, req);
 		else
 			ret = mmc_blk_issue_discard_rq(mq, req);
-	} else if (cmd_flags & REQ_FLUSH) {
+	} else if (req && req_op(req) == REQ_OP_FLUSH) {
 		/* complete ongoing async transfer before issuing flush */
 		if (card->host->areq)
 			mmc_blk_issue_rw_rq(mq, NULL);
diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h
index 9fb26f20a44d..d62531124d54 100644
--- a/drivers/mmc/card/queue.h
+++ b/drivers/mmc/card/queue.h
@@ -3,7 +3,8 @@
 
 static inline bool mmc_req_is_special(struct request *req)
 {
-	return req && (req->cmd_flags & REQ_FLUSH || req_op(req) == REQ_OP_DISCARD);
+	return req &&
+		(req_op(req) == REQ_OP_FLUSH || req_op(req) == REQ_OP_DISCARD);
 }
 
 struct request;
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 4eb9a5fb151c..78b3eb45faf6 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -87,7 +87,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	if (req->cmd_type != REQ_TYPE_FS)
 		return -EIO;
 
-	if (req->cmd_flags & REQ_FLUSH)
+	if (req_op(req) == REQ_OP_FLUSH)
 		return tr->flush(dev);
 
 	if (blk_rq_pos(req) + blk_rq_cur_sectors(req) >
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 089b8b8aad4f..abdfdcfb66f4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -290,7 +290,7 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 
 	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
 		memcpy(cmd, req->cmd, sizeof(*cmd));
-	else if (req->cmd_flags & REQ_FLUSH)
+	else if (req_op(req) == REQ_OP_FLUSH)
 		nvme_setup_flush(ns, cmd);
 	else if (req_op(req) == REQ_OP_DISCARD)
 		ret = nvme_setup_discard(ns, req, cmd);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index fad86ad89e64..5a9db0fe1ee0 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1143,12 +1143,11 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 		return sd_setup_discard_cmnd(cmd);
 	case REQ_OP_WRITE_SAME:
 		return sd_setup_write_same_cmnd(cmd);
+	case REQ_OP_FLUSH:
+		return sd_setup_flush_cmnd(cmd);
 	case REQ_OP_READ:
 	case REQ_OP_WRITE:
-		if (rq->cmd_flags & REQ_FLUSH)
-			return sd_setup_flush_cmnd(cmd);
-		else
-			return sd_setup_read_write_cmnd(cmd);
+		return sd_setup_read_write_cmnd(cmd);
 	default:
 		BUG();
 	}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 23c1ab2a9475..32d87522f349 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -249,9 +249,10 @@ enum req_op {
 	REQ_OP_WRITE,
 	REQ_OP_DISCARD,		/* request to discard sectors */
 	REQ_OP_WRITE_SAME,	/* write same block many times */
+	REQ_OP_FLUSH,		/* request for cache flush */
 };
 
-#define REQ_OP_BITS 2
+#define REQ_OP_BITS 3
 
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE	-1U
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 78ae3dbf2de1..0c9f8793c87e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -666,6 +666,9 @@ static inline bool rq_mergeable(struct request *rq)
 	if (rq->cmd_type != REQ_TYPE_FS)
 		return false;
 
+	if (req_op(rq) == REQ_OP_FLUSH)
+		return false;
+
 	if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
 		return false;
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2d16fad519b2..0c70fbb6ea8d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -223,6 +223,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, FUA);
 	if (op == REQ_OP_DISCARD)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
+	if (op == REQ_OP_FLUSH)
+		what |= BLK_TC_ACT(BLK_TC_FLUSH);
 
 	pid = tsk->pid;
 	if (act_log_check(bt, what, sector, pid))
@@ -1788,6 +1790,9 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 	case REQ_OP_DISCARD:
 		rwbs[i++] = 'D';
 		break;
+	case REQ_OP_FLUSH:
+		rwbs[i++] = 'F';
+		break;
 	case REQ_OP_READ:
 		rwbs[i++] = 'R';
 		break;
-- 
cgit v1.3-8-gc7d7


From 28a8f0d317bf225ff15008f5dd66ae16242dd843 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:32:25 -0500
Subject: block, drivers, fs: rename REQ_FLUSH to REQ_PREFLUSH

To avoid confusion between REQ_OP_FLUSH, which is handled by
request_fn drivers, and upper layers requesting the block layer
perform a flush sequence along with possibly a WRITE, this patch
renames REQ_FLUSH to REQ_PREFLUSH.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/writeback_cache_control.txt | 22 +++++++++++-----------
 Documentation/device-mapper/log-writes.txt      | 10 +++++-----
 block/blk-core.c                                | 12 ++++++------
 block/blk-flush.c                               | 16 ++++++++--------
 block/blk-mq.c                                  |  4 ++--
 drivers/block/drbd/drbd_actlog.c                |  4 ++--
 drivers/block/drbd/drbd_main.c                  |  2 +-
 drivers/block/drbd/drbd_protocol.h              |  2 +-
 drivers/block/drbd/drbd_receiver.c              |  2 +-
 drivers/block/drbd/drbd_req.c                   |  2 +-
 drivers/md/bcache/journal.c                     |  2 +-
 drivers/md/bcache/request.c                     |  8 ++++----
 drivers/md/dm-cache-target.c                    | 12 ++++++------
 drivers/md/dm-crypt.c                           |  7 ++++---
 drivers/md/dm-era-target.c                      |  4 ++--
 drivers/md/dm-io.c                              |  2 +-
 drivers/md/dm-log-writes.c                      |  2 +-
 drivers/md/dm-raid1.c                           |  5 +++--
 drivers/md/dm-region-hash.c                     |  4 ++--
 drivers/md/dm-snap.c                            |  6 +++---
 drivers/md/dm-stripe.c                          |  2 +-
 drivers/md/dm-thin.c                            |  8 ++++----
 drivers/md/dm.c                                 | 12 ++++++------
 drivers/md/linear.c                             |  2 +-
 drivers/md/md.c                                 |  2 +-
 drivers/md/md.h                                 |  2 +-
 drivers/md/multipath.c                          |  2 +-
 drivers/md/raid0.c                              |  2 +-
 drivers/md/raid1.c                              |  3 ++-
 drivers/md/raid10.c                             |  2 +-
 drivers/md/raid5-cache.c                        |  2 +-
 drivers/md/raid5.c                              |  2 +-
 fs/btrfs/check-integrity.c                      |  8 ++++----
 fs/jbd2/journal.c                               |  2 +-
 fs/xfs/xfs_buf.c                                |  2 +-
 include/linux/blk_types.h                       |  8 ++++----
 include/linux/fs.h                              |  4 ++--
 include/trace/events/f2fs.h                     |  2 +-
 kernel/trace/blktrace.c                         |  5 +++--
 39 files changed, 102 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt
index da70bdacd503..8a6bdada5f6b 100644
--- a/Documentation/block/writeback_cache_control.txt
+++ b/Documentation/block/writeback_cache_control.txt
@@ -20,11 +20,11 @@ a forced cache flush, and the Force Unit Access (FUA) flag for requests.
 Explicit cache flushes
 ----------------------
 
-The REQ_FLUSH flag can be OR ed into the r/w flags of a bio submitted from
+The REQ_PREFLUSH flag can be OR ed into the r/w flags of a bio submitted from
 the filesystem and will make sure the volatile cache of the storage device
 has been flushed before the actual I/O operation is started.  This explicitly
 guarantees that previously completed write requests are on non-volatile
-storage before the flagged bio starts. In addition the REQ_FLUSH flag can be
+storage before the flagged bio starts. In addition the REQ_PREFLUSH flag can be
 set on an otherwise empty bio structure, which causes only an explicit cache
 flush without any dependent I/O.  It is recommend to use
 the blkdev_issue_flush() helper for a pure cache flush.
@@ -41,21 +41,21 @@ signaled after the data has been committed to non-volatile storage.
 Implementation details for filesystems
 --------------------------------------
 
-Filesystems can simply set the REQ_FLUSH and REQ_FUA bits and do not have to
+Filesystems can simply set the REQ_PREFLUSH and REQ_FUA bits and do not have to
 worry if the underlying devices need any explicit cache flushing and how
-the Forced Unit Access is implemented.  The REQ_FLUSH and REQ_FUA flags
+the Forced Unit Access is implemented.  The REQ_PREFLUSH and REQ_FUA flags
 may both be set on a single bio.
 
 
 Implementation details for make_request_fn based block drivers
 --------------------------------------------------------------
 
-These drivers will always see the REQ_FLUSH and REQ_FUA bits as they sit
+These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit
 directly below the submit_bio interface.  For remapping drivers the REQ_FUA
 bits need to be propagated to underlying devices, and a global flush needs
-to be implemented for bios with the REQ_FLUSH bit set.  For real device
-drivers that do not have a volatile cache the REQ_FLUSH and REQ_FUA bits
-on non-empty bios can simply be ignored, and REQ_FLUSH requests without
+to be implemented for bios with the REQ_PREFLUSH bit set.  For real device
+drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits
+on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without
 data can be completed successfully without doing any work.  Drivers for
 devices with volatile caches need to implement the support for these
 flags themselves without any help from the block layer.
@@ -65,8 +65,8 @@ Implementation details for request_fn based block drivers
 --------------------------------------------------------------
 
 For devices that do not support volatile write caches there is no driver
-support required, the block layer completes empty REQ_FLUSH requests before
-entering the driver and strips off the REQ_FLUSH and REQ_FUA bits from
+support required, the block layer completes empty REQ_PREFLUSH requests before
+entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from
 requests that have a payload.  For devices with volatile write caches the
 driver needs to tell the block layer that it supports flushing caches by
 doing:
@@ -74,7 +74,7 @@ doing:
 	blk_queue_write_cache(sdkp->disk->queue, true, false);
 
 and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn.  Note that
-REQ_FLUSH requests with a payload are automatically turned into a sequence
+REQ_PREFLUSH requests with a payload are automatically turned into a sequence
 of an empty REQ_OP_FLUSH request followed by the actual write by the block
 layer.  For devices that also support the FUA bit the block layer needs
 to be told to pass through the REQ_FUA bit using:
diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt
index c10f30c9b534..f4ebcbaf50f3 100644
--- a/Documentation/device-mapper/log-writes.txt
+++ b/Documentation/device-mapper/log-writes.txt
@@ -14,14 +14,14 @@ Log Ordering
 
 We log things in order of completion once we are sure the write is no longer in
 cache.  This means that normal WRITE requests are not actually logged until the
-next REQ_FLUSH request.  This is to make it easier for userspace to replay the
-log in a way that correlates to what is on disk and not what is in cache, to
-make it easier to detect improper waiting/flushing.
+next REQ_PREFLUSH request.  This is to make it easier for userspace to replay
+the log in a way that correlates to what is on disk and not what is in cache,
+to make it easier to detect improper waiting/flushing.
 
 This works by attaching all WRITE requests to a list once the write completes.
-Once we see a REQ_FLUSH request we splice this list onto the request and once
+Once we see a REQ_PREFLUSH request we splice this list onto the request and once
 the FLUSH request completes we log all of the WRITEs and then the FLUSH.  Only
-completed WRITEs, at the time the REQ_FLUSH is issued, are added in order to
+completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to
 simulate the worst case scenario with regard to power failures.  Consider the
 following example (W means write, C means complete):
 
diff --git a/block/blk-core.c b/block/blk-core.c
index c7d66c23a708..32a283eb7274 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1029,7 +1029,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 	 * Flush requests do not use the elevator so skip initialization.
 	 * This allows a request to share the flush and elevator data.
 	 */
-	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
+	if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA))
 		return false;
 
 	return true;
@@ -1736,7 +1736,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+	if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) {
 		spin_lock_irq(q->queue_lock);
 		where = ELEVATOR_INSERT_FLUSH;
 		goto get_rq;
@@ -1968,9 +1968,9 @@ generic_make_request_checks(struct bio *bio)
 	 * drivers without flush support don't have to worry
 	 * about them.
 	 */
-	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
+	if ((bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) &&
 	    !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
-		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+		bio->bi_rw &= ~(REQ_PREFLUSH | REQ_FUA);
 		if (!nr_sectors) {
 			err = 0;
 			goto end_io;
@@ -2217,7 +2217,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	 */
 	BUG_ON(blk_queued_rq(rq));
 
-	if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
+	if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
 		where = ELEVATOR_INSERT_FLUSH;
 
 	add_acct_request(q, rq, where);
@@ -3311,7 +3311,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		/*
 		 * rq is already accounted, so use raw insert
 		 */
-		if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
+		if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
 			__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
 		else
 			__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 21f0d5b0d2ca..d308def812db 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -10,8 +10,8 @@
  * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
  * properties and hardware capability.
  *
- * If a request doesn't have data, only REQ_FLUSH makes sense, which
- * indicates a simple flush request.  If there is data, REQ_FLUSH indicates
+ * If a request doesn't have data, only REQ_PREFLUSH makes sense, which
+ * indicates a simple flush request.  If there is data, REQ_PREFLUSH indicates
  * that the device cache should be flushed before the data is executed, and
  * REQ_FUA means that the data must be on non-volatile media on request
  * completion.
@@ -20,11 +20,11 @@
  * difference.  The requests are either completed immediately if there's no
  * data or executed as normal requests otherwise.
  *
- * If the device has writeback cache and supports FUA, REQ_FLUSH is
+ * If the device has writeback cache and supports FUA, REQ_PREFLUSH is
  * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
  *
- * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
- * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
+ * If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH
+ * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
  *
  * The actual execution of flush is double buffered.  Whenever a request
  * needs to execute PRE or POSTFLUSH, it queues at
@@ -103,7 +103,7 @@ static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
 		policy |= REQ_FSEQ_DATA;
 
 	if (fflags & (1UL << QUEUE_FLAG_WC)) {
-		if (rq->cmd_flags & REQ_FLUSH)
+		if (rq->cmd_flags & REQ_PREFLUSH)
 			policy |= REQ_FSEQ_PREFLUSH;
 		if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
 		    (rq->cmd_flags & REQ_FUA))
@@ -391,9 +391,9 @@ void blk_insert_flush(struct request *rq)
 
 	/*
 	 * @policy now records what operations need to be done.  Adjust
-	 * REQ_FLUSH and FUA for the driver.
+	 * REQ_PREFLUSH and FUA for the driver.
 	 */
-	rq->cmd_flags &= ~REQ_FLUSH;
+	rq->cmd_flags &= ~REQ_PREFLUSH;
 	if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
 		rq->cmd_flags &= ~REQ_FUA;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 29bcd9c07a34..13f460368759 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1247,7 +1247,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_rw);
-	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+	const int is_flush_fua = bio->bi_rw & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
 	unsigned int request_count = 0;
@@ -1344,7 +1344,7 @@ done:
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_rw);
-	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+	const int is_flush_fua = bio->bi_rw & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_plug *plug;
 	unsigned int request_count = 0;
 	struct blk_map_ctx data;
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index f236a31cc095..d524973f94b3 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -148,7 +148,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	device->md_io.error = -ENODEV;
 
 	if ((op == REQ_OP_WRITE) && !test_bit(MD_NO_FUA, &device->flags))
-		op_flags |= REQ_FUA | REQ_FLUSH;
+		op_flags |= REQ_FUA | REQ_PREFLUSH;
 	op_flags |= REQ_SYNC | REQ_NOIDLE;
 
 	bio = bio_alloc_drbd(GFP_NOIO);
@@ -847,7 +847,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
 	unsigned long count = 0;
 	sector_t esector, nr_sectors;
 
-	/* This would be an empty REQ_FLUSH, be silent. */
+	/* This would be an empty REQ_PREFLUSH, be silent. */
 	if ((mode == SET_OUT_OF_SYNC) && size == 0)
 		return 0;
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index d55febcaa414..2b37744db0fa 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1609,7 +1609,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
 	if (connection->agreed_pro_version >= 95)
 		return  (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
 			(bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
-			(bio->bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
+			(bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
 			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
 	else
 		return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index ef9245363dcc..129f8c76c9b1 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -112,7 +112,7 @@ struct p_header100 {
 #define DP_MAY_SET_IN_SYNC    4
 #define DP_UNPLUG             8 /* not used anymore   */
 #define DP_FUA               16 /* equals REQ_FUA     */
-#define DP_FLUSH             32 /* equals REQ_FLUSH   */
+#define DP_FLUSH             32 /* equals REQ_PREFLUSH   */
 #define DP_DISCARD           64 /* equals REQ_DISCARD */
 #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
 #define DP_SEND_WRITE_ACK   256 /* This is a proto C write request */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 6c5997894475..1ee002352ea2 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2158,7 +2158,7 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf)
 {
 	return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
 		(dpf & DP_FUA ? REQ_FUA : 0) |
-		(dpf & DP_FLUSH ? REQ_FLUSH : 0);
+		(dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
 }
 
 static unsigned long wire_flags_to_bio_op(u32 dpf)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 2255dcfebd2b..eef6e9575b4e 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1132,7 +1132,7 @@ static int drbd_process_write_request(struct drbd_request *req)
 	 * replicating, in which case there is no point. */
 	if (unlikely(req->i.size == 0)) {
 		/* The only size==0 bios we expect are empty flushes. */
-		D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH);
+		D_ASSERT(device, req->master_bio->bi_rw & REQ_PREFLUSH);
 		if (remote)
 			_req_mod(req, QUEUE_AS_DRBD_BARRIER);
 		return remote;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index a3c3b309ff4a..6925023e12d4 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -631,7 +631,7 @@ static void journal_write_unlocked(struct closure *cl)
 		bio->bi_end_io	= journal_write_endio;
 		bio->bi_private = w;
 		bio_set_op_attrs(bio, REQ_OP_WRITE,
-				 REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA);
+				 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
 		bch_bio_map(bio, w->data);
 
 		trace_bcache_journal_write(bio);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 016b0aa7199c..69f16f43f8ab 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -205,10 +205,10 @@ static void bch_data_insert_start(struct closure *cl)
 		return bch_data_invalidate(cl);
 
 	/*
-	 * Journal writes are marked REQ_FLUSH; if the original write was a
+	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.
 	 */
-	bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
+	bio->bi_rw &= ~(REQ_PREFLUSH|REQ_FUA);
 
 	do {
 		unsigned i;
@@ -668,7 +668,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->iop.write_prio	= 0;
 	s->iop.error		= 0;
 	s->iop.flags		= 0;
-	s->iop.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+	s->iop.flush_journal	= (bio->bi_rw & (REQ_PREFLUSH|REQ_FUA)) != 0;
 	s->iop.wq		= bcache_wq;
 
 	return s;
@@ -920,7 +920,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		bch_writeback_add(dc);
 		s->iop.bio = bio;
 
-		if (bio->bi_rw & REQ_FLUSH) {
+		if (bio->bi_rw & REQ_PREFLUSH) {
 			/* Also need to send a flush to the backing device */
 			struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
 							     dc->disk.bio_split);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 540e80eb317d..718744db62df 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -788,7 +788,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 
 	spin_lock_irqsave(&cache->lock, flags);
 	if (cache->need_tick_bio &&
-	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH)) &&
+	    !(bio->bi_rw & (REQ_FUA | REQ_PREFLUSH)) &&
 	    bio_op(bio) != REQ_OP_DISCARD) {
 		pb->tick = true;
 		cache->need_tick_bio = false;
@@ -830,7 +830,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 
 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
 {
-	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+	return bio->bi_rw & (REQ_PREFLUSH | REQ_FUA);
 }
 
 /*
@@ -1069,7 +1069,7 @@ static void dec_io_migrations(struct cache *cache)
 static bool discard_or_flush(struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
-	       bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+	       bio->bi_rw & (REQ_PREFLUSH | REQ_FUA);
 }
 
 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
@@ -1614,8 +1614,8 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
 		remap_to_cache(cache, bio, 0);
 
 	/*
-	 * REQ_FLUSH is not directed at any particular block so we don't
-	 * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
+	 * REQ_PREFLUSH is not directed at any particular block so we don't
+	 * need to inc_ds().  REQ_FUA's are split into a write + REQ_PREFLUSH
 	 * by dm-core.
 	 */
 	issue(cache, bio);
@@ -1980,7 +1980,7 @@ static void process_deferred_bios(struct cache *cache)
 
 		bio = bio_list_pop(&bios);
 
-		if (bio->bi_rw & REQ_FLUSH)
+		if (bio->bi_rw & REQ_PREFLUSH)
 			process_flush_bio(cache, bio);
 		else if (bio_op(bio) == REQ_OP_DISCARD)
 			process_discard_bio(cache, &structs, bio);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 057d19b5e196..96dd5d7e454a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1911,11 +1911,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 	struct crypt_config *cc = ti->private;
 
 	/*
-	 * If bio is REQ_FLUSH or REQ_OP_DISCARD, just bypass crypt queues.
-	 * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+	 * If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues.
+	 * - for REQ_PREFLUSH device-mapper core ensures that no IO is in-flight
 	 * - for REQ_OP_DISCARD caller must use flush if IO ordering matters
 	 */
-	if (unlikely(bio->bi_rw & REQ_FLUSH || bio_op(bio) == REQ_OP_DISCARD)) {
+	if (unlikely(bio->bi_rw & REQ_PREFLUSH ||
+	    bio_op(bio) == REQ_OP_DISCARD)) {
 		bio->bi_bdev = cc->dev->bdev;
 		if (bio_sectors(bio))
 			bio->bi_iter.bi_sector = cc->start +
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 665bf3285618..2faf49d8f4d7 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1540,9 +1540,9 @@ static int era_map(struct dm_target *ti, struct bio *bio)
 	remap_to_origin(era, bio);
 
 	/*
-	 * REQ_FLUSH bios carry no data, so we're not interested in them.
+	 * REQ_PREFLUSH bios carry no data, so we're not interested in them.
 	 */
-	if (!(bio->bi_rw & REQ_FLUSH) &&
+	if (!(bio->bi_rw & REQ_PREFLUSH) &&
 	    (bio_data_dir(bio) == WRITE) &&
 	    !metadata_current_marked(era->md, block)) {
 		defer_bio(era, bio);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 22e0597d631e..0e225fd4a8d1 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -380,7 +380,7 @@ static void dispatch_io(int op, int op_flags, unsigned int num_regions,
 	 */
 	for (i = 0; i < num_regions; i++) {
 		*dp = old_pages;
-		if (where[i].count || (op_flags & REQ_FLUSH))
+		if (where[i].count || (op_flags & REQ_PREFLUSH))
 			do_region(op, op_flags, i, where + i, dp, io);
 	}
 
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 0edb8ea51e46..b5dbf7a0515e 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -555,7 +555,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
 	struct bio_vec bv;
 	size_t alloc_size;
 	int i = 0;
-	bool flush_bio = (bio->bi_rw & REQ_FLUSH);
+	bool flush_bio = (bio->bi_rw & REQ_PREFLUSH);
 	bool fua_bio = (bio->bi_rw & REQ_FUA);
 	bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 907df2ba8fd4..9f5f460c0e92 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -704,7 +704,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
-		if ((bio->bi_rw & REQ_FLUSH) ||
+		if ((bio->bi_rw & REQ_PREFLUSH) ||
 		    (bio_op(bio) == REQ_OP_DISCARD)) {
 			bio_list_add(&sync, bio);
 			continue;
@@ -1253,7 +1253,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		if (!(bio->bi_rw & REQ_FLUSH) && bio_op(bio) != REQ_OP_DISCARD)
+		if (!(bio->bi_rw & REQ_PREFLUSH) &&
+		    bio_op(bio) != REQ_OP_DISCARD)
 			dm_rh_dec(ms->rh, bio_record->write_region);
 		return error;
 	}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 3550ca7c6577..b11813431f31 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -398,7 +398,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 	region_t region = dm_rh_bio_to_region(rh, bio);
 	int recovering = 0;
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	if (bio->bi_rw & REQ_PREFLUSH) {
 		rh->flush_failure = 1;
 		return;
 	}
@@ -526,7 +526,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 	struct bio *bio;
 
 	for (bio = bios->head; bio; bio = bio->bi_next) {
-		if (bio->bi_rw & REQ_FLUSH || bio_op(bio) == REQ_OP_DISCARD)
+		if (bio->bi_rw & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)
 			continue;
 		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
 	}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 70bb0e8b62ce..69ab1ff5f5c9 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1680,7 +1680,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 
 	init_tracked_chunk(bio);
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	if (bio->bi_rw & REQ_PREFLUSH) {
 		bio->bi_bdev = s->cow->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
@@ -1799,7 +1799,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 
 	init_tracked_chunk(bio);
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	if (bio->bi_rw & REQ_PREFLUSH) {
 		if (!dm_bio_get_target_bio_nr(bio))
 			bio->bi_bdev = s->origin->bdev;
 		else
@@ -2285,7 +2285,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
 
 	bio->bi_bdev = o->dev->bdev;
 
-	if (unlikely(bio->bi_rw & REQ_FLUSH))
+	if (unlikely(bio->bi_rw & REQ_PREFLUSH))
 		return DM_MAPIO_REMAPPED;
 
 	if (bio_rw(bio) != WRITE)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b738178ca068..48f1c01d7b9f 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -286,7 +286,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 	uint32_t stripe;
 	unsigned target_bio_nr;
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	if (bio->bi_rw & REQ_PREFLUSH) {
 		target_bio_nr = dm_bio_get_target_bio_nr(bio);
 		BUG_ON(target_bio_nr >= sc->stripes);
 		bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1b684cbb9ba2..5f9e3d799d66 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -697,7 +697,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 
 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 {
-	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
+	return (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) &&
 		dm_thin_changed_this_transaction(tc->td);
 }
 
@@ -868,7 +868,7 @@ static void __inc_remap_and_issue_cell(void *context,
 	struct bio *bio;
 
 	while ((bio = bio_list_pop(&cell->bios))) {
-		if (bio->bi_rw & (REQ_FLUSH | REQ_FUA) ||
+		if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) ||
 		    bio_op(bio) == REQ_OP_DISCARD)
 			bio_list_add(&info->defer_bios, bio);
 		else {
@@ -1641,7 +1641,7 @@ static void __remap_and_issue_shared_cell(void *context,
 
 	while ((bio = bio_list_pop(&cell->bios))) {
 		if ((bio_data_dir(bio) == WRITE) ||
-		    (bio->bi_rw & (REQ_FLUSH | REQ_FUA) ||
+		    (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) ||
 		     bio_op(bio) == REQ_OP_DISCARD))
 			bio_list_add(&info->defer_bios, bio);
 		else {
@@ -2556,7 +2556,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA) ||
+	if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) ||
 	    bio_op(bio) == REQ_OP_DISCARD) {
 		thin_defer_bio_with_throttle(tc, bio);
 		return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fcc68c8edba0..aba7ed9abb3a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1003,12 +1003,12 @@ static void dec_pending(struct dm_io *io, int error)
 		if (io_error == DM_ENDIO_REQUEUE)
 			return;
 
-		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
+		if ((bio->bi_rw & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 			/*
 			 * Preflush done for flush with data, reissue
-			 * without REQ_FLUSH.
+			 * without REQ_PREFLUSH.
 			 */
-			bio->bi_rw &= ~REQ_FLUSH;
+			bio->bi_rw &= ~REQ_PREFLUSH;
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
@@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 
 /*
  * A target may call dm_accept_partial_bio only from the map routine.  It is
- * allowed for all bio types except REQ_FLUSH.
+ * allowed for all bio types except REQ_PREFLUSH.
  *
  * dm_accept_partial_bio informs the dm that the target only wants to process
  * additional n_sectors sectors of the bio and the rest of the data should be
@@ -1507,7 +1507,7 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 {
 	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
-	BUG_ON(bio->bi_rw & REQ_FLUSH);
+	BUG_ON(bio->bi_rw & REQ_PREFLUSH);
 	BUG_ON(bi_size > *tio->len_ptr);
 	BUG_ON(n_sectors > bi_size);
 	*tio->len_ptr -= bi_size - n_sectors;
@@ -1795,7 +1795,7 @@ static void __split_and_process_bio(struct mapped_device *md,
 
 	start_io_acct(ci.io);
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	if (bio->bi_rw & REQ_PREFLUSH) {
 		ci.bio = &ci.md->flush_bio;
 		ci.sector_count = 0;
 		error = __send_empty_flush(&ci);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1ad3f485672c..70ff888d25d0 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -221,7 +221,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 	struct bio *split;
 	sector_t start_sector, end_sector, data_offset;
 
-	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+	if (unlikely(bio->bi_rw & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index bd4844fe0e98..1f123f5a29da 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -414,7 +414,7 @@ static void md_submit_flush_data(struct work_struct *ws)
 		/* an empty barrier - all done */
 		bio_endio(bio);
 	else {
-		bio->bi_rw &= ~REQ_FLUSH;
+		bio->bi_rw &= ~REQ_PREFLUSH;
 		mddev->pers->make_request(mddev, bio);
 	}
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2e0918fc376d..b4f335245bd6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -424,7 +424,7 @@ struct mddev {
 
 	/* Generic flush handling.
 	 * The last to finish preflush schedules a worker to submit
-	 * the rest of the request (without the REQ_FLUSH flag).
+	 * the rest of the request (without the REQ_PREFLUSH flag).
 	 */
 	struct bio *flush_bio;
 	atomic_t flush_pending;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index dd483bb2e111..72ea98e89e57 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -111,7 +111,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
-	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+	if (unlikely(bio->bi_rw & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 051a10ca4e09..c3d439083212 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -458,7 +458,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	struct md_rdev *tmp_dev;
 	struct bio *split;
 
-	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+	if (unlikely(bio->bi_rw & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a3427230f7cf..10e53cd6a995 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1056,7 +1056,8 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 	const int op = bio_op(bio);
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+	const unsigned long do_flush_fua = (bio->bi_rw &
+						(REQ_PREFLUSH | REQ_FUA));
 	const unsigned long do_sec = (bio->bi_rw & REQ_SECURE);
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 615045a11bac..245640b50153 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1447,7 +1447,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 
 	struct bio *split;
 
-	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+	if (unlikely(bio->bi_rw & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 92651381b094..5504ce2bac06 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -536,7 +536,7 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 		bio_endio(bio);
 		return 0;
 	}
-	bio->bi_rw &= ~REQ_FLUSH;
+	bio->bi_rw &= ~REQ_PREFLUSH;
 	return -EAGAIN;
 }
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b9122e2c6aa1..7aacf5b55e15 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5150,7 +5150,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	DEFINE_WAIT(w);
 	bool do_prepare;
 
-	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+	if (unlikely(bi->bi_rw & REQ_PREFLUSH)) {
 		int ret = r5l_handle_flush_request(conf->log, bi);
 
 		if (ret == 0)
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index da944ffddbaf..ca70bc78b27d 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2207,7 +2207,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
 			       block->dev_bytenr, block->mirror_num);
 		next_block = block->next_in_same_bio;
 		block->iodone_w_error = iodone_w_error;
-		if (block->submit_bio_bh_rw & REQ_FLUSH) {
+		if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
 			dev_state->last_flush_gen++;
 			if ((dev_state->state->print_mask &
 			     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
@@ -2243,7 +2243,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
 		       block->dev_bytenr, block->mirror_num);
 
 	block->iodone_w_error = iodone_w_error;
-	if (block->submit_bio_bh_rw & REQ_FLUSH) {
+	if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
 		dev_state->last_flush_gen++;
 		if ((dev_state->state->print_mask &
 		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
@@ -2884,7 +2884,7 @@ int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh)
 		btrfsic_process_written_block(dev_state, dev_bytenr,
 					      &bh->b_data, 1, NULL,
 					      NULL, bh, op_flags);
-	} else if (NULL != dev_state && (op_flags & REQ_FLUSH)) {
+	} else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) {
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
@@ -2982,7 +2982,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
 			kunmap(bio->bi_io_vec[i].bv_page);
 		}
 		kfree(mapped_datav);
-	} else if (NULL != dev_state && (bio->bi_rw & REQ_FLUSH)) {
+	} else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) {
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 34bc99637d5a..dc68f562f681 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1354,7 +1354,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
 
 	trace_jbd2_write_superblock(journal, write_flags);
 	if (!(journal->j_flags & JBD2_BARRIER))
-		write_flags &= ~(REQ_FUA | REQ_FLUSH);
+		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
 	lock_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		/*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d8acd3716dbd..686d8f160f5c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1237,7 +1237,7 @@ _xfs_buf_ioapply(
 		if (bp->b_flags & XBF_FUA)
 			op_flags |= REQ_FUA;
 		if (bp->b_flags & XBF_FLUSH)
-			op_flags |= REQ_FLUSH;
+			op_flags |= REQ_PREFLUSH;
 
 		/*
 		 * Run the write verifier callback function if it exists. If
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 32d87522f349..562ab8301217 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -168,7 +168,7 @@ enum rq_flag_bits {
 	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
 	__REQ_FUA,		/* forced unit access */
-	__REQ_FLUSH,		/* request for cache flush */
+	__REQ_PREFLUSH,		/* request for cache flush */
 
 	/* bio only flags */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
@@ -212,12 +212,12 @@ enum rq_flag_bits {
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
-	 REQ_FLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE)
+	 REQ_PREFLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 /* This mask is used for both bio and request merge checking */
 #define REQ_NOMERGE_FLAGS \
-	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_FLUSH_SEQ)
+	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_PREFLUSH | REQ_FUA | REQ_FLUSH_SEQ)
 
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_THROTTLED		(1ULL << __REQ_THROTTLED)
@@ -235,7 +235,7 @@ enum rq_flag_bits {
 #define REQ_PREEMPT		(1ULL << __REQ_PREEMPT)
 #define REQ_ALLOCED		(1ULL << __REQ_ALLOCED)
 #define REQ_COPY_USER		(1ULL << __REQ_COPY_USER)
-#define REQ_FLUSH		(1ULL << __REQ_FLUSH)
+#define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
 #define REQ_FLUSH_SEQ		(1ULL << __REQ_FLUSH_SEQ)
 #define REQ_IO_STAT		(1ULL << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1ULL << __REQ_MIXED_MERGE)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ccd166477487..183024525d40 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -204,9 +204,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define READ_SYNC		REQ_SYNC
 #define WRITE_SYNC		(REQ_SYNC | REQ_NOIDLE)
 #define WRITE_ODIRECT		REQ_SYNC
-#define WRITE_FLUSH		(REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
+#define WRITE_FLUSH		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
 #define WRITE_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_FUA)
-#define WRITE_FLUSH_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
+#define WRITE_FLUSH_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
 
 /*
  * Attribute flags.  These should be or-ed together to figure out what
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 30efa44a473c..878963a1f058 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -33,7 +33,7 @@ TRACE_DEFINE_ENUM(SSR);
 TRACE_DEFINE_ENUM(__REQ_RAHEAD);
 TRACE_DEFINE_ENUM(__REQ_SYNC);
 TRACE_DEFINE_ENUM(__REQ_NOIDLE);
-TRACE_DEFINE_ENUM(__REQ_FLUSH);
+TRACE_DEFINE_ENUM(__REQ_PREFLUSH);
 TRACE_DEFINE_ENUM(__REQ_FUA);
 TRACE_DEFINE_ENUM(__REQ_PRIO);
 TRACE_DEFINE_ENUM(__REQ_META);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 0c70fbb6ea8d..03b0dd98ff0e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -189,6 +189,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
 				 BLK_TC_ACT(BLK_TC_WRITE) };
 
 #define BLK_TC_RAHEAD		BLK_TC_AHEAD
+#define BLK_TC_PREFLUSH		BLK_TC_FLUSH
 
 /* The ilog2() calls fall out because they're constant */
 #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
@@ -219,7 +220,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, SYNC);
 	what |= MASK_TC_BIT(op_flags, RAHEAD);
 	what |= MASK_TC_BIT(op_flags, META);
-	what |= MASK_TC_BIT(op_flags, FLUSH);
+	what |= MASK_TC_BIT(op_flags, PREFLUSH);
 	what |= MASK_TC_BIT(op_flags, FUA);
 	if (op == REQ_OP_DISCARD)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
@@ -1779,7 +1780,7 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 {
 	int i = 0;
 
-	if (rw & REQ_FLUSH)
+	if (rw & REQ_PREFLUSH)
 		rwbs[i++] = 'F';
 
 	switch (op) {
-- 
cgit v1.3-8-gc7d7


From a4f144ebbdf6f7807c477bce8e136047ed27321f Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Wed, 1 Jun 2016 12:33:05 -0700
Subject: perf/core: Fix crash due to account/unaccount_sb_event()
 inconsistency

unaccount_pmu_sb_event() did not check for attributes in event->attr
before calling detach_sb_event(), while account_pmu_event() did.

This caused NULL pointer reference in cgroup events that did not
have any of the attributes checked by account_pmu_event().

To trigger the bug just wait for a cgroup event to terminate, e.g.:

  $ mkdir /dev/cgroup/devices/test
  $ perf stat -e cycles -a -G test sleep 0

... see crash ...

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Zheng <zheng.z.yan@intel.com>
Link: http://lkml.kernel.org/r/1464809585-66072-1-git-send-email-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5d48306879d5..ae081a141a4a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3682,15 +3682,28 @@ static void detach_sb_event(struct perf_event *event)
 	raw_spin_unlock(&pel->lock);
 }
 
-static void unaccount_pmu_sb_event(struct perf_event *event)
+static bool is_sb_event(struct perf_event *event)
 {
+	struct perf_event_attr *attr = &event->attr;
+
 	if (event->parent)
-		return;
+		return false;
 
 	if (event->attach_state & PERF_ATTACH_TASK)
-		return;
+		return false;
 
-	detach_sb_event(event);
+	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+	    attr->comm || attr->comm_exec ||
+	    attr->task ||
+	    attr->context_switch)
+		return true;
+	return false;
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+	if (is_sb_event(event))
+		detach_sb_event(event);
 }
 
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
@@ -8666,18 +8679,7 @@ static void attach_sb_event(struct perf_event *event)
  */
 static void account_pmu_sb_event(struct perf_event *event)
 {
-	struct perf_event_attr *attr = &event->attr;
-
-	if (event->parent)
-		return;
-
-	if (event->attach_state & PERF_ATTACH_TASK)
-		return;
-
-	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
-	    attr->comm || attr->comm_exec ||
-	    attr->task ||
-	    attr->context_switch)
+	if (is_sb_event(event))
 		attach_sb_event(event);
 }
 
-- 
cgit v1.3-8-gc7d7


From dfaaf3fa01d65cf6e2072965bb0b7aaa7285344f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 30 May 2016 18:31:33 +0200
Subject: locking/lockdep: Use __jhash_mix() for iterate_chain_key()

Use __jhash_mix() to mix the class_idx into the class_key. This
function provides better mixing than the previously used, home grown
mix function.

Leave hashing to the professionals :-)

Suggested-by: George Spelvin <linux@sciencehorizons.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81f1a7107c0e..589d763a49b3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -46,6 +46,7 @@
 #include <linux/gfp.h>
 #include <linux/kmemcheck.h>
 #include <linux/random.h>
+#include <linux/jhash.h>
 
 #include <asm/sections.h>
 
@@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE];
  * It's a 64-bit hash, because it's important for the keys to be
  * unique.
  */
-#define iterate_chain_key(key1, key2) \
-	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
-	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
-	(key2))
+static inline u64 iterate_chain_key(u64 key, u32 idx)
+{
+	u32 k0 = key, k1 = key >> 32;
+
+	__jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */
+
+	return k0 | (u64)k1 << 32;
+}
 
 void lockdep_off(void)
 {
-- 
cgit v1.3-8-gc7d7


From a461d58792d4a46b8b10ae50973ec9b2763b694e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 27 May 2016 15:47:18 +0200
Subject: locking/rtmutex: Only warn once on a trylock from bad context

One warning should be enough to get one motivated to fix this. It is
possible that this happens more than once and that starts flooding the
output. Later the prints will be suppressed so we only get half of it.
Depending on the console system used it might not be helpful.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1464356838-1755-1-git-send-email-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rtmutex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..1ec0f48962b3 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
  */
 int __sched rt_mutex_trylock(struct rt_mutex *lock)
 {
-	if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+	if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
 		return 0;
 
 	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
-- 
cgit v1.3-8-gc7d7


From 03c041c5bf6ed584dff36b7cd509e0146a124277 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 3 Jun 2016 17:58:41 -0500
Subject: sched/debug: Always show 'nr_migrations'

The nr_migrations field is updated independently of CONFIG_SCHEDSTATS,
so it can be displayed regardless.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/5b1b04057ae2b14d73c2d03f56582c1d38cfe066.1464994423.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0368c393a336..2a0a9995256d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
-#ifdef CONFIG_SCHEDSTATS
 	P(se.nr_migrations);
 
+#ifdef CONFIG_SCHEDSTATS
 	if (schedstat_enabled()) {
 		u64 avg_atom, avg_per_cpu;
 
-- 
cgit v1.3-8-gc7d7


From 8d53fa19041ae65c484d81d75179b4a577e6d8e4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Jun 2016 09:12:30 +0200
Subject: locking/qspinlock: Clarify xchg_tail() ordering

While going over the code I noticed that xchg_tail() is a RELEASE but
had no obvious pairing commented.

It pairs with a somewhat unique address dependency through
decode_tail().

So the store-release of xchg_tail() is paired by the address
dependency of the load of xchg_tail followed by the dereference from
the pointer computed from that load.

The @old -> @prev transformation itself is pure, and therefore does
not depend on external state, so that is immaterial wrt. ordering.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <waiman.long@hpe.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5fc8c311b8fe..ee7deb08d43d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
  * therefore increment the cpu number by one.
  */
 
-static inline u32 encode_tail(int cpu, int idx)
+static inline __pure u32 encode_tail(int cpu, int idx)
 {
 	u32 tail;
 
@@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx)
 	return tail;
 }
 
-static inline struct mcs_spinlock *decode_tail(u32 tail)
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
 {
 	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
 	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
@@ -455,6 +455,8 @@ queue:
 	 * pending stuff.
 	 *
 	 * p,*,* -> n,*,*
+	 *
+	 * RELEASE, such that the stores to @node must be complete.
 	 */
 	old = xchg_tail(lock, tail);
 	next = NULL;
@@ -465,6 +467,15 @@ queue:
 	 */
 	if (old & _Q_TAIL_MASK) {
 		prev = decode_tail(old);
+		/*
+		 * The above xchg_tail() is also a load of @lock which generates,
+		 * through decode_tail(), a pointer.
+		 *
+		 * The address dependency matches the RELEASE of xchg_tail()
+		 * such that the access to @prev must happen after.
+		 */
+		smp_read_barrier_depends();
+
 		WRITE_ONCE(prev->next, node);
 
 		pv_wait_node(node, prev);
-- 
cgit v1.3-8-gc7d7


From 055ce0fd1b86c204430cbc0887165599d6e15090 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Jun 2016 10:36:53 +0200
Subject: locking/qspinlock: Add comments

I figured we need to document the spin_is_locked() and
spin_unlock_wait() constraints somwehere.

Ideally 'someone' would rewrite Documentation/atomic_ops.txt and we
could find a place in there. But currently that document is stale to
the point of hardly being useful.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <waiman.long@hpe.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ee7deb08d43d..2f9153b183c9 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -267,6 +267,63 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
 #endif
 
+/*
+ * Various notes on spin_is_locked() and spin_unlock_wait(), which are
+ * 'interesting' functions:
+ *
+ * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
+ * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
+ * PPC). Also qspinlock has a similar issue per construction, the setting of
+ * the locked byte can be unordered acquiring the lock proper.
+ *
+ * This gets to be 'interesting' in the following cases, where the /should/s
+ * end up false because of this issue.
+ *
+ *
+ * CASE 1:
+ *
+ * So the spin_is_locked() correctness issue comes from something like:
+ *
+ *   CPU0				CPU1
+ *
+ *   global_lock();			local_lock(i)
+ *     spin_lock(&G)			  spin_lock(&L[i])
+ *     for (i)				  if (!spin_is_locked(&G)) {
+ *       spin_unlock_wait(&L[i]);	    smp_acquire__after_ctrl_dep();
+ *					    return;
+ *					  }
+ *					  // deal with fail
+ *
+ * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
+ * that there is exclusion between the two critical sections.
+ *
+ * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
+ * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
+ * /should/ be constrained by the ACQUIRE from spin_lock(&G).
+ *
+ * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
+ *
+ *
+ * CASE 2:
+ *
+ * For spin_unlock_wait() there is a second correctness issue, namely:
+ *
+ *   CPU0				CPU1
+ *
+ *   flag = set;
+ *   smp_mb();				spin_lock(&l)
+ *   spin_unlock_wait(&l);		if (!flag)
+ *					  // add to lockless list
+ *					spin_unlock(&l);
+ *   // iterate lockless list
+ *
+ * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
+ * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
+ * semantics etc..)
+ *
+ * Where flag /should/ be ordered against the locked store of l.
+ */
+
 /*
  * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
  * issuing an _unordered_ store to set _Q_LOCKED_VAL.
-- 
cgit v1.3-8-gc7d7


From 8ee62b1870be8e630158701632a533d0378e15b8 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hpe.com>
Date: Fri, 3 Jun 2016 22:26:02 -0700
Subject: locking/rwsem: Convert sem->count to 'atomic_long_t'

Convert the rwsem count variable to an atomic_long_t since we use it
as an atomic variable. This also allows us to remove the
rwsem_atomic_{add,update}() "abstraction" which would now be an unnecesary
level of indirection. In follow up patches, we also remove the
rwsem_atomic_{add,update}() definitions across the various architectures.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jason Low <jason.low2@hpe.com>
[ Build warning fixes on various architectures. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: Terry Rudd <terry.rudd@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Waiman Long <Waiman.Long@hpe.com>
Link: http://lkml.kernel.org/r/1465017963-4839-2-git-send-email-jason.low2@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/rwsem.h | 26 +++++++++++++-------------
 arch/ia64/include/asm/rwsem.h  | 24 ++++++++++++------------
 include/asm-generic/rwsem.h    |  6 +++---
 include/linux/rwsem.h          |  8 +++++---
 kernel/locking/rwsem-xadd.c    | 32 +++++++++++++++++---------------
 5 files changed, 50 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 0131a7058778..b40021aabb9f 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -25,8 +25,8 @@ static inline void __down_read(struct rw_semaphore *sem)
 {
 	long oldcount;
 #ifndef	CONFIG_SMP
-	oldcount = sem->count;
-	sem->count += RWSEM_ACTIVE_READ_BIAS;
+	oldcount = sem->count.counter;
+	sem->count.counter += RWSEM_ACTIVE_READ_BIAS;
 #else
 	long temp;
 	__asm__ __volatile__(
@@ -52,13 +52,13 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
 	long old, new, res;
 
-	res = sem->count;
+	res = atomic_long_read(&sem->count);
 	do {
 		new = res + RWSEM_ACTIVE_READ_BIAS;
 		if (new <= 0)
 			break;
 		old = res;
-		res = cmpxchg(&sem->count, old, new);
+		res = atomic_long_cmpxchg(&sem->count, old, new);
 	} while (res != old);
 	return res >= 0 ? 1 : 0;
 }
@@ -67,8 +67,8 @@ static inline long ___down_write(struct rw_semaphore *sem)
 {
 	long oldcount;
 #ifndef	CONFIG_SMP
-	oldcount = sem->count;
-	sem->count += RWSEM_ACTIVE_WRITE_BIAS;
+	oldcount = sem->count.counter;
+	sem->count.counter += RWSEM_ACTIVE_WRITE_BIAS;
 #else
 	long temp;
 	__asm__ __volatile__(
@@ -106,7 +106,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
  */
 static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
-	long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+	long ret = atomic_long_cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
 			   RWSEM_ACTIVE_WRITE_BIAS);
 	if (ret == RWSEM_UNLOCKED_VALUE)
 		return 1;
@@ -117,8 +117,8 @@ static inline void __up_read(struct rw_semaphore *sem)
 {
 	long oldcount;
 #ifndef	CONFIG_SMP
-	oldcount = sem->count;
-	sem->count -= RWSEM_ACTIVE_READ_BIAS;
+	oldcount = sem->count.counter;
+	sem->count.counter -= RWSEM_ACTIVE_READ_BIAS;
 #else
 	long temp;
 	__asm__ __volatile__(
@@ -142,8 +142,8 @@ static inline void __up_write(struct rw_semaphore *sem)
 {
 	long count;
 #ifndef	CONFIG_SMP
-	sem->count -= RWSEM_ACTIVE_WRITE_BIAS;
-	count = sem->count;
+	sem->count.counter -= RWSEM_ACTIVE_WRITE_BIAS;
+	count = sem->count.counter;
 #else
 	long temp;
 	__asm__ __volatile__(
@@ -171,8 +171,8 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 {
 	long oldcount;
 #ifndef	CONFIG_SMP
-	oldcount = sem->count;
-	sem->count -= RWSEM_WAITING_BIAS;
+	oldcount = sem->count.counter;
+	sem->count.counter -= RWSEM_WAITING_BIAS;
 #else
 	long temp;
 	__asm__ __volatile__(
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 8b23e070b844..c5d544f188ed 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -40,7 +40,7 @@
 static inline void
 __down_read (struct rw_semaphore *sem)
 {
-	long result = ia64_fetchadd8_acq((unsigned long *)&sem->count, 1);
+	long result = ia64_fetchadd8_acq((unsigned long *)&sem->count.counter, 1);
 
 	if (result < 0)
 		rwsem_down_read_failed(sem);
@@ -55,9 +55,9 @@ ___down_write (struct rw_semaphore *sem)
 	long old, new;
 
 	do {
-		old = sem->count;
+		old = atomic_long_read(&sem->count);
 		new = old + RWSEM_ACTIVE_WRITE_BIAS;
-	} while (cmpxchg_acq(&sem->count, old, new) != old);
+	} while (atomic_long_cmpxchg_acquire(&sem->count, old, new) != old);
 
 	return old;
 }
@@ -85,7 +85,7 @@ __down_write_killable (struct rw_semaphore *sem)
 static inline void
 __up_read (struct rw_semaphore *sem)
 {
-	long result = ia64_fetchadd8_rel((unsigned long *)&sem->count, -1);
+	long result = ia64_fetchadd8_rel((unsigned long *)&sem->count.counter, -1);
 
 	if (result < 0 && (--result & RWSEM_ACTIVE_MASK) == 0)
 		rwsem_wake(sem);
@@ -100,9 +100,9 @@ __up_write (struct rw_semaphore *sem)
 	long old, new;
 
 	do {
-		old = sem->count;
+		old = atomic_long_read(&sem->count);
 		new = old - RWSEM_ACTIVE_WRITE_BIAS;
-	} while (cmpxchg_rel(&sem->count, old, new) != old);
+	} while (atomic_long_cmpxchg_release(&sem->count, old, new) != old);
 
 	if (new < 0 && (new & RWSEM_ACTIVE_MASK) == 0)
 		rwsem_wake(sem);
@@ -115,8 +115,8 @@ static inline int
 __down_read_trylock (struct rw_semaphore *sem)
 {
 	long tmp;
-	while ((tmp = sem->count) >= 0) {
-		if (tmp == cmpxchg_acq(&sem->count, tmp, tmp+1)) {
+	while ((tmp = atomic_long_read(&sem->count)) >= 0) {
+		if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, tmp+1)) {
 			return 1;
 		}
 	}
@@ -129,8 +129,8 @@ __down_read_trylock (struct rw_semaphore *sem)
 static inline int
 __down_write_trylock (struct rw_semaphore *sem)
 {
-	long tmp = cmpxchg_acq(&sem->count, RWSEM_UNLOCKED_VALUE,
-			      RWSEM_ACTIVE_WRITE_BIAS);
+	long tmp = atomic_long_cmpxchg_acquire(&sem->count,
+			RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS);
 	return tmp == RWSEM_UNLOCKED_VALUE;
 }
 
@@ -143,9 +143,9 @@ __downgrade_write (struct rw_semaphore *sem)
 	long old, new;
 
 	do {
-		old = sem->count;
+		old = atomic_long_read(&sem->count);
 		new = old - RWSEM_WAITING_BIAS;
-	} while (cmpxchg_rel(&sem->count, old, new) != old);
+	} while (atomic_long_cmpxchg_release(&sem->count, old, new) != old);
 
 	if (old < 0)
 		rwsem_downgrade_wake(sem);
diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index 3fc94a046bf5..a3a93eca766c 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -41,8 +41,8 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
 	long tmp;
 
-	while ((tmp = sem->count) >= 0) {
-		if (tmp == cmpxchg_acquire(&sem->count, tmp,
+	while ((tmp = atomic_long_read(&sem->count)) >= 0) {
+		if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp,
 				   tmp + RWSEM_ACTIVE_READ_BIAS)) {
 			return 1;
 		}
@@ -79,7 +79,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
 	long tmp;
 
-	tmp = cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
+	tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
 		      RWSEM_ACTIVE_WRITE_BIAS);
 	return tmp == RWSEM_UNLOCKED_VALUE;
 }
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index d37fbb34d06f..dd1d14250340 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -23,10 +23,11 @@ struct rw_semaphore;
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
 #include <linux/rwsem-spinlock.h> /* use a generic implementation */
+#define __RWSEM_INIT_COUNT(name)	.count = RWSEM_UNLOCKED_VALUE
 #else
 /* All arch specific implementations share the same struct */
 struct rw_semaphore {
-	long count;
+	atomic_long_t count;
 	struct list_head wait_list;
 	raw_spinlock_t wait_lock;
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
@@ -54,9 +55,10 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 /* In all implementations count != 0 means locked */
 static inline int rwsem_is_locked(struct rw_semaphore *sem)
 {
-	return sem->count != 0;
+	return atomic_long_read(&sem->count) != 0;
 }
 
+#define __RWSEM_INIT_COUNT(name)	.count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)
 #endif
 
 /* Common initializer macros and functions */
@@ -74,7 +76,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 #endif
 
 #define __RWSEM_INITIALIZER(name)				\
-	{ .count = RWSEM_UNLOCKED_VALUE,			\
+	{ __RWSEM_INIT_COUNT(name),				\
 	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
 	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)	\
 	  __RWSEM_OPT_INIT(name)				\
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b957da7fcb19..63b40a5c62ec 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 	lockdep_init_map(&sem->dep_map, name, key, 0);
 #endif
-	sem->count = RWSEM_UNLOCKED_VALUE;
+	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
 	raw_spin_lock_init(&sem->wait_lock);
 	INIT_LIST_HEAD(&sem->wait_list);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
@@ -153,10 +153,11 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
 	if (wake_type != RWSEM_WAKE_READ_OWNED) {
 		adjustment = RWSEM_ACTIVE_READ_BIAS;
  try_reader_grant:
-		oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+		oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment;
+
 		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
 			/* A writer stole the lock. Undo our reader grant. */
-			if (rwsem_atomic_update(-adjustment, sem) &
+			if (atomic_long_sub_return(adjustment, &sem->count) &
 						RWSEM_ACTIVE_MASK)
 				goto out;
 			/* Last active locker left. Retry waking readers. */
@@ -186,7 +187,7 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
 		adjustment -= RWSEM_WAITING_BIAS;
 
 	if (adjustment)
-		rwsem_atomic_add(adjustment, sem);
+		atomic_long_add(adjustment, &sem->count);
 
 	next = sem->wait_list.next;
 	loop = woken;
@@ -233,7 +234,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 	list_add_tail(&waiter.list, &sem->wait_list);
 
 	/* we're now waiting on the lock, but no longer actively locking */
-	count = rwsem_atomic_update(adjustment, sem);
+	count = atomic_long_add_return(adjustment, &sem->count);
 
 	/* If there are no active locks, wake the front queued process(es).
 	 *
@@ -282,7 +283,8 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 			RWSEM_ACTIVE_WRITE_BIAS :
 			RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
 
-	if (cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) {
+	if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
+							== RWSEM_WAITING_BIAS) {
 		rwsem_set_owner(sem);
 		return true;
 	}
@@ -296,13 +298,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
  */
 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 {
-	long old, count = READ_ONCE(sem->count);
+	long old, count = atomic_long_read(&sem->count);
 
 	while (true) {
 		if (!(count == 0 || count == RWSEM_WAITING_BIAS))
 			return false;
 
-		old = cmpxchg_acquire(&sem->count, count,
+		old = atomic_long_cmpxchg_acquire(&sem->count, count,
 				      count + RWSEM_ACTIVE_WRITE_BIAS);
 		if (old == count) {
 			rwsem_set_owner(sem);
@@ -324,7 +326,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 	rcu_read_lock();
 	owner = READ_ONCE(sem->owner);
 	if (!owner) {
-		long count = READ_ONCE(sem->count);
+		long count = atomic_long_read(&sem->count);
 		/*
 		 * If sem->owner is not set, yet we have just recently entered the
 		 * slowpath with the lock being active, then there is a possibility
@@ -375,7 +377,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 	 * held by readers. Check the counter to verify the
 	 * state.
 	 */
-	count = READ_ONCE(sem->count);
+	count = atomic_long_read(&sem->count);
 	return (count == 0 || count == RWSEM_WAITING_BIAS);
 }
 
@@ -460,7 +462,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 	WAKE_Q(wake_q);
 
 	/* undo write bias from down_write operation, stop active locking */
-	count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+	count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
 
 	/* do optimistic spinning and steal lock if possible */
 	if (rwsem_optimistic_spin(sem))
@@ -483,7 +485,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	if (waiting) {
-		count = READ_ONCE(sem->count);
+		count = atomic_long_read(&sem->count);
 
 		/*
 		 * If there were already threads queued before us and there are
@@ -505,7 +507,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 		}
 
 	} else
-		count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+		count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
 
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
@@ -521,7 +523,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 
 			schedule();
 			set_current_state(state);
-		} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
+		} while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
 
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
@@ -536,7 +538,7 @@ out_nolock:
 	raw_spin_lock_irq(&sem->wait_lock);
 	list_del(&waiter.list);
 	if (list_empty(&sem->wait_list))
-		rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
+		atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
 	else
 		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
-- 
cgit v1.3-8-gc7d7


From 19c5d690e41697fcdd19379ab9d10d8d37818414 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Tue, 17 May 2016 21:26:19 -0400
Subject: locking/rwsem: Add reader-owned state to the owner field

Currently, it is not possible to determine for sure if a reader
owns a rwsem by looking at the content of the rwsem data structure.
This patch adds a new state RWSEM_READER_OWNED to the owner field
to indicate that readers currently own the lock. This enables us to
address the following 2 issues in the rwsem optimistic spinning code:

 1) rwsem_can_spin_on_owner() will disallow optimistic spinning if
    the owner field is NULL which can mean either the readers own
    the lock or the owning writer hasn't set the owner field yet.
    In the latter case, we miss the chance to do optimistic spinning.

 2) While a writer is waiting in the OSQ and a reader takes the lock,
    the writer will continue to spin when out of the OSQ in the main
    rwsem_optimistic_spin() loop as the owner field is NULL wasting
    CPU cycles if some of readers are sleeping.

Adding the new state will allow optimistic spinning to go forward as
long as the owner field is not RWSEM_READER_OWNED and the owner is
running, if set, but stop immediately when that state has been reached.

On a 4-socket Haswell machine running on a 4.6-rc1 based kernel, the
fio test with multithreaded randrw and randwrite tests on the same
file on a XFS partition on top of a NVDIMM were run, the aggregated
bandwidths before and after the patch were as follows:

  Test      BW before patch     BW after patch  % change
  ----      ---------------     --------------  --------
  randrw         988 MB/s          1192 MB/s      +21%
  randwrite     1513 MB/s          1623 MB/s      +7.3%

The perf profile of the rwsem_down_write_failed() function in randrw
before and after the patch were:

   19.95%  5.88%  fio  [kernel.vmlinux]  [k] rwsem_down_write_failed
   14.20%  1.52%  fio  [kernel.vmlinux]  [k] rwsem_down_write_failed

The actual CPU cycles spend in rwsem_down_write_failed() dropped from
5.88% to 1.52% after the patch.

The xfstests was also run and no regression was observed.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Jason Low <jason.low2@hp.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1463534783-38814-2-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 41 ++++++++++++++++++++++-------------------
 kernel/locking/rwsem.c      |  8 ++++++--
 kernel/locking/rwsem.h      | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 63b40a5c62ec..6b0d0605910e 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -163,6 +163,12 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
 			/* Last active locker left. Retry waking readers. */
 			goto try_reader_grant;
 		}
+		/*
+		 * It is not really necessary to set it to reader-owned here,
+		 * but it gives the spinners an early indication that the
+		 * readers now have the lock.
+		 */
+		rwsem_set_reader_owned(sem);
 	}
 
 	/* Grant an infinite number of read locks to the readers at the front
@@ -325,16 +331,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 
 	rcu_read_lock();
 	owner = READ_ONCE(sem->owner);
-	if (!owner) {
-		long count = atomic_long_read(&sem->count);
+	if (!rwsem_owner_is_writer(owner)) {
 		/*
-		 * If sem->owner is not set, yet we have just recently entered the
-		 * slowpath with the lock being active, then there is a possibility
-		 * reader(s) may have the lock. To be safe, bail spinning in these
-		 * situations.
+		 * Don't spin if the rwsem is readers owned.
 		 */
-		if (count & RWSEM_ACTIVE_MASK)
-			ret = false;
+		ret = !rwsem_owner_is_reader(owner);
 		goto done;
 	}
 
@@ -347,8 +348,6 @@ done:
 static noinline
 bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 {
-	long count;
-
 	rcu_read_lock();
 	while (sem->owner == owner) {
 		/*
@@ -369,16 +368,11 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 	}
 	rcu_read_unlock();
 
-	if (READ_ONCE(sem->owner))
-		return true; /* new owner, continue spinning */
-
 	/*
-	 * When the owner is not set, the lock could be free or
-	 * held by readers. Check the counter to verify the
-	 * state.
+	 * If there is a new owner or the owner is not set, we continue
+	 * spinning.
 	 */
-	count = atomic_long_read(&sem->count);
-	return (count == 0 || count == RWSEM_WAITING_BIAS);
+	return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
 }
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -397,7 +391,16 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 
 	while (true) {
 		owner = READ_ONCE(sem->owner);
-		if (owner && !rwsem_spin_on_owner(sem, owner))
+		/*
+		 * Don't spin if
+		 * 1) the owner is a reader as we we can't determine if the
+		 *    reader is actively running or not.
+		 * 2) The rwsem_spin_on_owner() returns false which means
+		 *    the owner isn't running.
+		 */
+		if (rwsem_owner_is_reader(owner) ||
+		   (rwsem_owner_is_writer(owner) &&
+		   !rwsem_spin_on_owner(sem, owner)))
 			break;
 
 		/* wait_lock will be acquired if write_lock is obtained */
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2e853ad93a3a..45ba475d4be3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem)
 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 
 	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+	rwsem_set_reader_owned(sem);
 }
 
 EXPORT_SYMBOL(down_read);
@@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem)
 {
 	int ret = __down_read_trylock(sem);
 
-	if (ret == 1)
+	if (ret == 1) {
 		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+		rwsem_set_reader_owned(sem);
+	}
 	return ret;
 }
 
@@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem)
 	 * lockdep: a downgraded write will live on as a write
 	 * dependency.
 	 */
-	rwsem_clear_owner(sem);
+	rwsem_set_reader_owned(sem);
 	__downgrade_write(sem);
 }
 
@@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
 	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
 
 	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+	rwsem_set_reader_owned(sem);
 }
 
 EXPORT_SYMBOL(down_read_nested);
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 870ed9a5b426..8f43ba234787 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,3 +1,20 @@
+/*
+ * The owner field of the rw_semaphore structure will be set to
+ * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
+ * the owner field when it unlocks. A reader, on the other hand, will
+ * not touch the owner field when it unlocks.
+ *
+ * In essence, the owner field now has the following 3 states:
+ *  1) 0
+ *     - lock is free or the owner hasn't set the field yet
+ *  2) RWSEM_READER_OWNED
+ *     - lock is currently or previously owned by readers (lock is free
+ *       or not set by owner yet)
+ *  3) Other non-zero value
+ *     - a writer owns the lock
+ */
+#define RWSEM_READER_OWNED	((struct task_struct *)1UL)
+
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
@@ -9,6 +26,26 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 	sem->owner = NULL;
 }
 
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+	/*
+	 * We check the owner value first to make sure that we will only
+	 * do a write to the rwsem cacheline when it is really necessary
+	 * to minimize cacheline contention.
+	 */
+	if (sem->owner != RWSEM_READER_OWNED)
+		sem->owner = RWSEM_READER_OWNED;
+}
+
+static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+{
+	return owner && owner != RWSEM_READER_OWNED;
+}
+
+static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+{
+	return owner == RWSEM_READER_OWNED;
+}
 #else
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
@@ -17,4 +54,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 {
 }
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+}
 #endif
-- 
cgit v1.3-8-gc7d7


From fb6a44f33be542fd81575ff93a4e8118d6a58592 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Tue, 17 May 2016 21:26:20 -0400
Subject: locking/rwsem: Protect all writes to owner by WRITE_ONCE()

Without using WRITE_ONCE(), the compiler can potentially break a
write into multiple smaller ones (store tearing). So a read from the
same data by another task concurrently may return a partial result.
This can result in a kernel crash if the data is a memory address
that is being dereferenced.

This patch changes all write to rwsem->owner to use WRITE_ONCE()
to make sure that store tearing will not happen. READ_ONCE() may
not be needed for rwsem->owner as long as the value is only used for
comparison and not dereferencing.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1463534783-38814-3-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 8f43ba234787..a699f4048ba1 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -16,14 +16,21 @@
 #define RWSEM_READER_OWNED	((struct task_struct *)1UL)
 
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ */
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
-	sem->owner = current;
+	WRITE_ONCE(sem->owner, current);
 }
 
 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 {
-	sem->owner = NULL;
+	WRITE_ONCE(sem->owner, NULL);
 }
 
 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
@@ -34,7 +41,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 	 * to minimize cacheline contention.
 	 */
 	if (sem->owner != RWSEM_READER_OWNED)
-		sem->owner = RWSEM_READER_OWNED;
+		WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
 }
 
 static inline bool rwsem_owner_is_writer(struct task_struct *owner)
-- 
cgit v1.3-8-gc7d7


From bf7b4c472db44413251bcef79ca1f6bf1ec81475 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Tue, 17 May 2016 21:26:22 -0400
Subject: locking/rwsem: Improve reader wakeup code

In __rwsem_do_wake(), the reader wakeup code will assume a writer
has stolen the lock if the active reader/writer count is not 0.
However, this is not as reliable an indicator as the original
"< RWSEM_WAITING_BIAS" check. If another reader is present, the code
will still break out and exit even if the writer is gone. This patch
changes it to check the same "< RWSEM_WAITING_BIAS" condition to
reduce the chance of false positive.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1463534783-38814-5-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 6b0d0605910e..4f1daf5a472d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -156,9 +156,14 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
 		oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment;
 
 		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
-			/* A writer stole the lock. Undo our reader grant. */
-			if (atomic_long_sub_return(adjustment, &sem->count) &
-						RWSEM_ACTIVE_MASK)
+			/*
+			 * If the count is still less than RWSEM_WAITING_BIAS
+			 * after removing the adjustment, it is assumed that
+			 * a writer has stolen the lock. We have to undo our
+			 * reader grant.
+			 */
+			if (atomic_long_add_return(-adjustment, &sem->count) <
+			    RWSEM_WAITING_BIAS)
 				goto out;
 			/* Last active locker left. Retry waking readers. */
 			goto try_reader_grant;
-- 
cgit v1.3-8-gc7d7


From ddd0fa73c2b71c35de4fe7ae60a5f1a6cddc2cf0 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Tue, 17 May 2016 21:26:23 -0400
Subject: locking/rwsem: Streamline the rwsem_optimistic_spin() code

This patch moves the owner loading and checking code entirely inside of
rwsem_spin_on_owner() to simplify the logic of rwsem_optimistic_spin()
loop.

Suggested-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1463534783-38814-6-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 4f1daf5a472d..2031281bb940 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -350,9 +350,16 @@ done:
 	return ret;
 }
 
-static noinline
-bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+/*
+ * Return true only if we can still spin on the owner field of the rwsem.
+ */
+static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 {
+	struct task_struct *owner = READ_ONCE(sem->owner);
+
+	if (!rwsem_owner_is_writer(owner))
+		goto out;
+
 	rcu_read_lock();
 	while (sem->owner == owner) {
 		/*
@@ -372,7 +379,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 		cpu_relax_lowlatency();
 	}
 	rcu_read_unlock();
-
+out:
 	/*
 	 * If there is a new owner or the owner is not set, we continue
 	 * spinning.
@@ -382,7 +389,6 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
-	struct task_struct *owner;
 	bool taken = false;
 
 	preempt_disable();
@@ -394,21 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 	if (!osq_lock(&sem->osq))
 		goto done;
 
-	while (true) {
-		owner = READ_ONCE(sem->owner);
+	/*
+	 * Optimistically spin on the owner field and attempt to acquire the
+	 * lock whenever the owner changes. Spinning will be stopped when:
+	 *  1) the owning writer isn't running; or
+	 *  2) readers own the lock as we can't determine if they are
+	 *     actively running or not.
+	 */
+	while (rwsem_spin_on_owner(sem)) {
 		/*
-		 * Don't spin if
-		 * 1) the owner is a reader as we we can't determine if the
-		 *    reader is actively running or not.
-		 * 2) The rwsem_spin_on_owner() returns false which means
-		 *    the owner isn't running.
+		 * Try to acquire the lock
 		 */
-		if (rwsem_owner_is_reader(owner) ||
-		   (rwsem_owner_is_writer(owner) &&
-		   !rwsem_spin_on_owner(sem, owner)))
-			break;
-
-		/* wait_lock will be acquired if write_lock is obtained */
 		if (rwsem_try_write_lock_unqueued(sem)) {
 			taken = true;
 			break;
@@ -420,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 		 * we're an RT task that will live-lock because we won't let
 		 * the owner complete.
 		 */
-		if (!owner && (need_resched() || rt_task(current)))
+		if (!sem->owner && (need_resched() || rt_task(current)))
 			break;
 
 		/*
-- 
cgit v1.3-8-gc7d7


From 6428671bae97caa7040e24e79e969fd87908f4f3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 1 Jun 2016 20:58:15 +0200
Subject: locking/mutex: Optimize mutex_trylock() fast-path

A while back Viro posted a number of 'interesting' mutex_is_locked()
users on IRC, one of those was RCU.

RCU seems to use mutex_is_locked() to avoid doing mutex_trylock(), the
regular load before modify pattern.

While the use isn't wrong per se, its curious in that its needed at all,
mutex_trylock() should be good enough on its own to avoid the pointless
cacheline bounces.

So fix those and remove the mutex_is_locked() (ab)use from RCU.

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <Waiman.Long@hpe.com>
Link: http://lkml.kernel.org/r/20160601185815.GW3190@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/include/asm/mutex.h    | 2 +-
 arch/powerpc/include/asm/mutex.h | 2 +-
 arch/x86/include/asm/mutex_32.h  | 2 +-
 arch/x86/include/asm/mutex_64.h  | 6 +++---
 include/asm-generic/mutex-dec.h  | 2 +-
 include/asm-generic/mutex-xchg.h | 6 +++++-
 kernel/rcu/tree.c                | 1 -
 7 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/include/asm/mutex.h b/arch/ia64/include/asm/mutex.h
index f41e66d65e31..28cb819e0ff9 100644
--- a/arch/ia64/include/asm/mutex.h
+++ b/arch/ia64/include/asm/mutex.h
@@ -82,7 +82,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-	if (cmpxchg_acq(count, 1, 0) == 1)
+	if (atomic_read(count) == 1 && cmpxchg_acq(count, 1, 0) == 1)
 		return 1;
 	return 0;
 }
diff --git a/arch/powerpc/include/asm/mutex.h b/arch/powerpc/include/asm/mutex.h
index 127ab23e1f6c..078155fa1189 100644
--- a/arch/powerpc/include/asm/mutex.h
+++ b/arch/powerpc/include/asm/mutex.h
@@ -124,7 +124,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
+	if (likely(atomic_read(count) == 1 && __mutex_cmpxchg_lock(count, 1, 0) == 1))
 		return 1;
 	return 0;
 }
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
index 85e6cda45a02..e9355a84fc67 100644
--- a/arch/x86/include/asm/mutex_32.h
+++ b/arch/x86/include/asm/mutex_32.h
@@ -101,7 +101,7 @@ static inline int __mutex_fastpath_trylock(atomic_t *count,
 					   int (*fail_fn)(atomic_t *))
 {
 	/* cmpxchg because it never induces a false contention state. */
-	if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+	if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1))
 		return 1;
 
 	return 0;
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
index 07537a44216e..d9850758464e 100644
--- a/arch/x86/include/asm/mutex_64.h
+++ b/arch/x86/include/asm/mutex_64.h
@@ -118,10 +118,10 @@ do {								\
 static inline int __mutex_fastpath_trylock(atomic_t *count,
 					   int (*fail_fn)(atomic_t *))
 {
-	if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+	if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1))
 		return 1;
-	else
-		return 0;
+
+	return 0;
 }
 
 #endif /* _ASM_X86_MUTEX_64_H */
diff --git a/include/asm-generic/mutex-dec.h b/include/asm-generic/mutex-dec.h
index fd694cfd678a..c54829d3de37 100644
--- a/include/asm-generic/mutex-dec.h
+++ b/include/asm-generic/mutex-dec.h
@@ -80,7 +80,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-	if (likely(atomic_cmpxchg_acquire(count, 1, 0) == 1))
+	if (likely(atomic_read(count) == 1 && atomic_cmpxchg_acquire(count, 1, 0) == 1))
 		return 1;
 	return 0;
 }
diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h
index a6b4a7bd6ac9..3269ec4e195f 100644
--- a/include/asm-generic/mutex-xchg.h
+++ b/include/asm-generic/mutex-xchg.h
@@ -91,8 +91,12 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-	int prev = atomic_xchg_acquire(count, 0);
+	int prev;
 
+	if (atomic_read(count) != 1)
+		return 0;
+
+	prev = atomic_xchg_acquire(count, 0);
 	if (unlikely(prev < 0)) {
 		/*
 		 * The lock was marked contended so we must restore that
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c7f1bc4f817c..b7326893221f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3681,7 +3681,6 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
 	    (rnp == rnp_root ||
 	     ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
-	    !mutex_is_locked(&rsp->exp_mutex) &&
 	    mutex_trylock(&rsp->exp_mutex))
 		goto fastpath;
 
-- 
cgit v1.3-8-gc7d7


From 288dab8a35a0bde426a09870943c8d3ee3a50dab Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 9 Jun 2016 16:00:36 +0200
Subject: block: add a separate operation type for secure erase

Instead of overloading the discard support with the REQ_SECURE flag.
Use the opportunity to rename the queue flag as well, and remove the
dead checks for this flag in the RAID 1 and RAID 10 drivers that don't
claim support for secure erase.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c                   | 27 +++++++++++++++++----------
 block/blk-lib.c                    | 25 ++++++++++++++-----------
 block/blk-merge.c                  |  6 ++----
 drivers/block/xen-blkback/xenbus.c |  2 +-
 drivers/block/xen-blkfront.c       | 14 +++++++++-----
 drivers/md/raid1.c                 |  3 +--
 drivers/md/raid10.c                |  5 ++---
 drivers/mmc/card/block.c           | 10 ++++++----
 drivers/mmc/card/queue.c           |  2 +-
 include/linux/blk_types.h          |  5 ++---
 include/linux/blkdev.h             | 23 ++++-------------------
 kernel/trace/blktrace.c            |  6 ++++--
 12 files changed, 63 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-core.c b/block/blk-core.c
index 32a283eb7274..db31a2981223 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1977,16 +1977,21 @@ generic_make_request_checks(struct bio *bio)
 		}
 	}
 
-	if ((bio_op(bio) == REQ_OP_DISCARD) &&
-	    (!blk_queue_discard(q) ||
-	     ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
-		err = -EOPNOTSUPP;
-		goto end_io;
-	}
-
-	if (bio_op(bio) == REQ_OP_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
-		err = -EOPNOTSUPP;
-		goto end_io;
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+		if (!blk_queue_discard(q))
+			goto not_supported;
+		break;
+	case REQ_OP_SECURE_ERASE:
+		if (!blk_queue_secure_erase(q))
+			goto not_supported;
+		break;
+	case REQ_OP_WRITE_SAME:
+		if (!bdev_write_same(bio->bi_bdev))
+			goto not_supported;
+		break;
+	default:
+		break;
 	}
 
 	/*
@@ -2003,6 +2008,8 @@ generic_make_request_checks(struct bio *bio)
 	trace_block_bio_queue(q, bio);
 	return true;
 
+not_supported:
+	err = -EOPNOTSUPP;
 end_io:
 	bio->bi_error = err;
 	bio_endio(bio);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index ff2a7f04af4d..78626c2fde33 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -23,20 +23,27 @@ static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
 }
 
 int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, int op_flags,
+		sector_t nr_sects, gfp_t gfp_mask, int flags,
 		struct bio **biop)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	struct bio *bio = *biop;
 	unsigned int granularity;
+	enum req_op op;
 	int alignment;
 
 	if (!q)
 		return -ENXIO;
-	if (!blk_queue_discard(q))
-		return -EOPNOTSUPP;
-	if ((op_flags & REQ_SECURE) && !blk_queue_secdiscard(q))
-		return -EOPNOTSUPP;
+
+	if (flags & BLKDEV_DISCARD_SECURE) {
+		if (!blk_queue_secure_erase(q))
+			return -EOPNOTSUPP;
+		op = REQ_OP_SECURE_ERASE;
+	} else {
+		if (!blk_queue_discard(q))
+			return -EOPNOTSUPP;
+		op = REQ_OP_DISCARD;
+	}
 
 	/* Zero-sector (unknown) and one-sector granularities are the same.  */
 	granularity = max(q->limits.discard_granularity >> 9, 1U);
@@ -66,7 +73,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio = next_bio(bio, 1, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev = bdev;
-		bio_set_op_attrs(bio, REQ_OP_DISCARD, op_flags);
+		bio_set_op_attrs(bio, op, 0);
 
 		bio->bi_iter.bi_size = req_sects << 9;
 		nr_sects -= req_sects;
@@ -100,16 +107,12 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
 {
-	int op_flags = 0;
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 	int ret;
 
-	if (flags & BLKDEV_DISCARD_SECURE)
-		op_flags |= REQ_SECURE;
-
 	blk_start_plug(&plug);
-	ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, op_flags,
+	ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
 			&bio);
 	if (!ret && bio) {
 		ret = submit_bio_wait(bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c265348b75d1..9772308a8391 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -649,8 +649,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	if (!rq_mergeable(req) || !rq_mergeable(next))
 		return 0;
 
-	if (!blk_check_merge_flags(req->cmd_flags, req_op(req), next->cmd_flags,
-				   req_op(next)))
+	if (req_op(req) != req_op(next))
 		return 0;
 
 	/*
@@ -752,8 +751,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (!rq_mergeable(rq) || !bio_mergeable(bio))
 		return false;
 
-	if (!blk_check_merge_flags(rq->cmd_flags, req_op(rq), bio->bi_rw,
-				   bio_op(bio)))
+	if (req_op(rq) != bio_op(bio))
 		return false;
 
 	/* different data direction or already started, don't merge */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 3355f1cdd4e5..2994cfa44c8a 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -480,7 +480,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 	if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
 		vbd->flush_support = true;
 
-	if (q && blk_queue_secdiscard(q))
+	if (q && blk_queue_secure_erase(q))
 		vbd->discard_secure = true;
 
 	pr_debug("Successful creation of handle=%04x (dom=%u)\n",
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 343ef7abe5fd..10711292da2c 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -545,7 +545,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf
 	ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
 	ring_req->u.discard.id = id;
 	ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
-	if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+	if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
 		ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
 	else
 		ring_req->u.discard.flag = 0;
@@ -841,7 +841,7 @@ static int blkif_queue_request(struct request *req, struct blkfront_ring_info *r
 		return 1;
 
 	if (unlikely(req_op(req) == REQ_OP_DISCARD ||
-		     req->cmd_flags & REQ_SECURE))
+		     req_op(req) == REQ_OP_SECURE_ERASE))
 		return blkif_queue_discard_req(req, rinfo);
 	else
 		return blkif_queue_rw_req(req, rinfo);
@@ -955,7 +955,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 		rq->limits.discard_granularity = info->discard_granularity;
 		rq->limits.discard_alignment = info->discard_alignment;
 		if (info->feature_secdiscard)
-			queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
+			queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq);
 	}
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
@@ -1595,7 +1595,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				info->feature_discard = 0;
 				info->feature_secdiscard = 0;
 				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
-				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
+				queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
 			}
 			blk_mq_complete_request(req, error);
 			break;
@@ -2052,10 +2052,14 @@ static int blkif_recover(struct blkfront_info *info)
 			 */
 			if (req_op(copy[i].request) == REQ_OP_FLUSH ||
 			    req_op(copy[i].request) == REQ_OP_DISCARD ||
-			    copy[i].request->cmd_flags & (REQ_FUA | REQ_SECURE)) {
+			    req_op(copy[i].request) == REQ_OP_SECURE_ERASE ||
+			    copy[i].request->cmd_flags & REQ_FUA) {
 				/*
 				 * Flush operations don't contain bios, so
 				 * we need to requeue the whole request
+				 *
+				 * XXX: but this doesn't make any sense for a
+				 * write with the FUA flag set..
 				 */
 				list_add(&copy[i].request->queuelist, &requests);
 				continue;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 10e53cd6a995..41d9c31da3b3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1058,7 +1058,6 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw &
 						(REQ_PREFLUSH | REQ_FUA));
-	const unsigned long do_sec = (bio->bi_rw & REQ_SECURE);
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
@@ -1376,7 +1375,7 @@ read_again:
 				   conf->mirrors[i].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		bio_set_op_attrs(mbio, op, do_flush_fua | do_sync | do_sec);
+		bio_set_op_attrs(mbio, op, do_flush_fua | do_sync);
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 245640b50153..26ae74fd0d01 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1062,7 +1062,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
-	const unsigned long do_sec = (bio->bi_rw & REQ_SECURE);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -1362,7 +1361,7 @@ retry_write:
 							      rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec);
+			bio_set_op_attrs(mbio, op, do_sync | do_fua);
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -1404,7 +1403,7 @@ retry_write:
 						   r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			bio_set_op_attrs(mbio, op, do_sync | do_fua | do_sec);
+			bio_set_op_attrs(mbio, op, do_sync | do_fua);
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index bca20f88a8b2..383184743f9a 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -2167,10 +2167,12 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 		/* complete ongoing async transfer before issuing discard */
 		if (card->host->areq)
 			mmc_blk_issue_rw_rq(mq, NULL);
-		if (req->cmd_flags & REQ_SECURE)
-			ret = mmc_blk_issue_secdiscard_rq(mq, req);
-		else
-			ret = mmc_blk_issue_discard_rq(mq, req);
+		ret = mmc_blk_issue_discard_rq(mq, req);
+	} else if (req && req_op(req) == REQ_OP_SECURE_ERASE) {
+		/* complete ongoing async transfer before issuing secure erase*/
+		if (card->host->areq)
+			mmc_blk_issue_rw_rq(mq, NULL);
+		ret = mmc_blk_issue_secdiscard_rq(mq, req);
 	} else if (req && req_op(req) == REQ_OP_FLUSH) {
 		/* complete ongoing async transfer before issuing flush */
 		if (card->host->areq)
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index c2d5f6f35145..bf14642a576a 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -171,7 +171,7 @@ static void mmc_queue_setup_discard(struct request_queue *q,
 	if (card->pref_erase > max_discard)
 		q->limits.discard_granularity = 0;
 	if (mmc_can_secure_erase_trim(card))
-		queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
+		queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q);
 }
 
 /**
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 562ab8301217..efba1f2ace2e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -163,7 +163,6 @@ enum rq_flag_bits {
 	__REQ_SYNC,		/* request is sync (sync write or read) */
 	__REQ_META,		/* metadata io request */
 	__REQ_PRIO,		/* boost priority in cfq */
-	__REQ_SECURE,		/* secure discard (used with REQ_OP_DISCARD) */
 
 	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
@@ -212,7 +211,7 @@ enum rq_flag_bits {
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
-	 REQ_PREFLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE)
+	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 /* This mask is used for both bio and request merge checking */
@@ -239,7 +238,6 @@ enum rq_flag_bits {
 #define REQ_FLUSH_SEQ		(1ULL << __REQ_FLUSH_SEQ)
 #define REQ_IO_STAT		(1ULL << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1ULL << __REQ_MIXED_MERGE)
-#define REQ_SECURE		(1ULL << __REQ_SECURE)
 #define REQ_PM			(1ULL << __REQ_PM)
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
@@ -248,6 +246,7 @@ enum req_op {
 	REQ_OP_READ,
 	REQ_OP_WRITE,
 	REQ_OP_DISCARD,		/* request to discard sectors */
+	REQ_OP_SECURE_ERASE,	/* request to securely erase sectors */
 	REQ_OP_WRITE_SAME,	/* write same block many times */
 	REQ_OP_FLUSH,		/* request for cache flush */
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0c9f8793c87e..53fee6123893 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -497,7 +497,7 @@ struct request_queue {
 #define QUEUE_FLAG_DISCARD     14	/* supports DISCARD */
 #define QUEUE_FLAG_NOXMERGES   15	/* No extended merges */
 #define QUEUE_FLAG_ADD_RANDOM  16	/* Contributes to random pool */
-#define QUEUE_FLAG_SECDISCARD  17	/* supports SECDISCARD */
+#define QUEUE_FLAG_SECERASE    17	/* supports secure erase */
 #define QUEUE_FLAG_SAME_FORCE  18	/* force complete on same CPU */
 #define QUEUE_FLAG_DEAD        19	/* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
@@ -593,8 +593,8 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
-#define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
-	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
+#define blk_queue_secure_erase(q) \
+	(test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
@@ -675,21 +675,6 @@ static inline bool rq_mergeable(struct request *rq)
 	return true;
 }
 
-static inline bool blk_check_merge_flags(unsigned int flags1, unsigned int op1,
-					 unsigned int flags2, unsigned int op2)
-{
-	if ((op1 == REQ_OP_DISCARD) != (op2 == REQ_OP_DISCARD))
-		return false;
-
-	if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE))
-		return false;
-
-	if ((op1 == REQ_OP_WRITE_SAME) != (op2 == REQ_OP_WRITE_SAME))
-		return false;
-
-	return true;
-}
-
 static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
 {
 	if (bio_data(a) == bio_data(b))
@@ -1158,7 +1143,7 @@ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, int op_flags,
+		sector_t nr_sects, gfp_t gfp_mask, int flags,
 		struct bio **biop);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 03b0dd98ff0e..af49caf973eb 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1791,6 +1791,10 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 	case REQ_OP_DISCARD:
 		rwbs[i++] = 'D';
 		break;
+	case REQ_OP_SECURE_ERASE:
+		rwbs[i++] = 'D';
+		rwbs[i++] = 'E';
+		break;
 	case REQ_OP_FLUSH:
 		rwbs[i++] = 'F';
 		break;
@@ -1809,8 +1813,6 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 		rwbs[i++] = 'S';
 	if (rw & REQ_META)
 		rwbs[i++] = 'M';
-	if (rw & REQ_SECURE)
-		rwbs[i++] = 'E';
 
 	rwbs[i] = '\0';
 }
-- 
cgit v1.3-8-gc7d7


From b5227d03b7191a9a44bf75a4c228a6a9ddbe781b Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 31 May 2016 16:23:02 -0500
Subject: timers: Clarify usleep_range() function comment

Update the usleep_range() function comment to make it clear that it can
only be used in non-atomic context.

Previously we claimed usleep_range() was a drop-in replacement for udelay()
where wakeup is flexible.  But that's only true in non-atomic contexts,
where it's possible to sleep instead of delay.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20160531212302.28502.44995.stgit@bhelgaas-glaptop2.roam.corp.google.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3a95f9728778..67dd6103003a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1702,9 +1702,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)
 }
 
 /**
- * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * usleep_range - Sleep for an approximate time
  * @min: Minimum time in usecs to sleep
  * @max: Maximum time in usecs to sleep
+ *
+ * In non-atomic context where the exact wakeup time is flexible, use
+ * usleep_range() instead of udelay().  The sleep improves responsiveness
+ * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
+ * power usage by allowing hrtimers to take advantage of an already-
+ * scheduled interrupt instead of scheduling a new one just for this sleep.
  */
 void __sched usleep_range(unsigned long min, unsigned long max)
 {
-- 
cgit v1.3-8-gc7d7


From 86721ab63b61ef1dd7305308e4049f644703decf Mon Sep 17 00:00:00 2001
From: Pratyush Patel <pratyushpatel.1995@gmail.com>
Date: Tue, 1 Mar 2016 22:58:49 +0530
Subject: hrtimer: Remove redundant #ifdef block

Only need CONFIG_NO_HZ_COMMON as this block is already in a
CONFIG_SMP block.

Signed-off-by: Pratyush Patel <pratyushpatel.1995@gmail.com>
Link: http://lkml.kernel.org/r/20160301172849.GA18152@cyborg
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e99df0ff1d42..d13c9aebf7a3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
 #endif
 }
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
 static inline
 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
 					 int pinned)
-- 
cgit v1.3-8-gc7d7


From fe3464ca8710012a247bb4586dde21b080f88514 Mon Sep 17 00:00:00 2001
From: Jianyu Zhan <nasa4836@gmail.com>
Date: Sat, 19 Mar 2016 21:59:19 +0800
Subject: genirq: Remove redundant NULL check of irq_desc

for_each_irq_desc() macro has already skipped NULL irq_desc, don't
bother to check it again.

Signed-off-by: Jianyu Zhan <nasa4836@gmail.com>
Cc: mingo@kernel.org
Cc: yhlu.kernel@gmail.com
Link: http://lkml.kernel.org/r/1458395959-7046-1-git-send-email-nasa4836@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/proc.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4e1b94726818..50a8f28be247 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -421,12 +421,8 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc) {
-		if (!desc)
-			continue;
-
+	for_each_irq_desc(irq, desc)
 		register_irq_proc(irq, desc);
-	}
 }
 
 #ifdef CONFIG_GENERIC_IRQ_SHOW
-- 
cgit v1.3-8-gc7d7


From ff5b706f5189fe8d2a6fd576b491b769ec1d29d3 Mon Sep 17 00:00:00 2001
From: Weongyo Jeong <weongyo.linux@gmail.com>
Date: Thu, 31 Mar 2016 12:15:03 -0700
Subject: genirq: Remove unnecessary memset() calls

sprintf() and snprintf() implementation of kernel guarantees that
its result is terminated with null byte if size is larger than 0. So we
don't need to call memset() at all.

Signed-off-by: Weongyo Jeong <weongyo.linux@gmail.com>
Link: http://lkml.kernel.org/r/1459451703-5744-1-git-send-email-weongyo.linux@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/proc.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 50a8f28be247..f30425dce9dd 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 					!name_unique(irq, action))
 		return;
 
-	memset(name, 0, MAX_NAMELEN);
 	snprintf(name, MAX_NAMELEN, "%s", action->name);
 
 	/* create /proc/irq/1234/handler/ */
@@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 	if (desc->dir)
 		goto out_unlock;
 
-	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%d", irq);
 
 	/* create /proc/irq/1234 */
@@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
 #endif
 	remove_proc_entry("spurious", desc->dir);
 
-	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%u", irq);
 	remove_proc_entry(name, root_irq_dir);
 }
-- 
cgit v1.3-8-gc7d7


From b62b2cf5759b0c2206ddff92226f1eb8ac8f9f13 Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Tue, 7 Jun 2016 16:12:26 +0100
Subject: irqdomain: Fix handling of type settings for existing mappings

When mapping an IRQ, it is possible that a mapping for the IRQ already
exists. If mapping does exist then there are the following issues with
regard to the handling of the IRQ type settings ...
1. If the domain is part of a hierarchy, then:
   a. We do not check that the type settings for the existing mapping
      match those of the new mapping.
   b. We do not check to see if the type settings have been programmed
      yet (and they might not have been) and so we may never set the
      type.
2. If the domain is NOT part of a hierarchy, we will overwrite the
   current type settings programmed if they are different from the
   previous mapping. Please note that irq_create_mapping()
   calls irq_find_mapping() to check if a mapping already exists.

Although, it may be unlikely that the type settings for a shared
interrupt would not match, nonetheless we should check for this.
Therefore, to fix this check if a mapping exists (regardless of whether
the domain is part of a hierarchy or not) and if it does then:
1. Return the IRQ number if the type settings match or are not
   specified.
2. Program the type settings and return the IRQ number if the type
   settings have not been programmed yet.
3. Otherwise if the type setting do not match, then print a warning
   and don't return the IRQ number.

Furthermore, add a warning if the type return by irq_domain_translate()
has bits outside the sense mask set and then clear these bits. If these
bits are not cleared then this will cause the comparision of the type
settings for an existing mapping to fail with that of the new mapping
even if the sense bit themselves match. The reason being is that the
existing type settings are read by calling irq_get_trigger_type() which
will clear any bits outside the sense mask. This will allow us to detect
irqchips that are not correctly clearing these bits and fix them.

Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 kernel/irq/irqdomain.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8798b6c9e945..f3ff1eb8dd09 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -588,15 +588,42 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 	if (irq_domain_translate(domain, fwspec, &hwirq, &type))
 		return 0;
 
-	if (irq_domain_is_hierarchy(domain)) {
+	/*
+	 * WARN if the irqchip returns a type with bits
+	 * outside the sense mask set and clear these bits.
+	 */
+	if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
+		type &= IRQ_TYPE_SENSE_MASK;
+
+	/*
+	 * If we've already configured this interrupt,
+	 * don't do it again, or hell will break loose.
+	 */
+	virq = irq_find_mapping(domain, hwirq);
+	if (virq) {
 		/*
-		 * If we've already configured this interrupt,
-		 * don't do it again, or hell will break loose.
+		 * If the trigger type is not specified or matches the
+		 * current trigger type then we are done so return the
+		 * interrupt number.
 		 */
-		virq = irq_find_mapping(domain, hwirq);
-		if (virq)
+		if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
 			return virq;
 
+		/*
+		 * If the trigger type has not been set yet, then set
+		 * it now and return the interrupt number.
+		 */
+		if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
+			irq_set_irq_type(virq, type);
+			return virq;
+		}
+
+		pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
+			hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
+		return 0;
+	}
+
+	if (irq_domain_is_hierarchy(domain)) {
 		virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
 		if (virq <= 0)
 			return 0;
-- 
cgit v1.3-8-gc7d7


From 4b357daed698c95d6b5eacc1c3c4afa206071ba2 Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Tue, 7 Jun 2016 16:12:27 +0100
Subject: genirq: Look-up trigger type if not specified by caller

For some devices the IRQ trigger type for a device is read from
firmware, such as device-tree. The IRQ trigger type is typically read
when the mapping for IRQ is created, which is before the IRQ is
requested. Hence, the IRQ trigger type is programmed when mapping the
IRQ and not when requesting the IRQ.

Although this works for most cases, in order to support IRQ chips which
require runtime power management, which may not be accessible prior
to requesting the IRQ, it is desirable to look-up the IRQ trigger type
when it is requested. Therefore, if the IRQ trigger type is not
specified when __setup_irq() is called, look-up the saved IRQ trigger
type. This will allow us to defer the programming of the trigger type
from when the IRQ is mapped to when it is actually requested.

Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 kernel/irq/manage.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef0bc02c3a70..eaedeb74b49d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1116,6 +1116,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 	new->irq = irq;
 
+	/*
+	 * If the trigger type is not specified by the caller,
+	 * then use the default for this interrupt.
+	 */
+	if (!(new->flags & IRQF_TRIGGER_MASK))
+		new->flags |= irqd_get_trigger_type(&desc->irq_data);
+
 	/*
 	 * Check whether the interrupt nests into another interrupt
 	 * thread.
-- 
cgit v1.3-8-gc7d7


From f35ad083783e8ed6ac030f5feb209f864875b413 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 13 Jun 2016 10:39:44 +0100
Subject: genirq: Look-up percpu trigger type if not specified by caller

As we now do for non-percpu interrupt, perform a lookup of the
interrupt trigger if the user doesn't supply one. The difference
here is that we can only do it at enable time (trigger configuration
can be per-cpu as well).

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 kernel/irq/manage.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eaedeb74b49d..f78b0846afb1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1737,7 +1737,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
 	if (!desc)
 		return;
 
+	/*
+	 * If the trigger type is not specified by the caller, then
+	 * use the default for this interrupt.
+	 */
 	type &= IRQ_TYPE_SENSE_MASK;
+	if (type == IRQ_TYPE_NONE)
+		type = irqd_get_trigger_type(&desc->irq_data);
+
 	if (type != IRQ_TYPE_NONE) {
 		int ret;
 
-- 
cgit v1.3-8-gc7d7


From 1e2a7d78499ec8859d2b469051b7b80bad3b08aa Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Tue, 7 Jun 2016 16:12:28 +0100
Subject: irqdomain: Don't set type when mapping an IRQ

Some IRQ chips, such as GPIO controllers or secondary level interrupt
controllers, may require require additional runtime power management
control to ensure they are accessible. For such IRQ chips, it makes sense
to enable the IRQ chip when interrupts are requested and disabled them
again once all interrupts have been freed.

When mapping an IRQ, the IRQ type settings are read and then programmed.
The mapping of the IRQ happens before the IRQ is requested and so the
programming of the type settings occurs before the IRQ is requested. This
is a problem for IRQ chips that require additional power management
control because they may not be accessible yet. Therefore, when mapping
the IRQ, don't program the type settings, just save them and then program
these saved settings when the IRQ is requested (so long as if they are not
overridden via the call to request the IRQ).

Add a stub function for irq_domain_free_irqs() to avoid any compilation
errors when CONFIG_IRQ_DOMAIN_HIERARCHY is not selected.

Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqdomain.h |  3 +++
 kernel/irq/irqdomain.c    | 23 ++++++++++++++++++-----
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index f1f36e04d885..317503763314 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -452,6 +452,9 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
 	return -1;
 }
 
+static inline void irq_domain_free_irqs(unsigned int virq,
+					unsigned int nr_irqs) { }
+
 static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
 {
 	return false;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f3ff1eb8dd09..caa6a63d26f0 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
 unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 {
 	struct irq_domain *domain;
+	struct irq_data *irq_data;
 	irq_hw_number_t hwirq;
 	unsigned int type = IRQ_TYPE_NONE;
 	int virq;
@@ -614,7 +615,11 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 		 * it now and return the interrupt number.
 		 */
 		if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
-			irq_set_irq_type(virq, type);
+			irq_data = irq_get_irq_data(virq);
+			if (!irq_data)
+				return 0;
+
+			irqd_set_trigger_type(irq_data, type);
 			return virq;
 		}
 
@@ -634,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 			return virq;
 	}
 
-	/* Set type if specified and different than the current one */
-	if (type != IRQ_TYPE_NONE &&
-	    type != irq_get_trigger_type(virq))
-		irq_set_irq_type(virq, type);
+	irq_data = irq_get_irq_data(virq);
+	if (!irq_data) {
+		if (irq_domain_is_hierarchy(domain))
+			irq_domain_free_irqs(virq, 1);
+		else
+			irq_dispose_mapping(virq);
+		return 0;
+	}
+
+	/* Store trigger type */
+	irqd_set_trigger_type(irq_data, type);
+
 	return virq;
 }
 EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
-- 
cgit v1.3-8-gc7d7


From be45beb2df6909d42a6b3b0052601b3eef878fc0 Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Tue, 7 Jun 2016 16:12:29 +0100
Subject: genirq: Add runtime power management support for IRQ chips

Some IRQ chips may be located in a power domain outside of the CPU
subsystem and hence will require device specific runtime power
management. In order to support such IRQ chips, add a pointer for a
device structure to the irq_chip structure, and if this pointer is
populated by the IRQ chip driver and CONFIG_PM is selected in the kernel
configuration, then the pm_runtime_get/put APIs for this chip will be
called when an IRQ is requested/freed, respectively.

Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irq.h    |  4 ++++
 kernel/irq/chip.c      | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/irq/internals.h |  1 +
 kernel/irq/manage.c    | 31 ++++++++++++++++++++++++++++++-
 4 files changed, 75 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 4d758a7c604a..6c92a847394d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -315,6 +315,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
 /**
  * struct irq_chip - hardware interrupt chip descriptor
  *
+ * @parent_device:	pointer to parent device for irqchip
  * @name:		name for /proc/interrupts
  * @irq_startup:	start up the interrupt (defaults to ->enable if NULL)
  * @irq_shutdown:	shut down the interrupt (defaults to ->disable if NULL)
@@ -354,6 +355,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @flags:		chip specific flags
  */
 struct irq_chip {
+	struct device	*parent_device;
 	const char	*name;
 	unsigned int	(*irq_startup)(struct irq_data *data);
 	void		(*irq_shutdown)(struct irq_data *data);
@@ -488,6 +490,8 @@ extern void handle_bad_irq(struct irq_desc *desc);
 extern void handle_nested_irq(unsigned int irq);
 
 extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg);
+extern int irq_chip_pm_get(struct irq_data *data);
+extern int irq_chip_pm_put(struct irq_data *data);
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 extern void irq_chip_enable_parent(struct irq_data *data);
 extern void irq_chip_disable_parent(struct irq_data *data);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2f9f2b0e79f2..ad8131473774 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1093,3 +1093,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 
 	return 0;
 }
+
+/**
+ * irq_chip_pm_get - Enable power for an IRQ chip
+ * @data:	Pointer to interrupt specific data
+ *
+ * Enable the power to the IRQ chip referenced by the interrupt data
+ * structure.
+ */
+int irq_chip_pm_get(struct irq_data *data)
+{
+	int retval;
+
+	if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
+		retval = pm_runtime_get_sync(data->chip->parent_device);
+		if (retval < 0) {
+			pm_runtime_put_noidle(data->chip->parent_device);
+			return retval;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * irq_chip_pm_put - Disable power for an IRQ chip
+ * @data:	Pointer to interrupt specific data
+ *
+ * Disable the power to the IRQ chip referenced by the interrupt data
+ * structure, belongs. Note that power will only be disabled, once this
+ * function has been called for all IRQs that have called irq_chip_pm_get().
+ */
+int irq_chip_pm_put(struct irq_data *data)
+{
+	int retval = 0;
+
+	if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
+		retval = pm_runtime_put(data->chip->parent_device);
+
+	return (retval < 0) ? retval : 0;
+}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 09be2c903c6d..d5edcdc9382a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -7,6 +7,7 @@
  */
 #include <linux/irqdesc.h>
 #include <linux/kernel_stat.h>
+#include <linux/pm_runtime.h>
 
 #ifdef CONFIG_SPARSE_IRQ
 # define IRQ_BITMAP_BITS	(NR_IRQS + 8196)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f78b0846afb1..00cfc852cca8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1416,10 +1416,18 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 
 	if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
 		return -EINVAL;
+
+	retval = irq_chip_pm_get(&desc->irq_data);
+	if (retval < 0)
+		return retval;
+
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, act);
 	chip_bus_sync_unlock(desc);
 
+	if (retval)
+		irq_chip_pm_put(&desc->irq_data);
+
 	return retval;
 }
 EXPORT_SYMBOL_GPL(setup_irq);
@@ -1513,6 +1521,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		}
 	}
 
+	irq_chip_pm_put(&desc->irq_data);
 	module_put(desc->owner);
 	kfree(action->secondary);
 	return action;
@@ -1655,11 +1664,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	action->name = devname;
 	action->dev_id = dev_id;
 
+	retval = irq_chip_pm_get(&desc->irq_data);
+	if (retval < 0)
+		return retval;
+
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
 	chip_bus_sync_unlock(desc);
 
 	if (retval) {
+		irq_chip_pm_put(&desc->irq_data);
 		kfree(action->secondary);
 		kfree(action);
 	}
@@ -1836,6 +1850,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
 
 	unregister_handler_proc(irq, action);
 
+	irq_chip_pm_put(&desc->irq_data);
 	module_put(desc->owner);
 	return action;
 
@@ -1898,10 +1913,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
 
 	if (!desc || !irq_settings_is_per_cpu_devid(desc))
 		return -EINVAL;
+
+	retval = irq_chip_pm_get(&desc->irq_data);
+	if (retval < 0)
+		return retval;
+
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, act);
 	chip_bus_sync_unlock(desc);
 
+	if (retval)
+		irq_chip_pm_put(&desc->irq_data);
+
 	return retval;
 }
 
@@ -1945,12 +1968,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 	action->name = devname;
 	action->percpu_dev_id = dev_id;
 
+	retval = irq_chip_pm_get(&desc->irq_data);
+	if (retval < 0)
+		return retval;
+
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
 	chip_bus_sync_unlock(desc);
 
-	if (retval)
+	if (retval) {
+		irq_chip_pm_put(&desc->irq_data);
 		kfree(action);
+	}
 
 	return retval;
 }
-- 
cgit v1.3-8-gc7d7


From 678309117768e25751594a48a2d873b0552a3130 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Thu, 2 Jun 2016 13:20:32 +0100
Subject: PM / Hibernate: Don't let kasan instrument snapshot.c

Kasan causes the compiler to instrument C code and is used at runtime to
detect accesses to memory that has been freed, or not yet allocated.

The code in snapshot.c saves and restores memory when hibernating. This will
access whole pages in the slab cache that have both free and allocated
areas, resulting in a large number of false positives from Kasan.

Disable instrumentation of this file.

Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index cb880a14cc39..eb4f717705ba 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,8 @@
 
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
+KASAN_SANITIZE_snapshot.o	:= n
+
 obj-y				+= qos.o
 obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o
-- 
cgit v1.3-8-gc7d7


From 3d89e5478bf550a50c99e93adf659369798263b0 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Mon, 13 Jun 2016 18:32:45 +0800
Subject: sched/cputime: Fix prev steal time accouting during CPU hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit:

  e9532e69b8d1 ("sched/cputime: Fix steal time accounting vs. CPU hotplug")

... set rq->prev_* to 0 after a CPU hotplug comes back, in order to
fix the case where (after CPU hotplug) steal time is smaller than
rq->prev_steal_time.

However, this should never happen. Steal time was only smaller because of the
KVM-specific bug fixed by the previous patch.  Worse, the previous patch
triggers a bug on CPU hot-unplug/plug operation: because
rq->prev_steal_time is cleared, all of the CPU's past steal time will be
accounted again on hot-plug.

Since the root cause has been fixed, we can just revert commit e9532e69b8d1.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 'commit e9532e69b8d1 ("sched/cputime: Fix steal time accounting vs. CPU hotplug")'
Link: http://lkml.kernel.org/r/1465813966-3116-3-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  1 -
 kernel/sched/sched.h | 13 -------------
 2 files changed, 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13d0896aff87..c1b537bc4b90 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7227,7 +7227,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
 	struct rq *rq = cpu_rq(cpu);
 
 	rq->calc_load_update = calc_load_update;
-	account_reset_rq(rq);
 	update_max_interval();
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..de607e4febd9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1809,16 +1809,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
 #else /* arch_scale_freq_capacity */
 #define arch_scale_freq_invariant()	(false)
 #endif
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-	rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
-	rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-	rq->prev_steal_time_rq = 0;
-#endif
-}
-- 
cgit v1.3-8-gc7d7


From 807e5b80687c06715d62df51a5473b231e3e8b15 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Mon, 13 Jun 2016 18:32:46 +0800
Subject: sched/cputime: Add steal time support to full dynticks CPU time
 accounting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds guest steal-time support to full dynticks CPU
time accounting. After the following commit:

ff9a9b4c4334 ("sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity")

... time sampling became jiffy based, even if we do the sampling from the
context tracking code, so steal_account_process_tick() can be reused
to account how many 'ticks' are stolen-time, after the last accumulation.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1465813966-3116-4-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..3d60e5d76fdb 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime)
 		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
-static __always_inline bool steal_account_process_tick(void)
+static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
 {
 #ifdef CONFIG_PARAVIRT
 	if (static_key_false(&paravirt_steal_enabled)) {
@@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void)
 		 * time in jiffies. Lets cast the result to jiffies
 		 * granularity and account the rest on the next rounds.
 		 */
-		steal_jiffies = nsecs_to_jiffies(steal);
+		steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
 		this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
 
 		account_steal_time(jiffies_to_cputime(steal_jiffies));
 		return steal_jiffies;
 	}
 #endif
-	return false;
+	return 0;
 }
 
 /*
@@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	u64 cputime = (__force u64) cputime_one_jiffy;
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 
-	if (steal_account_process_tick())
+	if (steal_account_process_tick(ULONG_MAX))
 		return;
 
 	cputime *= ticks;
@@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		return;
 	}
 
-	if (steal_account_process_tick())
+	if (steal_account_process_tick(ULONG_MAX))
 		return;
 
 	if (user_tick)
@@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
 	unsigned long now = READ_ONCE(jiffies);
-	unsigned long delta = now - tsk->vtime_snap;
+	unsigned long delta_jiffies, steal_jiffies;
 
+	delta_jiffies = now - tsk->vtime_snap;
+	steal_jiffies = steal_account_process_tick(delta_jiffies);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
 	tsk->vtime_snap = now;
 
-	return jiffies_to_cputime(delta);
+	return jiffies_to_cputime(delta_jiffies - steal_jiffies);
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
-- 
cgit v1.3-8-gc7d7


From 2c95afc1e83d93fac3be6923465e1753c2c53b0a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 9 Jun 2016 06:14:38 -0700
Subject: perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86

The NMI watchdog uses either the fixed cycles or a generic cycles
counter. This causes a lot of conflicts with users of the PMU who want
to run a full group including the cycles fixed counter, for example
the --topdown support recently added to perf stat. The code needs to
fall back to not use groups, which can cause measurement inaccuracy
due to multiplexing errors.

This patch switches the NMI watchdog to use reference cycles
on Intel systems.  This is actually more accurate than cycles,
because cycles can tick faster than the measured CPU Frequency
due to Turbo mode.

The ref cycles always tick at their frequency, or slower when
the system is idling. That means the NMI watchdog can never
expire too early, unlike with cycles.

The reference cycles tick roughly at the frequency of the TSC,
so the same period computation can be used.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Cc: jolsa@kernel.org
Link: http://lkml.kernel.org/r/1465478079-19993-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/hw_nmi.c | 8 ++++++++
 include/linux/nmi.h           | 1 +
 kernel/watchdog.c             | 7 +++++++
 3 files changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 7788ce643bf4..016f4263fad4 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,8 +18,16 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 #include <linux/delay.h>
+#include <linux/perf_event.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
+int hw_nmi_get_event(void)
+{
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		return PERF_COUNT_HW_REF_CPU_CYCLES;
+	return PERF_COUNT_HW_CPU_CYCLES;
+}
+
 u64 hw_nmi_get_sample_period(int watchdog_thresh)
 {
 	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 4630eeae18e0..79858af27209 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -66,6 +66,7 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
 
 #ifdef CONFIG_LOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
+int hw_nmi_get_event(void);
 extern int nmi_watchdog_enabled;
 extern int soft_watchdog_enabled;
 extern int watchdog_user_enabled;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9acb29f280ec..8dd30fcd91be 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -315,6 +315,12 @@ static int is_softlockup(unsigned long touch_ts)
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 
+/* Can be overriden by architecture */
+__weak int hw_nmi_get_event(void)
+{
+	return PERF_COUNT_HW_CPU_CYCLES;
+}
+
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
 	.config		= PERF_COUNT_HW_CPU_CYCLES,
@@ -604,6 +610,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
 
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	wd_attr->config = hw_nmi_get_event();
 
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-- 
cgit v1.3-8-gc7d7


From 1f03e8d2919270bd6ef64f39a45ce8df8a9f012a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 Apr 2016 10:57:12 +0200
Subject: locking/barriers: Replace smp_cond_acquire() with
 smp_cond_load_acquire()

This new form allows using hardware assisted waiting.

Some hardware (ARM64 and x86) allow monitoring an address for changes,
so by providing a pointer we can use this to replace the cpu_relax()
with hardware optimized methods in the future.

Requested-by: Will Deacon <will.deacon@arm.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler.h   | 25 +++++++++++++++++++------
 kernel/locking/qspinlock.c | 12 ++++++------
 kernel/sched/core.c        |  8 ++++----
 kernel/sched/sched.h       |  2 +-
 kernel/smp.c               |  2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 06f27fd9d760..2bcaedc0f032 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -305,21 +305,34 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 })
 
 /**
- * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
+ * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
+ * @ptr: pointer to the variable to wait on
  * @cond: boolean expression to wait for
  *
  * Equivalent to using smp_load_acquire() on the condition variable but employs
  * the control dependency of the wait to reduce the barrier on many platforms.
  *
+ * Due to C lacking lambda expressions we load the value of *ptr into a
+ * pre-named variable @VAL to be used in @cond.
+ *
  * The control dependency provides a LOAD->STORE order, the additional RMB
  * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
  * aka. ACQUIRE.
  */
-#define smp_cond_acquire(cond)	do {		\
-	while (!(cond))				\
-		cpu_relax();			\
-	smp_rmb(); /* ctrl + rmb := acquire */	\
-} while (0)
+#ifndef smp_cond_load_acquire
+#define smp_cond_load_acquire(ptr, cond_expr) ({		\
+	typeof(ptr) __PTR = (ptr);				\
+	typeof(*ptr) VAL;					\
+	for (;;) {						\
+		VAL = READ_ONCE(*__PTR);			\
+		if (cond_expr)					\
+			break;					\
+		cpu_relax();					\
+	}							\
+	smp_rmb(); /* ctrl + rmb := acquire */			\
+	VAL;							\
+})
+#endif
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2f9153b183c9..1b8dda90ebfa 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -475,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	 * sequentiality; this is because not all clear_pending_set_locked()
 	 * implementations imply full barriers.
 	 */
-	smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+	smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
 
 	/*
 	 * take ownership and clear the pending bit.
@@ -562,7 +562,7 @@ queue:
 	 *
 	 * The PV pv_wait_head_or_lock function, if active, will acquire
 	 * the lock and return a non-zero value. So we have to skip the
-	 * smp_cond_acquire() call. As the next PV queue head hasn't been
+	 * smp_cond_load_acquire() call. As the next PV queue head hasn't been
 	 * designated yet, there is no way for the locked value to become
 	 * _Q_SLOW_VAL. So both the set_locked() and the
 	 * atomic_cmpxchg_relaxed() calls will be safe.
@@ -573,7 +573,7 @@ queue:
 	if ((val = pv_wait_head_or_lock(lock, node)))
 		goto locked;
 
-	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+	val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
 
 locked:
 	/*
@@ -593,9 +593,9 @@ locked:
 			break;
 		}
 		/*
-		 * The smp_cond_acquire() call above has provided the necessary
-		 * acquire semantics required for locking. At most two
-		 * iterations of this loop may be ran.
+		 * The smp_cond_load_acquire() call above has provided the
+		 * necessary acquire semantics required for locking. At most
+		 * two iterations of this loop may be ran.
 		 */
 		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
 		if (old == val)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 017d5394f5dc..5cd6931cb2cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1935,7 +1935,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * chain to provide order. Instead we do:
  *
  *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_acquire(!X->on_cpu)
+ *   2) smp_cond_load_acquire(!X->on_cpu)
  *
  * Example:
  *
@@ -1946,7 +1946,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *   sched-out X
  *   smp_store_release(X->on_cpu, 0);
  *
- *                    smp_cond_acquire(!X->on_cpu);
+ *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
  *                    X->state = WAKING
  *                    set_task_cpu(X,2)
  *
@@ -1972,7 +1972,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * This means that any means of doing remote wakeups must order the CPU doing
  * the wakeup against the CPU the task is going to end up running on. This,
  * however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
  *
  */
 
@@ -2045,7 +2045,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * This ensures that tasks getting woken will be fully ordered against
 	 * their previous state and preserve Program Order.
 	 */
-	smp_cond_acquire(!p->on_cpu);
+	smp_cond_load_acquire(&p->on_cpu, !VAL);
 
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..425bf5ddaa5a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1113,7 +1113,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 	 * In particular, the load of prev->state in finish_task_switch() must
 	 * happen before this.
 	 *
-	 * Pairs with the smp_cond_acquire() in try_to_wake_up().
+	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
 	 */
 	smp_store_release(&prev->on_cpu, 0);
 #endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c240..36552beed397 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -107,7 +107,7 @@ void __init call_function_init(void)
  */
 static __always_inline void csd_lock_wait(struct call_single_data *csd)
 {
-	smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
+	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
 }
 
 static __always_inline void csd_lock(struct call_single_data *csd)
-- 
cgit v1.3-8-gc7d7


From 33ac279677dcc2441cb93d8cb9cf7a74df62814d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 24 May 2016 13:17:12 +0200
Subject: locking/barriers: Introduce smp_acquire__after_ctrl_dep()

Introduce smp_acquire__after_ctrl_dep(), this construct is not
uncommon, but the lack of this barrier is.

Use it to better express smp_rmb() uses in WRITE_ONCE(), the IPC
semaphore code and the qspinlock code.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler.h   | 17 ++++++++++++-----
 ipc/sem.c                  | 14 ++------------
 kernel/locking/qspinlock.c |  2 +-
 3 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 2bcaedc0f032..59a7004fc7dd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -304,6 +304,17 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	__u.__val;					\
 })
 
+/**
+ * smp_acquire__after_ctrl_dep() - Provide ACQUIRE ordering after a control dependency
+ *
+ * A control dependency provides a LOAD->STORE order, the additional RMB
+ * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
+ * aka. (load)-ACQUIRE.
+ *
+ * Architectures that do not do load speculation can have this be barrier().
+ */
+#define smp_acquire__after_ctrl_dep()		smp_rmb()
+
 /**
  * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
  * @ptr: pointer to the variable to wait on
@@ -314,10 +325,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  *
  * Due to C lacking lambda expressions we load the value of *ptr into a
  * pre-named variable @VAL to be used in @cond.
- *
- * The control dependency provides a LOAD->STORE order, the additional RMB
- * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
- * aka. ACQUIRE.
  */
 #ifndef smp_cond_load_acquire
 #define smp_cond_load_acquire(ptr, cond_expr) ({		\
@@ -329,7 +336,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 			break;					\
 		cpu_relax();					\
 	}							\
-	smp_rmb(); /* ctrl + rmb := acquire */			\
+	smp_acquire__after_ctrl_dep();				\
 	VAL;							\
 })
 #endif
diff --git a/ipc/sem.c b/ipc/sem.c
index b3757ea0694b..84dff3df11a4 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -259,16 +259,6 @@ static void sem_rcu_free(struct rcu_head *head)
 	ipc_rcu_free(head);
 }
 
-/*
- * spin_unlock_wait() and !spin_is_locked() are not memory barriers, they
- * are only control barriers.
- * The code must pair with spin_unlock(&sem->lock) or
- * spin_unlock(&sem_perm.lock), thus just the control barrier is insufficient.
- *
- * smp_rmb() is sufficient, as writes cannot pass the control barrier.
- */
-#define ipc_smp_acquire__after_spin_is_unlocked()	smp_rmb()
-
 /*
  * Wait until all currently ongoing simple ops have completed.
  * Caller must own sem_perm.lock.
@@ -292,7 +282,7 @@ static void sem_wait_array(struct sem_array *sma)
 		sem = sma->sem_base + i;
 		spin_unlock_wait(&sem->lock);
 	}
-	ipc_smp_acquire__after_spin_is_unlocked();
+	smp_acquire__after_ctrl_dep();
 }
 
 /*
@@ -350,7 +340,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 			 *	complex_count++;
 			 *	spin_unlock(sem_perm.lock);
 			 */
-			ipc_smp_acquire__after_spin_is_unlocked();
+			smp_acquire__after_ctrl_dep();
 
 			/*
 			 * Now repeat the test of complex_count:
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 1b8dda90ebfa..730655533440 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -379,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock)
 		cpu_relax();
 
 done:
-	smp_rmb(); /* CTRL + RMB -> ACQUIRE */
+	smp_acquire__after_ctrl_dep();
 }
 EXPORT_SYMBOL(queued_spin_unlock_wait);
 #endif
-- 
cgit v1.3-8-gc7d7


From be3e7844980352756de4261b276ee2ba5be7a26b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 24 May 2016 14:45:21 +0200
Subject: locking/spinlock: Update spin_unlock_wait() users

With the modified semantics of spin_unlock_wait() a number of
explicit barriers can be removed. Also update the comment for the
do_exit() usecase, as that was somewhat stale/obscure.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 ipc/sem.c          | 1 -
 kernel/exit.c      | 8 ++++++--
 kernel/task_work.c | 1 -
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/ipc/sem.c b/ipc/sem.c
index 84dff3df11a4..ae72b3cddc8d 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -282,7 +282,6 @@ static void sem_wait_array(struct sem_array *sma)
 		sem = sma->sem_base + i;
 		spin_unlock_wait(&sem->lock);
 	}
-	smp_acquire__after_ctrl_dep();
 }
 
 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e6e1356e6bb..0b40791b9e70 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -700,10 +700,14 @@ void do_exit(long code)
 
 	exit_signals(tsk);  /* sets PF_EXITING */
 	/*
-	 * tsk->flags are checked in the futex code to protect against
-	 * an exiting task cleaning up the robust pi futexes.
+	 * Ensure that all new tsk->pi_lock acquisitions must observe
+	 * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
 	 */
 	smp_mb();
+	/*
+	 * Ensure that we must observe the pi_state in exit_mm() ->
+	 * mm_release() -> exit_pi_state_list().
+	 */
 	raw_spin_unlock_wait(&tsk->pi_lock);
 
 	if (unlikely(in_atomic())) {
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 53fa971d000d..6ab4842b00e8 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -108,7 +108,6 @@ void task_work_run(void)
 		 * fail, but it can play with *work and other entries.
 		 */
 		raw_spin_unlock_wait(&task->pi_lock);
-		smp_mb();
 
 		do {
 			next = work->next;
-- 
cgit v1.3-8-gc7d7


From 2f275de5d1ed7269913ef9b4c64a13952c0a38e8 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 27 May 2016 12:57:02 -0700
Subject: seccomp: Add a seccomp_data parameter secure_computing()

Currently, if arch code wants to supply seccomp_data directly to
seccomp (which is generally much faster than having seccomp do it
using the syscall_get_xyz() API), it has to use the two-phase
seccomp hooks. Add it to the easy hooks, too.

Cc: linux-arch@vger.kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm/kernel/ptrace.c              | 2 +-
 arch/arm64/kernel/ptrace.c            | 2 +-
 arch/mips/kernel/ptrace.c             | 2 +-
 arch/parisc/kernel/ptrace.c           | 2 +-
 arch/powerpc/kernel/ptrace.c          | 2 +-
 arch/s390/kernel/ptrace.c             | 2 +-
 arch/tile/kernel/ptrace.c             | 2 +-
 arch/um/kernel/skas/syscall.c         | 2 +-
 arch/x86/entry/vsyscall/vsyscall_64.c | 2 +-
 include/linux/seccomp.h               | 8 ++++----
 kernel/seccomp.c                      | 4 ++--
 11 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 4d9375814b53..1027d3b54541 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -934,7 +934,7 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
 
 	/* Do the secure computing check first; failures should be fast. */
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-	if (secure_computing() == -1)
+	if (secure_computing(NULL) == -1)
 		return -1;
 #else
 	/* XXX: remove this once OABI gets fixed */
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 3f6cd5c5234f..6e2cf046615d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1247,7 +1247,7 @@ static void tracehook_report_syscall(struct pt_regs *regs,
 asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 {
 	/* Do the secure computing check first; failures should be fast. */
-	if (secure_computing() == -1)
+	if (secure_computing(NULL) == -1)
 		return -1;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 0dcf69194473..c50af846ecf9 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -893,7 +893,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
 
 	current_thread_info()->syscall = syscall;
 
-	if (secure_computing() == -1)
+	if (secure_computing(NULL) == -1)
 		return -1;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index b5458b37fc5b..8edc47c0b98e 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -312,7 +312,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 long do_syscall_trace_enter(struct pt_regs *regs)
 {
 	/* Do the secure computing check first. */
-	if (secure_computing() == -1)
+	if (secure_computing(NULL) == -1)
 		return -1;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 30a03c03fe73..ed799e994773 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1783,7 +1783,7 @@ static int do_seccomp(struct pt_regs *regs)
 	 * have already loaded -ENOSYS into r3, or seccomp has put
 	 * something else in r3 (via SECCOMP_RET_ERRNO/TRACE).
 	 */
-	if (__secure_computing())
+	if (__secure_computing(NULL))
 		return -1;
 
 	/*
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 49b1c13bf6c9..c238e9958c2a 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -824,7 +824,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	long ret = 0;
 
 	/* Do the secure computing check first. */
-	if (secure_computing()) {
+	if (secure_computing(NULL)) {
 		/* seccomp failures shouldn't expose any additional code. */
 		ret = -1;
 		goto out;
diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c
index 54e7b723db99..8c6d2f2fefa3 100644
--- a/arch/tile/kernel/ptrace.c
+++ b/arch/tile/kernel/ptrace.c
@@ -255,7 +255,7 @@ int do_syscall_trace_enter(struct pt_regs *regs)
 {
 	u32 work = ACCESS_ONCE(current_thread_info()->flags);
 
-	if (secure_computing() == -1)
+	if (secure_computing(NULL) == -1)
 		return -1;
 
 	if (work & _TIF_SYSCALL_TRACE) {
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index 48b0dcbd87be..9c5570f0f397 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -21,7 +21,7 @@ void handle_syscall(struct uml_pt_regs *r)
 	PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS);
 
 	/* Do the secure computing check first; failures should be fast. */
-	if (secure_computing() == -1)
+	if (secure_computing(NULL) == -1)
 		return;
 
 	if (syscall_trace_enter(regs))
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 174c2549939d..85acde5fa442 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -207,7 +207,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	 */
 	regs->orig_ax = syscall_nr;
 	regs->ax = -ENOSYS;
-	tmp = secure_computing();
+	tmp = secure_computing(NULL);
 	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
 		warn_bad_vsyscall(KERN_DEBUG, regs,
 				  "seccomp tried to change syscall nr or ip");
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 2296e6b2f690..9eaa7b34d6da 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -28,11 +28,11 @@ struct seccomp {
 };
 
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-extern int __secure_computing(void);
-static inline int secure_computing(void)
+extern int __secure_computing(const struct seccomp_data *sd);
+static inline int secure_computing(const struct seccomp_data *sd)
 {
 	if (unlikely(test_thread_flag(TIF_SECCOMP)))
-		return  __secure_computing();
+		return  __secure_computing(sd);
 	return 0;
 }
 
@@ -61,7 +61,7 @@ struct seccomp { };
 struct seccomp_filter { };
 
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-static inline int secure_computing(void) { return 0; }
+static inline int secure_computing(struct seccomp_data *sd) { return 0; }
 #else
 static inline void secure_computing_strict(int this_syscall) { return; }
 #endif
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7002796f14a4..06816290a212 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -554,9 +554,9 @@ void secure_computing_strict(int this_syscall)
 		BUG();
 }
 #else
-int __secure_computing(void)
+int __secure_computing(const struct seccomp_data *sd)
 {
-	u32 phase1_result = seccomp_phase1(NULL);
+	u32 phase1_result = seccomp_phase1(sd);
 
 	if (likely(phase1_result == SECCOMP_PHASE1_OK))
 		return 0;
-- 
cgit v1.3-8-gc7d7


From 8112c4f140fa03f9ee68aad2cc79afa7df5418d3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 1 Jun 2016 16:02:17 -0700
Subject: seccomp: remove 2-phase API

Since nothing is using the 2-phase API, and it adds more complexity than
benefit, remove it.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@kernel.org>
---
 include/linux/seccomp.h |   6 ---
 kernel/seccomp.c        | 129 +++++++++++++++---------------------------------
 2 files changed, 41 insertions(+), 94 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 9eaa7b34d6da..ecc296c137cd 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -35,12 +35,6 @@ static inline int secure_computing(const struct seccomp_data *sd)
 		return  __secure_computing(sd);
 	return 0;
 }
-
-#define SECCOMP_PHASE1_OK	0
-#define SECCOMP_PHASE1_SKIP	1
-
-extern u32 seccomp_phase1(struct seccomp_data *sd);
-int seccomp_phase2(u32 phase1_result);
 #else
 extern void secure_computing_strict(int this_syscall);
 #endif
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 06816290a212..14a37d71b612 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
  *
  * Returns valid seccomp BPF response codes.
  */
-static u32 seccomp_run_filters(struct seccomp_data *sd)
+static u32 seccomp_run_filters(const struct seccomp_data *sd)
 {
 	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
@@ -554,20 +554,9 @@ void secure_computing_strict(int this_syscall)
 		BUG();
 }
 #else
-int __secure_computing(const struct seccomp_data *sd)
-{
-	u32 phase1_result = seccomp_phase1(sd);
-
-	if (likely(phase1_result == SECCOMP_PHASE1_OK))
-		return 0;
-	else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
-		return -1;
-	else
-		return seccomp_phase2(phase1_result);
-}
 
 #ifdef CONFIG_SECCOMP_FILTER
-static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
 {
 	u32 filter_ret, action;
 	int data;
@@ -599,10 +588,33 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
 		goto skip;
 
 	case SECCOMP_RET_TRACE:
-		return filter_ret;  /* Save the rest for phase 2. */
+		/* ENOSYS these calls if there is no tracer attached. */
+		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+			syscall_set_return_value(current,
+						 task_pt_regs(current),
+						 -ENOSYS, 0);
+			goto skip;
+		}
+
+		/* Allow the BPF to provide the event message */
+		ptrace_event(PTRACE_EVENT_SECCOMP, data);
+		/*
+		 * The delivery of a fatal signal during event
+		 * notification may silently skip tracer notification.
+		 * Terminating the task now avoids executing a system
+		 * call that may not be intended.
+		 */
+		if (fatal_signal_pending(current))
+			do_exit(SIGSYS);
+		/* Check if the tracer forced the syscall to be skipped. */
+		this_syscall = syscall_get_nr(current, task_pt_regs(current));
+		if (this_syscall < 0)
+			goto skip;
+
+		return 0;
 
 	case SECCOMP_RET_ALLOW:
-		return SECCOMP_PHASE1_OK;
+		return 0;
 
 	case SECCOMP_RET_KILL:
 	default:
@@ -614,96 +626,37 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
 
 skip:
 	audit_seccomp(this_syscall, 0, action);
-	return SECCOMP_PHASE1_SKIP;
+	return -1;
+}
+#else
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
+{
+	BUG();
 }
 #endif
 
-/**
- * seccomp_phase1() - run fast path seccomp checks on the current syscall
- * @arg sd: The seccomp_data or NULL
- *
- * This only reads pt_regs via the syscall_xyz helpers.  The only change
- * it will make to pt_regs is via syscall_set_return_value, and it will
- * only do that if it returns SECCOMP_PHASE1_SKIP.
- *
- * If sd is provided, it will not read pt_regs at all.
- *
- * It may also call do_exit or force a signal; these actions must be
- * safe.
- *
- * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
- * be processed normally.
- *
- * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
- * invoked.  In this case, seccomp_phase1 will have set the return value
- * using syscall_set_return_value.
- *
- * If it returns anything else, then the return value should be passed
- * to seccomp_phase2 from a context in which ptrace hooks are safe.
- */
-u32 seccomp_phase1(struct seccomp_data *sd)
+int __secure_computing(const struct seccomp_data *sd)
 {
 	int mode = current->seccomp.mode;
-	int this_syscall = sd ? sd->nr :
-		syscall_get_nr(current, task_pt_regs(current));
+	int this_syscall;
 
 	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
-		return SECCOMP_PHASE1_OK;
+		return 0;
+
+	this_syscall = sd ? sd->nr :
+		syscall_get_nr(current, task_pt_regs(current));
 
 	switch (mode) {
 	case SECCOMP_MODE_STRICT:
 		__secure_computing_strict(this_syscall);  /* may call do_exit */
-		return SECCOMP_PHASE1_OK;
-#ifdef CONFIG_SECCOMP_FILTER
+		return 0;
 	case SECCOMP_MODE_FILTER:
-		return __seccomp_phase1_filter(this_syscall, sd);
-#endif
+		return __seccomp_filter(this_syscall, sd);
 	default:
 		BUG();
 	}
 }
-
-/**
- * seccomp_phase2() - finish slow path seccomp work for the current syscall
- * @phase1_result: The return value from seccomp_phase1()
- *
- * This must be called from a context in which ptrace hooks can be used.
- *
- * Returns 0 if the syscall should be processed or -1 to skip the syscall.
- */
-int seccomp_phase2(u32 phase1_result)
-{
-	struct pt_regs *regs = task_pt_regs(current);
-	u32 action = phase1_result & SECCOMP_RET_ACTION;
-	int data = phase1_result & SECCOMP_RET_DATA;
-
-	BUG_ON(action != SECCOMP_RET_TRACE);
-
-	audit_seccomp(syscall_get_nr(current, regs), 0, action);
-
-	/* Skip these calls if there is no tracer. */
-	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
-		syscall_set_return_value(current, regs,
-					 -ENOSYS, 0);
-		return -1;
-	}
-
-	/* Allow the BPF to provide the event message */
-	ptrace_event(PTRACE_EVENT_SECCOMP, data);
-	/*
-	 * The delivery of a fatal signal during event
-	 * notification may silently skip tracer notification.
-	 * Terminating the task now avoids executing a system
-	 * call that may not be intended.
-	 */
-	if (fatal_signal_pending(current))
-		do_exit(SIGSYS);
-	if (syscall_get_nr(current, regs) < 0)
-		return -1;  /* Explicit request to skip. */
-
-	return 0;
-}
 #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
 
 long prctl_get_seccomp(void)
-- 
cgit v1.3-8-gc7d7


From ce6526e8afa4b6ad0ab134a4cc50c9c863319637 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 1 Jun 2016 19:29:15 -0700
Subject: seccomp: recheck the syscall after RET_TRACE

When RET_TRACE triggers, a tracer may change a syscall into something that
should be filtered by seccomp. This re-runs seccomp after a trace event
to make sure things continue to pass.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@kernel.org>
---
 kernel/seccomp.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 14a37d71b612..54d15eb2b701 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -556,7 +556,8 @@ void secure_computing_strict(int this_syscall)
 #else
 
 #ifdef CONFIG_SECCOMP_FILTER
-static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+			    const bool recheck_after_trace)
 {
 	u32 filter_ret, action;
 	int data;
@@ -588,6 +589,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
 		goto skip;
 
 	case SECCOMP_RET_TRACE:
+		/* We've been put in this state by the ptracer already. */
+		if (recheck_after_trace)
+			return 0;
+
 		/* ENOSYS these calls if there is no tracer attached. */
 		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
 			syscall_set_return_value(current,
@@ -611,6 +616,15 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
 		if (this_syscall < 0)
 			goto skip;
 
+		/*
+		 * Recheck the syscall, since it may have changed. This
+		 * intentionally uses a NULL struct seccomp_data to force
+		 * a reload of all registers. This does not goto skip since
+		 * a skip would have already been reported.
+		 */
+		if (__seccomp_filter(this_syscall, NULL, true))
+			return -1;
+
 		return 0;
 
 	case SECCOMP_RET_ALLOW:
@@ -629,7 +643,8 @@ skip:
 	return -1;
 }
 #else
-static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+			    const bool recheck_after_trace)
 {
 	BUG();
 }
@@ -652,7 +667,7 @@ int __secure_computing(const struct seccomp_data *sd)
 		__secure_computing_strict(this_syscall);  /* may call do_exit */
 		return 0;
 	case SECCOMP_MODE_FILTER:
-		return __seccomp_filter(this_syscall, sd);
+		return __seccomp_filter(this_syscall, sd, false);
 	default:
 		BUG();
 	}
-- 
cgit v1.3-8-gc7d7


From 0d95092ccba1a30b74fa52ff94ec5415e63744a0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 8 Apr 2016 05:00:03 -0700
Subject: rcu: Fix outdated rcu_scheduler_active comment

The comment header for rcu_scheduler_active states that it is used
to optimize synchronize_sched() at early boot.  This is incorrect.
The synchronize_sched() function instead checks the number of online
CPUs.  This commit therefore replaces the comment's synchronize_sched()
with synchronize_rcu(), which really does use rcu_scheduler_active for
this purpose.

Reported-by: Lihao Liang <lihao.liang@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c7f1bc4f817c..0fa692f1094e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -130,7 +130,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
  * can assume that there is but one task, allowing RCU to (for example)
- * optimize synchronize_sched() to a simple barrier().  When this variable
+ * optimize synchronize_rcu() to a simple barrier().  When this variable
  * is one, RCU must actually do all the hard work required to detect real
  * grace periods.  This variable is also used to suppress boot-time false
  * positives from lockdep-RCU error checking.
-- 
cgit v1.3-8-gc7d7


From 590d1757b9d177e7fe3707963d0209d6eefbc746 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 10 Apr 2016 08:23:24 -0700
Subject: rcu: Fix outdated hotplug-exclusion comment in rcu_gp_init()

In the past, RCU grace-period initialization excluded CPU-hotplug
operations, but this is no longer the case.  This commit therefore
removed an outdated comment in rcu_gp_init() claiming that these
are excluded.

Reported-by: Lihao Liang <lihao.liang@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0fa692f1094e..6043c14165d1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1989,8 +1989,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 	 * of the tree within the rsp->node[] array.  Note that other CPUs
 	 * will access only the leaves of the hierarchy, thus seeing that no
 	 * grace period is in progress, at least until the corresponding
-	 * leaf node has been initialized.  In addition, we have excluded
-	 * CPU-hotplug operations.
+	 * leaf node has been initialized.
 	 *
 	 * The grace period cannot complete until the initialization
 	 * process finishes, because this kthread handles both.
-- 
cgit v1.3-8-gc7d7


From d3acab65f274800dd0901f0816f8bca9f2a8c8ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 10 Mar 2016 09:49:04 +0100
Subject: rcu: Remove some superfluous lines

I think you'll find this condition is superfluous, as the whole function
is under #ifdef of that same.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6043c14165d1..4aefeafb9a95 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4363,9 +4363,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return;
-
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
-- 
cgit v1.3-8-gc7d7


From 3549c2bc2c4ea8ecfeb9d21cb81cb00c6002b011 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 15 Apr 2016 16:35:29 -0700
Subject: rcu: Move expedited code from tree.c to tree_exp.h

People have been having some difficulty finding their way around the
RCU code.  This commit therefore pulls some of the expedited grace-period
code from tree.c to a new tree_exp.h file.  This commit is strictly code
movement, with the exception of a forward declaration that was added
for the sync_sched_exp_online_cleanup() function.

A subsequent commit will move the remaining expedited grace-period code
from tree_plugin.h to tree_exp.h.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c     | 545 +-----------------------------------------------
 kernel/rcu/tree_exp.h | 564 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 566 insertions(+), 543 deletions(-)
 create mode 100644 kernel/rcu/tree_exp.h

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4aefeafb9a95..c844b6142a86 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -159,6 +159,7 @@ static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 static void rcu_report_exp_rdp(struct rcu_state *rsp,
 			       struct rcu_data *rdp, bool wake);
+static void sync_sched_exp_online_cleanup(int cpu);
 
 /* rcuc/rcub kthread realtime priority */
 #ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -3447,549 +3448,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s)
 	return ULONG_CMP_GE(READ_ONCE(*sp), s);
 }
 
-/* Wrapper functions for expedited grace periods.  */
-static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
-{
-	rcu_seq_start(&rsp->expedited_sequence);
-}
-static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
-{
-	rcu_seq_end(&rsp->expedited_sequence);
-	smp_mb(); /* Ensure that consecutive grace periods serialize. */
-}
-static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
-{
-	unsigned long s;
-
-	smp_mb(); /* Caller's modifications seen first by other CPUs. */
-	s = rcu_seq_snap(&rsp->expedited_sequence);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
-	return s;
-}
-static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
-{
-	return rcu_seq_done(&rsp->expedited_sequence, s);
-}
-
-/*
- * Reset the ->expmaskinit values in the rcu_node tree to reflect any
- * recent CPU-online activity.  Note that these masks are not cleared
- * when CPUs go offline, so they reflect the union of all CPUs that have
- * ever been online.  This means that this function normally takes its
- * no-work-to-do fastpath.
- */
-static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
-{
-	bool done;
-	unsigned long flags;
-	unsigned long mask;
-	unsigned long oldmask;
-	int ncpus = READ_ONCE(rsp->ncpus);
-	struct rcu_node *rnp;
-	struct rcu_node *rnp_up;
-
-	/* If no new CPUs onlined since last time, nothing to do. */
-	if (likely(ncpus == rsp->ncpus_snap))
-		return;
-	rsp->ncpus_snap = ncpus;
-
-	/*
-	 * Each pass through the following loop propagates newly onlined
-	 * CPUs for the current rcu_node structure up the rcu_node tree.
-	 */
-	rcu_for_each_leaf_node(rsp, rnp) {
-		raw_spin_lock_irqsave_rcu_node(rnp, flags);
-		if (rnp->expmaskinit == rnp->expmaskinitnext) {
-			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-			continue;  /* No new CPUs, nothing to do. */
-		}
-
-		/* Update this node's mask, track old value for propagation. */
-		oldmask = rnp->expmaskinit;
-		rnp->expmaskinit = rnp->expmaskinitnext;
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
-		/* If was already nonzero, nothing to propagate. */
-		if (oldmask)
-			continue;
-
-		/* Propagate the new CPU up the tree. */
-		mask = rnp->grpmask;
-		rnp_up = rnp->parent;
-		done = false;
-		while (rnp_up) {
-			raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
-			if (rnp_up->expmaskinit)
-				done = true;
-			rnp_up->expmaskinit |= mask;
-			raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
-			if (done)
-				break;
-			mask = rnp_up->grpmask;
-			rnp_up = rnp_up->parent;
-		}
-	}
-}
-
-/*
- * Reset the ->expmask values in the rcu_node tree in preparation for
- * a new expedited grace period.
- */
-static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
-{
-	unsigned long flags;
-	struct rcu_node *rnp;
-
-	sync_exp_reset_tree_hotplug(rsp);
-	rcu_for_each_node_breadth_first(rsp, rnp) {
-		raw_spin_lock_irqsave_rcu_node(rnp, flags);
-		WARN_ON_ONCE(rnp->expmask);
-		rnp->expmask = rnp->expmaskinit;
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-	}
-}
-
-/*
- * Return non-zero if there is no RCU expedited grace period in progress
- * for the specified rcu_node structure, in other words, if all CPUs and
- * tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period.  Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold the rcu_state's exp_mutex.
- */
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
-{
-	return rnp->exp_tasks == NULL &&
-	       READ_ONCE(rnp->expmask) == 0;
-}
-
-/*
- * Report the exit from RCU read-side critical section for the last task
- * that queued itself during or before the current expedited preemptible-RCU
- * grace period.  This event is reported either to the rcu_node structure on
- * which the task was queued or to one of that rcu_node structure's ancestors,
- * recursively up the tree.  (Calm down, calm down, we do the recursion
- * iteratively!)
- *
- * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
- * structure's ->lock.
- */
-static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
-				 bool wake, unsigned long flags)
-	__releases(rnp->lock)
-{
-	unsigned long mask;
-
-	for (;;) {
-		if (!sync_rcu_preempt_exp_done(rnp)) {
-			if (!rnp->expmask)
-				rcu_initiate_boost(rnp, flags);
-			else
-				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-			break;
-		}
-		if (rnp->parent == NULL) {
-			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-			if (wake) {
-				smp_mb(); /* EGP done before wake_up(). */
-				swake_up(&rsp->expedited_wq);
-			}
-			break;
-		}
-		mask = rnp->grpmask;
-		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
-		rnp = rnp->parent;
-		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
-		WARN_ON_ONCE(!(rnp->expmask & mask));
-		rnp->expmask &= ~mask;
-	}
-}
-
-/*
- * Report expedited quiescent state for specified node.  This is a
- * lock-acquisition wrapper function for __rcu_report_exp_rnp().
- *
- * Caller must hold the rcu_state's exp_mutex.
- */
-static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
-					      struct rcu_node *rnp, bool wake)
-{
-	unsigned long flags;
-
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	__rcu_report_exp_rnp(rsp, rnp, wake, flags);
-}
-
-/*
- * Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure.  Caller must hold the rcu_state's
- * exp_mutex.
- */
-static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
-				    unsigned long mask, bool wake)
-{
-	unsigned long flags;
-
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	if (!(rnp->expmask & mask)) {
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-		return;
-	}
-	rnp->expmask &= ~mask;
-	__rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
-}
-
-/*
- * Report expedited quiescent state for specified rcu_data (CPU).
- */
-static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
-			       bool wake)
-{
-	rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
-}
-
-/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
-			       unsigned long s)
-{
-	if (rcu_exp_gp_seq_done(rsp, s)) {
-		trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
-		/* Ensure test happens before caller kfree(). */
-		smp_mb__before_atomic(); /* ^^^ */
-		atomic_long_inc(stat);
-		return true;
-	}
-	return false;
-}
-
-/*
- * Funnel-lock acquisition for expedited grace periods.  Returns true
- * if some other task completed an expedited grace period that this task
- * can piggy-back on, and with no mutex held.  Otherwise, returns false
- * with the mutex held, indicating that the caller must actually do the
- * expedited grace period.
- */
-static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
-{
-	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
-	struct rcu_node *rnp = rdp->mynode;
-	struct rcu_node *rnp_root = rcu_get_root(rsp);
-
-	/* Low-contention fastpath. */
-	if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
-	    (rnp == rnp_root ||
-	     ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
-	    !mutex_is_locked(&rsp->exp_mutex) &&
-	    mutex_trylock(&rsp->exp_mutex))
-		goto fastpath;
-
-	/*
-	 * Each pass through the following loop works its way up
-	 * the rcu_node tree, returning if others have done the work or
-	 * otherwise falls through to acquire rsp->exp_mutex.  The mapping
-	 * from CPU to rcu_node structure can be inexact, as it is just
-	 * promoting locality and is not strictly needed for correctness.
-	 */
-	for (; rnp != NULL; rnp = rnp->parent) {
-		if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
-			return true;
-
-		/* Work not done, either wait here or go up. */
-		spin_lock(&rnp->exp_lock);
-		if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
-
-			/* Someone else doing GP, so wait for them. */
-			spin_unlock(&rnp->exp_lock);
-			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
-						  rnp->grplo, rnp->grphi,
-						  TPS("wait"));
-			wait_event(rnp->exp_wq[(s >> 1) & 0x3],
-				   sync_exp_work_done(rsp,
-						      &rdp->exp_workdone2, s));
-			return true;
-		}
-		rnp->exp_seq_rq = s; /* Followers can wait on us. */
-		spin_unlock(&rnp->exp_lock);
-		trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
-					  rnp->grphi, TPS("nxtlvl"));
-	}
-	mutex_lock(&rsp->exp_mutex);
-fastpath:
-	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
-		mutex_unlock(&rsp->exp_mutex);
-		return true;
-	}
-	rcu_exp_gp_seq_start(rsp);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
-	return false;
-}
-
-/* Invoked on each online non-idle CPU for expedited quiescent state. */
-static void sync_sched_exp_handler(void *data)
-{
-	struct rcu_data *rdp;
-	struct rcu_node *rnp;
-	struct rcu_state *rsp = data;
-
-	rdp = this_cpu_ptr(rsp->rda);
-	rnp = rdp->mynode;
-	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
-	    __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
-		return;
-	if (rcu_is_cpu_rrupt_from_idle()) {
-		rcu_report_exp_rdp(&rcu_sched_state,
-				   this_cpu_ptr(&rcu_sched_data), true);
-		return;
-	}
-	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
-	resched_cpu(smp_processor_id());
-}
-
-/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
-	struct rcu_data *rdp;
-	int ret;
-	struct rcu_node *rnp;
-	struct rcu_state *rsp = &rcu_sched_state;
-
-	rdp = per_cpu_ptr(rsp->rda, cpu);
-	rnp = rdp->mynode;
-	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
-		return;
-	ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
-	WARN_ON_ONCE(ret);
-}
-
-/*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
- */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
-				     smp_call_func_t func)
-{
-	int cpu;
-	unsigned long flags;
-	unsigned long mask;
-	unsigned long mask_ofl_test;
-	unsigned long mask_ofl_ipi;
-	int ret;
-	struct rcu_node *rnp;
-
-	sync_exp_reset_tree(rsp);
-	rcu_for_each_leaf_node(rsp, rnp) {
-		raw_spin_lock_irqsave_rcu_node(rnp, flags);
-
-		/* Each pass checks a CPU for identity, offline, and idle. */
-		mask_ofl_test = 0;
-		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-			if (raw_smp_processor_id() == cpu ||
-			    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-				mask_ofl_test |= rdp->grpmask;
-		}
-		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
-		/*
-		 * Need to wait for any blocked tasks as well.  Note that
-		 * additional blocking tasks will also block the expedited
-		 * GP until such time as the ->expmask bits are cleared.
-		 */
-		if (rcu_preempt_has_tasks(rnp))
-			rnp->exp_tasks = rnp->blkd_tasks.next;
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
-		/* IPI the remaining CPUs for expedited quiescent state. */
-		mask = 1;
-		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
-			if (!(mask_ofl_ipi & mask))
-				continue;
-retry_ipi:
-			ret = smp_call_function_single(cpu, func, rsp, 0);
-			if (!ret) {
-				mask_ofl_ipi &= ~mask;
-				continue;
-			}
-			/* Failed, raced with offline. */
-			raw_spin_lock_irqsave_rcu_node(rnp, flags);
-			if (cpu_online(cpu) &&
-			    (rnp->expmask & mask)) {
-				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-				schedule_timeout_uninterruptible(1);
-				if (cpu_online(cpu) &&
-				    (rnp->expmask & mask))
-					goto retry_ipi;
-				raw_spin_lock_irqsave_rcu_node(rnp, flags);
-			}
-			if (!(rnp->expmask & mask))
-				mask_ofl_ipi &= ~mask;
-			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-		}
-		/* Report quiescent states for those that went offline. */
-		mask_ofl_test |= mask_ofl_ipi;
-		if (mask_ofl_test)
-			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
-	}
-}
-
-static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
-{
-	int cpu;
-	unsigned long jiffies_stall;
-	unsigned long jiffies_start;
-	unsigned long mask;
-	int ndetected;
-	struct rcu_node *rnp;
-	struct rcu_node *rnp_root = rcu_get_root(rsp);
-	int ret;
-
-	jiffies_stall = rcu_jiffies_till_stall_check();
-	jiffies_start = jiffies;
-
-	for (;;) {
-		ret = swait_event_timeout(
-				rsp->expedited_wq,
-				sync_rcu_preempt_exp_done(rnp_root),
-				jiffies_stall);
-		if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
-			return;
-		if (ret < 0) {
-			/* Hit a signal, disable CPU stall warnings. */
-			swait_event(rsp->expedited_wq,
-				   sync_rcu_preempt_exp_done(rnp_root));
-			return;
-		}
-		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
-		       rsp->name);
-		ndetected = 0;
-		rcu_for_each_leaf_node(rsp, rnp) {
-			ndetected += rcu_print_task_exp_stall(rnp);
-			mask = 1;
-			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
-				struct rcu_data *rdp;
-
-				if (!(rnp->expmask & mask))
-					continue;
-				ndetected++;
-				rdp = per_cpu_ptr(rsp->rda, cpu);
-				pr_cont(" %d-%c%c%c", cpu,
-					"O."[!!cpu_online(cpu)],
-					"o."[!!(rdp->grpmask & rnp->expmaskinit)],
-					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
-			}
-			mask <<= 1;
-		}
-		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
-			jiffies - jiffies_start, rsp->expedited_sequence,
-			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
-		if (ndetected) {
-			pr_err("blocking rcu_node structures:");
-			rcu_for_each_node_breadth_first(rsp, rnp) {
-				if (rnp == rnp_root)
-					continue; /* printed unconditionally */
-				if (sync_rcu_preempt_exp_done(rnp))
-					continue;
-				pr_cont(" l=%u:%d-%d:%#lx/%c",
-					rnp->level, rnp->grplo, rnp->grphi,
-					rnp->expmask,
-					".T"[!!rnp->exp_tasks]);
-			}
-			pr_cont("\n");
-		}
-		rcu_for_each_leaf_node(rsp, rnp) {
-			mask = 1;
-			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
-				if (!(rnp->expmask & mask))
-					continue;
-				dump_cpu_task(cpu);
-			}
-		}
-		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
-	}
-}
-
-/*
- * Wait for the current expedited grace period to complete, and then
- * wake up everyone who piggybacked on the just-completed expedited
- * grace period.  Also update all the ->exp_seq_rq counters as needed
- * in order to avoid counter-wrap problems.
- */
-static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
-{
-	struct rcu_node *rnp;
-
-	synchronize_sched_expedited_wait(rsp);
-	rcu_exp_gp_seq_end(rsp);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-
-	/*
-	 * Switch over to wakeup mode, allowing the next GP, but -only- the
-	 * next GP, to proceed.
-	 */
-	mutex_lock(&rsp->exp_wake_mutex);
-	mutex_unlock(&rsp->exp_mutex);
-
-	rcu_for_each_node_breadth_first(rsp, rnp) {
-		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
-			spin_lock(&rnp->exp_lock);
-			/* Recheck, avoid hang in case someone just arrived. */
-			if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
-				rnp->exp_seq_rq = s;
-			spin_unlock(&rnp->exp_lock);
-		}
-		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
-	}
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
-	mutex_unlock(&rsp->exp_wake_mutex);
-}
-
-/**
- * synchronize_sched_expedited - Brute-force RCU-sched grace period
- *
- * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly.  This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code.  In fact,
- * if you are using synchronize_sched_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_sched() instead.
- *
- * This implementation can be thought of as an application of sequence
- * locking to expedited grace periods, but using the sequence counter to
- * determine when someone else has already done the work instead of for
- * retrying readers.
- */
-void synchronize_sched_expedited(void)
-{
-	unsigned long s;
-	struct rcu_state *rsp = &rcu_sched_state;
-
-	/* If only one CPU, this is automatically a grace period. */
-	if (rcu_blocking_is_gp())
-		return;
-
-	/* If expedited grace periods are prohibited, fall back to normal. */
-	if (rcu_gp_is_normal()) {
-		wait_rcu_gp(call_rcu_sched);
-		return;
-	}
-
-	/* Take a snapshot of the sequence number.  */
-	s = rcu_exp_gp_seq_snap(rsp);
-	if (exp_funnel_lock(rsp, s))
-		return;  /* Someone else did our work for us. */
-
-	/* Initialize the rcu_node tree in preparation for the wait. */
-	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
-
-	/* Wait and clean up, including waking everyone. */
-	rcu_exp_wait_wake(rsp, s);
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
 /*
  * Check to see if there is any immediate RCU-related work to be done
  * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -4747,4 +4205,5 @@ void __init rcu_init(void)
 		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
 }
 
+#include "tree_exp.h"
 #include "tree_plugin.h"
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
new file mode 100644
index 000000000000..db0909cf7fe1
--- /dev/null
+++ b/kernel/rcu/tree_exp.h
@@ -0,0 +1,564 @@
+/*
+ * RCU expedited grace periods
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2016
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+/* Wrapper functions for expedited grace periods.  */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+	rcu_seq_start(&rsp->expedited_sequence);
+}
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+	rcu_seq_end(&rsp->expedited_sequence);
+	smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+	unsigned long s;
+
+	smp_mb(); /* Caller's modifications seen first by other CPUs. */
+	s = rcu_seq_snap(&rsp->expedited_sequence);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
+	return s;
+}
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+	return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/*
+ * Reset the ->expmaskinit values in the rcu_node tree to reflect any
+ * recent CPU-online activity.  Note that these masks are not cleared
+ * when CPUs go offline, so they reflect the union of all CPUs that have
+ * ever been online.  This means that this function normally takes its
+ * no-work-to-do fastpath.
+ */
+static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
+{
+	bool done;
+	unsigned long flags;
+	unsigned long mask;
+	unsigned long oldmask;
+	int ncpus = READ_ONCE(rsp->ncpus);
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_up;
+
+	/* If no new CPUs onlined since last time, nothing to do. */
+	if (likely(ncpus == rsp->ncpus_snap))
+		return;
+	rsp->ncpus_snap = ncpus;
+
+	/*
+	 * Each pass through the following loop propagates newly onlined
+	 * CPUs for the current rcu_node structure up the rcu_node tree.
+	 */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		if (rnp->expmaskinit == rnp->expmaskinitnext) {
+			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			continue;  /* No new CPUs, nothing to do. */
+		}
+
+		/* Update this node's mask, track old value for propagation. */
+		oldmask = rnp->expmaskinit;
+		rnp->expmaskinit = rnp->expmaskinitnext;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+		/* If was already nonzero, nothing to propagate. */
+		if (oldmask)
+			continue;
+
+		/* Propagate the new CPU up the tree. */
+		mask = rnp->grpmask;
+		rnp_up = rnp->parent;
+		done = false;
+		while (rnp_up) {
+			raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
+			if (rnp_up->expmaskinit)
+				done = true;
+			rnp_up->expmaskinit |= mask;
+			raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
+			if (done)
+				break;
+			mask = rnp_up->grpmask;
+			rnp_up = rnp_up->parent;
+		}
+	}
+}
+
+/*
+ * Reset the ->expmask values in the rcu_node tree in preparation for
+ * a new expedited grace period.
+ */
+static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp;
+
+	sync_exp_reset_tree_hotplug(rsp);
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		WARN_ON_ONCE(rnp->expmask);
+		rnp->expmask = rnp->expmaskinit;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+}
+
+/*
+ * Return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period.  Works only for preemptible
+ * RCU -- other RCU implementation use other means.
+ *
+ * Caller must hold the rcu_state's exp_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+	return rnp->exp_tasks == NULL &&
+	       READ_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.  This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree.  (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
+ * structure's ->lock.
+ */
+static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+				 bool wake, unsigned long flags)
+	__releases(rnp->lock)
+{
+	unsigned long mask;
+
+	for (;;) {
+		if (!sync_rcu_preempt_exp_done(rnp)) {
+			if (!rnp->expmask)
+				rcu_initiate_boost(rnp, flags);
+			else
+				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			break;
+		}
+		if (rnp->parent == NULL) {
+			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			if (wake) {
+				smp_mb(); /* EGP done before wake_up(). */
+				swake_up(&rsp->expedited_wq);
+			}
+			break;
+		}
+		mask = rnp->grpmask;
+		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
+		rnp = rnp->parent;
+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
+		WARN_ON_ONCE(!(rnp->expmask & mask));
+		rnp->expmask &= ~mask;
+	}
+}
+
+/*
+ * Report expedited quiescent state for specified node.  This is a
+ * lock-acquisition wrapper function for __rcu_report_exp_rnp().
+ *
+ * Caller must hold the rcu_state's exp_mutex.
+ */
+static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
+					      struct rcu_node *rnp, bool wake)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	__rcu_report_exp_rnp(rsp, rnp, wake, flags);
+}
+
+/*
+ * Report expedited quiescent state for multiple CPUs, all covered by the
+ * specified leaf rcu_node structure.  Caller must hold the rcu_state's
+ * exp_mutex.
+ */
+static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
+				    unsigned long mask, bool wake)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	if (!(rnp->expmask & mask)) {
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+		return;
+	}
+	rnp->expmask &= ~mask;
+	__rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
+}
+
+/*
+ * Report expedited quiescent state for specified rcu_data (CPU).
+ */
+static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
+			       bool wake)
+{
+	rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
+}
+
+/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
+static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
+			       unsigned long s)
+{
+	if (rcu_exp_gp_seq_done(rsp, s)) {
+		trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
+		/* Ensure test happens before caller kfree(). */
+		smp_mb__before_atomic(); /* ^^^ */
+		atomic_long_inc(stat);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods.  Returns true
+ * if some other task completed an expedited grace period that this task
+ * can piggy-back on, and with no mutex held.  Otherwise, returns false
+ * with the mutex held, indicating that the caller must actually do the
+ * expedited grace period.
+ */
+static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+	struct rcu_node *rnp = rdp->mynode;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+	/* Low-contention fastpath. */
+	if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
+	    (rnp == rnp_root ||
+	     ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
+	    !mutex_is_locked(&rsp->exp_mutex) &&
+	    mutex_trylock(&rsp->exp_mutex))
+		goto fastpath;
+
+	/*
+	 * Each pass through the following loop works its way up
+	 * the rcu_node tree, returning if others have done the work or
+	 * otherwise falls through to acquire rsp->exp_mutex.  The mapping
+	 * from CPU to rcu_node structure can be inexact, as it is just
+	 * promoting locality and is not strictly needed for correctness.
+	 */
+	for (; rnp != NULL; rnp = rnp->parent) {
+		if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+			return true;
+
+		/* Work not done, either wait here or go up. */
+		spin_lock(&rnp->exp_lock);
+		if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
+
+			/* Someone else doing GP, so wait for them. */
+			spin_unlock(&rnp->exp_lock);
+			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+						  rnp->grplo, rnp->grphi,
+						  TPS("wait"));
+			wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+				   sync_exp_work_done(rsp,
+						      &rdp->exp_workdone2, s));
+			return true;
+		}
+		rnp->exp_seq_rq = s; /* Followers can wait on us. */
+		spin_unlock(&rnp->exp_lock);
+		trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
+					  rnp->grphi, TPS("nxtlvl"));
+	}
+	mutex_lock(&rsp->exp_mutex);
+fastpath:
+	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+		mutex_unlock(&rsp->exp_mutex);
+		return true;
+	}
+	rcu_exp_gp_seq_start(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
+	return false;
+}
+
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static void sync_sched_exp_handler(void *data)
+{
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp = data;
+
+	rdp = this_cpu_ptr(rsp->rda);
+	rnp = rdp->mynode;
+	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+	    __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+		return;
+	if (rcu_is_cpu_rrupt_from_idle()) {
+		rcu_report_exp_rdp(&rcu_sched_state,
+				   this_cpu_ptr(&rcu_sched_data), true);
+		return;
+	}
+	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+	resched_cpu(smp_processor_id());
+}
+
+/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
+static void sync_sched_exp_online_cleanup(int cpu)
+{
+	struct rcu_data *rdp;
+	int ret;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp = &rcu_sched_state;
+
+	rdp = per_cpu_ptr(rsp->rda, cpu);
+	rnp = rdp->mynode;
+	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
+		return;
+	ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
+	WARN_ON_ONCE(ret);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+				     smp_call_func_t func)
+{
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	unsigned long mask_ofl_test;
+	unsigned long mask_ofl_ipi;
+	int ret;
+	struct rcu_node *rnp;
+
+	sync_exp_reset_tree(rsp);
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+
+		/* Each pass checks a CPU for identity, offline, and idle. */
+		mask_ofl_test = 0;
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+			if (raw_smp_processor_id() == cpu ||
+			    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+				mask_ofl_test |= rdp->grpmask;
+		}
+		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
+
+		/*
+		 * Need to wait for any blocked tasks as well.  Note that
+		 * additional blocking tasks will also block the expedited
+		 * GP until such time as the ->expmask bits are cleared.
+		 */
+		if (rcu_preempt_has_tasks(rnp))
+			rnp->exp_tasks = rnp->blkd_tasks.next;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+		/* IPI the remaining CPUs for expedited quiescent state. */
+		mask = 1;
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+			if (!(mask_ofl_ipi & mask))
+				continue;
+retry_ipi:
+			ret = smp_call_function_single(cpu, func, rsp, 0);
+			if (!ret) {
+				mask_ofl_ipi &= ~mask;
+				continue;
+			}
+			/* Failed, raced with offline. */
+			raw_spin_lock_irqsave_rcu_node(rnp, flags);
+			if (cpu_online(cpu) &&
+			    (rnp->expmask & mask)) {
+				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+				schedule_timeout_uninterruptible(1);
+				if (cpu_online(cpu) &&
+				    (rnp->expmask & mask))
+					goto retry_ipi;
+				raw_spin_lock_irqsave_rcu_node(rnp, flags);
+			}
+			if (!(rnp->expmask & mask))
+				mask_ofl_ipi &= ~mask;
+			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+		}
+		/* Report quiescent states for those that went offline. */
+		mask_ofl_test |= mask_ofl_ipi;
+		if (mask_ofl_test)
+			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+	}
+}
+
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+	int cpu;
+	unsigned long jiffies_stall;
+	unsigned long jiffies_start;
+	unsigned long mask;
+	int ndetected;
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
+	int ret;
+
+	jiffies_stall = rcu_jiffies_till_stall_check();
+	jiffies_start = jiffies;
+
+	for (;;) {
+		ret = swait_event_timeout(
+				rsp->expedited_wq,
+				sync_rcu_preempt_exp_done(rnp_root),
+				jiffies_stall);
+		if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
+			return;
+		if (ret < 0) {
+			/* Hit a signal, disable CPU stall warnings. */
+			swait_event(rsp->expedited_wq,
+				   sync_rcu_preempt_exp_done(rnp_root));
+			return;
+		}
+		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
+		       rsp->name);
+		ndetected = 0;
+		rcu_for_each_leaf_node(rsp, rnp) {
+			ndetected += rcu_print_task_exp_stall(rnp);
+			mask = 1;
+			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+				struct rcu_data *rdp;
+
+				if (!(rnp->expmask & mask))
+					continue;
+				ndetected++;
+				rdp = per_cpu_ptr(rsp->rda, cpu);
+				pr_cont(" %d-%c%c%c", cpu,
+					"O."[!!cpu_online(cpu)],
+					"o."[!!(rdp->grpmask & rnp->expmaskinit)],
+					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+			}
+			mask <<= 1;
+		}
+		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+			jiffies - jiffies_start, rsp->expedited_sequence,
+			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+		if (ndetected) {
+			pr_err("blocking rcu_node structures:");
+			rcu_for_each_node_breadth_first(rsp, rnp) {
+				if (rnp == rnp_root)
+					continue; /* printed unconditionally */
+				if (sync_rcu_preempt_exp_done(rnp))
+					continue;
+				pr_cont(" l=%u:%d-%d:%#lx/%c",
+					rnp->level, rnp->grplo, rnp->grphi,
+					rnp->expmask,
+					".T"[!!rnp->exp_tasks]);
+			}
+			pr_cont("\n");
+		}
+		rcu_for_each_leaf_node(rsp, rnp) {
+			mask = 1;
+			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+				if (!(rnp->expmask & mask))
+					continue;
+				dump_cpu_task(cpu);
+			}
+		}
+		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+	}
+}
+
+/*
+ * Wait for the current expedited grace period to complete, and then
+ * wake up everyone who piggybacked on the just-completed expedited
+ * grace period.  Also update all the ->exp_seq_rq counters as needed
+ * in order to avoid counter-wrap problems.
+ */
+static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
+{
+	struct rcu_node *rnp;
+
+	synchronize_sched_expedited_wait(rsp);
+	rcu_exp_gp_seq_end(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
+
+	/*
+	 * Switch over to wakeup mode, allowing the next GP, but -only- the
+	 * next GP, to proceed.
+	 */
+	mutex_lock(&rsp->exp_wake_mutex);
+	mutex_unlock(&rsp->exp_mutex);
+
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
+			spin_lock(&rnp->exp_lock);
+			/* Recheck, avoid hang in case someone just arrived. */
+			if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
+				rnp->exp_seq_rq = s;
+			spin_unlock(&rnp->exp_lock);
+		}
+		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
+	}
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+	mutex_unlock(&rsp->exp_wake_mutex);
+}
+
+/**
+ * synchronize_sched_expedited - Brute-force RCU-sched grace period
+ *
+ * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly.  This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * so is thus not recommended for any sort of common-case code.  In fact,
+ * if you are using synchronize_sched_expedited() in a loop, please
+ * restructure your code to batch your updates, and then use a single
+ * synchronize_sched() instead.
+ *
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
+ */
+void synchronize_sched_expedited(void)
+{
+	unsigned long s;
+	struct rcu_state *rsp = &rcu_sched_state;
+
+	/* If only one CPU, this is automatically a grace period. */
+	if (rcu_blocking_is_gp())
+		return;
+
+	/* If expedited grace periods are prohibited, fall back to normal. */
+	if (rcu_gp_is_normal()) {
+		wait_rcu_gp(call_rcu_sched);
+		return;
+	}
+
+	/* Take a snapshot of the sequence number.  */
+	s = rcu_exp_gp_seq_snap(rsp);
+	if (exp_funnel_lock(rsp, s))
+		return;  /* Someone else did our work for us. */
+
+	/* Initialize the rcu_node tree in preparation for the wait. */
+	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
+
+	/* Wait and clean up, including waking everyone. */
+	rcu_exp_wait_wake(rsp, s);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-- 
cgit v1.3-8-gc7d7


From 40e0a6cfd53e37d9b8863cdbc0adb1f72e9311e7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 15 Apr 2016 16:44:07 -0700
Subject: rcu: Move expedited code from tree_plugin.h to tree_exp.h

People have been having some difficulty finding their way around the
RCU code.  This commit therefore pulls some of the expedited grace-period
code from tree_plugin.h to a new tree_exp.h file.  This commit is strictly
code movement.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_exp.h    | 94 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcu/tree_plugin.h | 88 ---------------------------------------------
 2 files changed, 94 insertions(+), 88 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index db0909cf7fe1..00a02a231ada 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -562,3 +562,97 @@ void synchronize_sched_expedited(void)
 	rcu_exp_wait_wake(rsp, s);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Remote handler for smp_call_function_single().  If there is an
+ * RCU read-side critical section in effect, request that the
+ * next rcu_read_unlock() record the quiescent state up the
+ * ->expmask fields in the rcu_node tree.  Otherwise, immediately
+ * report the quiescent state.
+ */
+static void sync_rcu_exp_handler(void *info)
+{
+	struct rcu_data *rdp;
+	struct rcu_state *rsp = info;
+	struct task_struct *t = current;
+
+	/*
+	 * Within an RCU read-side critical section, request that the next
+	 * rcu_read_unlock() report.  Unless this RCU read-side critical
+	 * section has already blocked, in which case it is already set
+	 * up for the expedited grace period to wait on it.
+	 */
+	if (t->rcu_read_lock_nesting > 0 &&
+	    !t->rcu_read_unlock_special.b.blocked) {
+		t->rcu_read_unlock_special.b.exp_need_qs = true;
+		return;
+	}
+
+	/*
+	 * We are either exiting an RCU read-side critical section (negative
+	 * values of t->rcu_read_lock_nesting) or are not in one at all
+	 * (zero value of t->rcu_read_lock_nesting).  Or we are in an RCU
+	 * read-side critical section that blocked before this expedited
+	 * grace period started.  Either way, we can immediately report
+	 * the quiescent state.
+	 */
+	rdp = this_cpu_ptr(rsp->rda);
+	rcu_report_exp_rdp(rsp, rdp, true);
+}
+
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU-preempt grace period, but expedite it.  The basic
+ * idea is to IPI all non-idle non-nohz online CPUs.  The IPI handler
+ * checks whether the CPU is in an RCU-preempt critical section, and
+ * if so, it sets a flag that causes the outermost rcu_read_unlock()
+ * to report the quiescent state.  On the other hand, if the CPU is
+ * not in an RCU read-side critical section, the IPI handler reports
+ * the quiescent state immediately.
+ *
+ * Although this is a greate improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code.  In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then Use a single synchronize_rcu()
+ * instead.
+ */
+void synchronize_rcu_expedited(void)
+{
+	struct rcu_state *rsp = rcu_state_p;
+	unsigned long s;
+
+	/* If expedited grace periods are prohibited, fall back to normal. */
+	if (rcu_gp_is_normal()) {
+		wait_rcu_gp(call_rcu);
+		return;
+	}
+
+	s = rcu_exp_gp_seq_snap(rsp);
+	if (exp_funnel_lock(rsp, s))
+		return;  /* Someone else did our work for us. */
+
+	/* Initialize the rcu_node tree in preparation for the wait. */
+	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
+
+	/* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
+	rcu_exp_wait_wake(rsp, s);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptible RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+	synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ff1cd4e1188d..695071dd1e9c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -681,84 +681,6 @@ void synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
-/*
- * Remote handler for smp_call_function_single().  If there is an
- * RCU read-side critical section in effect, request that the
- * next rcu_read_unlock() record the quiescent state up the
- * ->expmask fields in the rcu_node tree.  Otherwise, immediately
- * report the quiescent state.
- */
-static void sync_rcu_exp_handler(void *info)
-{
-	struct rcu_data *rdp;
-	struct rcu_state *rsp = info;
-	struct task_struct *t = current;
-
-	/*
-	 * Within an RCU read-side critical section, request that the next
-	 * rcu_read_unlock() report.  Unless this RCU read-side critical
-	 * section has already blocked, in which case it is already set
-	 * up for the expedited grace period to wait on it.
-	 */
-	if (t->rcu_read_lock_nesting > 0 &&
-	    !t->rcu_read_unlock_special.b.blocked) {
-		t->rcu_read_unlock_special.b.exp_need_qs = true;
-		return;
-	}
-
-	/*
-	 * We are either exiting an RCU read-side critical section (negative
-	 * values of t->rcu_read_lock_nesting) or are not in one at all
-	 * (zero value of t->rcu_read_lock_nesting).  Or we are in an RCU
-	 * read-side critical section that blocked before this expedited
-	 * grace period started.  Either way, we can immediately report
-	 * the quiescent state.
-	 */
-	rdp = this_cpu_ptr(rsp->rda);
-	rcu_report_exp_rdp(rsp, rdp, true);
-}
-
-/**
- * synchronize_rcu_expedited - Brute-force RCU grace period
- *
- * Wait for an RCU-preempt grace period, but expedite it.  The basic
- * idea is to IPI all non-idle non-nohz online CPUs.  The IPI handler
- * checks whether the CPU is in an RCU-preempt critical section, and
- * if so, it sets a flag that causes the outermost rcu_read_unlock()
- * to report the quiescent state.  On the other hand, if the CPU is
- * not in an RCU read-side critical section, the IPI handler reports
- * the quiescent state immediately.
- *
- * Although this is a greate improvement over previous expedited
- * implementations, it is still unfriendly to real-time workloads, so is
- * thus not recommended for any sort of common-case code.  In fact, if
- * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
- * instead.
- */
-void synchronize_rcu_expedited(void)
-{
-	struct rcu_state *rsp = rcu_state_p;
-	unsigned long s;
-
-	/* If expedited grace periods are prohibited, fall back to normal. */
-	if (rcu_gp_is_normal()) {
-		wait_rcu_gp(call_rcu);
-		return;
-	}
-
-	s = rcu_exp_gp_seq_snap(rsp);
-	if (exp_funnel_lock(rsp, s))
-		return;  /* Someone else did our work for us. */
-
-	/* Initialize the rcu_node tree in preparation for the wait. */
-	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
-
-	/* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
-	rcu_exp_wait_wake(rsp, s);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
 /**
  * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
  *
@@ -882,16 +804,6 @@ static void rcu_preempt_check_callbacks(void)
 {
 }
 
-/*
- * Wait for an rcu-preempt grace period, but make it happen quickly.
- * But because preemptible RCU does not exist, map to rcu-sched.
- */
-void synchronize_rcu_expedited(void)
-{
-	synchronize_sched_expedited();
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
 /*
  * Because preemptible RCU does not exist, rcu_barrier() is just
  * another name for rcu_barrier_sched().
-- 
cgit v1.3-8-gc7d7


From f8cbdee99b161cb08c3fb55200941028c5fe25c8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 19 Apr 2016 13:03:18 -0700
Subject: torture: Simplify code, eliminate RCU_PERF_TEST_RUNNABLE

This commit applies the infamous IS_ENABLED() macro to eliminate a #ifdef.
It also eliminates the RCU_PERF_TEST_RUNNABLE Kconfig option in favor
of the already-existing rcuperf.perf_runnable kernel boot parameter.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c |  7 +------
 lib/Kconfig.debug    | 16 ----------------
 2 files changed, 1 insertion(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cee0d8393ed..afd174e901c3 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -96,12 +96,7 @@ static int rcu_perf_writer_state;
 #define MAX_MEAS 10000
 #define MIN_MEAS 100
 
-#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE)
-#define RCUPERF_RUNNABLE_INIT 1
-#else
-#define RCUPERF_RUNNABLE_INIT 0
-#endif
-static int perf_runnable = RCUPERF_RUNNABLE_INIT;
+static int perf_runnable = IS_ENABLED(MODULE);
 module_param(perf_runnable, int, 0444);
 MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b9cfdbfae9aa..cf6ddcd8f70c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1307,22 +1307,6 @@ config RCU_PERF_TEST
 	  Say M if you want the RCU performance tests to build as a module.
 	  Say N if you are unsure.
 
-config RCU_PERF_TEST_RUNNABLE
-	bool "performance tests for RCU runnable by default"
-	depends on RCU_PERF_TEST = y
-	default n
-	help
-	  This option provides a way to build the RCU performance tests
-	  directly into the kernel without them starting up at boot time.
-	  You can use /sys/module to manually override this setting.
-	  This /proc file is available only when the RCU performance
-	  tests have been built into the kernel.
-
-	  Say Y here if you want the RCU performance tests to start during
-	  boot (you probably don't).
-	  Say N here if you want the RCU performance tests to start only
-	  after being manually enabled via /sys/module.
-
 config RCU_TORTURE_TEST
 	tristate "torture tests for RCU"
 	depends on DEBUG_KERNEL
-- 
cgit v1.3-8-gc7d7


From 4e9a073f60367157fd64b65490654c39d4c44321 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 19 Apr 2016 14:42:34 -0700
Subject: torture: Remove CONFIG_RCU_TORTURE_TEST_RUNNABLE, simplify code

This commit removes CONFIG_RCU_TORTURE_TEST_RUNNABLE in favor of the
already-existing rcutorture.torture_runnable kernel boot parameter.
It also converts an #ifdef into IS_ENABLED(), saving a few lines of code.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c  |  7 +------
 kernel/rcu/tree_plugin.h |  2 --
 lib/Kconfig.debug        | 17 -----------------
 3 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 084a28a732eb..01cb57ff106f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void)
 	return rcu_torture_writer_state_names[i];
 }
 
-#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
-#define RCUTORTURE_RUNNABLE_INIT 1
-#else
-#define RCUTORTURE_RUNNABLE_INIT 0
-#endif
-static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
+static int torture_runnable = IS_ENABLED(MODULE);
 module_param(torture_runnable, int, 0444);
 MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ff1cd4e1188d..5b2d723e6568 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
 	if (IS_ENABLED(CONFIG_PROVE_RCU))
 		pr_info("\tRCU lockdep checking is enabled.\n");
-	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
-		pr_info("\tRCU torture testing starts during boot.\n");
 	if (RCU_NUM_LVLS >= 4)
 		pr_info("\tFour(or more)-level hierarchy is enabled.\n");
 	if (RCU_FANOUT_LEAF != 16)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cf6ddcd8f70c..805b7048a1bd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1324,23 +1324,6 @@ config RCU_TORTURE_TEST
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
-config RCU_TORTURE_TEST_RUNNABLE
-	bool "torture tests for RCU runnable by default"
-	depends on RCU_TORTURE_TEST = y
-	default n
-	help
-	  This option provides a way to build the RCU torture tests
-	  directly into the kernel without them starting up at boot
-	  time.  You can use /proc/sys/kernel/rcutorture_runnable
-	  to manually override this setting.  This /proc file is
-	  available only when the RCU torture tests have been built
-	  into the kernel.
-
-	  Say Y here if you want the RCU torture tests to start during
-	  boot (you probably don't).
-	  Say N here if you want the RCU torture tests to start only
-	  after being manually enabled via /proc.
-
 config RCU_TORTURE_TEST_SLOW_PREINIT
 	bool "Slow down RCU grace-period pre-initialization to expose races"
 	depends on RCU_TORTURE_TEST
-- 
cgit v1.3-8-gc7d7


From d95f5ba90fa6043a366958897fdef705af968b70 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 20 Apr 2016 17:18:41 -0700
Subject: torture:  Break online and offline functions out of torture_onoff()

This commit breaks torture_online() and torture_offline() out of
torture_onoff() in preparation for allowing waketorture finer-grained
control of its CPU-hotplug workload.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/torture.h |   4 ++
 kernel/torture.c        | 168 ++++++++++++++++++++++++++++++------------------
 2 files changed, 108 insertions(+), 64 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 7759fc3c622d..6685a73736a2 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -50,6 +50,10 @@
 	do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0)
 
 /* Definitions for online/offline exerciser. */
+bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes,
+		     unsigned long *sum_offl, int *min_onl, int *max_onl);
+bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
+		    unsigned long *sum_onl, int *min_onl, int *max_onl);
 int torture_onoff_init(long ooholdoff, long oointerval);
 void torture_onoff_stats(void);
 bool torture_onoff_failures(void);
diff --git a/kernel/torture.c b/kernel/torture.c
index fa0bdeee17ac..fb39a06bbef5 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -81,6 +81,104 @@ static unsigned long sum_online;
 static int min_online = -1;
 static int max_online;
 
+/*
+ * Attempt to take a CPU offline.  Return false if the CPU is already
+ * offline or if it is not subject to CPU-hotplug operations.  The
+ * caller can detect other failures by looking at the statistics.
+ */
+bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
+		     unsigned long *sum_offl, int *min_offl, int *max_offl)
+{
+	unsigned long delta;
+	int ret;
+	unsigned long starttime;
+
+	if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
+		return false;
+
+	if (verbose)
+		pr_alert("%s" TORTURE_FLAG
+			 "torture_onoff task: offlining %d\n",
+			 torture_type, cpu);
+	starttime = jiffies;
+	(*n_offl_attempts)++;
+	ret = cpu_down(cpu);
+	if (ret) {
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "torture_onoff task: offline %d failed: errno %d\n",
+				 torture_type, cpu, ret);
+	} else {
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "torture_onoff task: offlined %d\n",
+				 torture_type, cpu);
+		(*n_offl_successes)++;
+		delta = jiffies - starttime;
+		sum_offl += delta;
+		if (*min_offl < 0) {
+			*min_offl = delta;
+			*max_offl = delta;
+		}
+		if (*min_offl > delta)
+			*min_offl = delta;
+		if (*max_offl < delta)
+			*max_offl = delta;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(torture_offline);
+
+/*
+ * Attempt to bring a CPU online.  Return false if the CPU is already
+ * online or if it is not subject to CPU-hotplug operations.  The
+ * caller can detect other failures by looking at the statistics.
+ */
+bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
+		    unsigned long *sum_onl, int *min_onl, int *max_onl)
+{
+	unsigned long delta;
+	int ret;
+	unsigned long starttime;
+
+	if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
+		return false;
+
+	if (verbose)
+		pr_alert("%s" TORTURE_FLAG
+			 "torture_onoff task: onlining %d\n",
+			 torture_type, cpu);
+	starttime = jiffies;
+	(*n_onl_attempts)++;
+	ret = cpu_up(cpu);
+	if (ret) {
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "torture_onoff task: online %d failed: errno %d\n",
+				 torture_type, cpu, ret);
+	} else {
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "torture_onoff task: onlined %d\n",
+				 torture_type, cpu);
+		(*n_onl_successes)++;
+		delta = jiffies - starttime;
+		*sum_onl += delta;
+		if (*min_onl < 0) {
+			*min_onl = delta;
+			*max_onl = delta;
+		}
+		if (*min_onl > delta)
+			*min_onl = delta;
+		if (*max_onl < delta)
+			*max_onl = delta;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(torture_online);
+
 /*
  * Execute random CPU-hotplug operations at the interval specified
  * by the onoff_interval.
@@ -89,11 +187,8 @@ static int
 torture_onoff(void *arg)
 {
 	int cpu;
-	unsigned long delta;
 	int maxcpu = -1;
 	DEFINE_TORTURE_RANDOM(rand);
-	int ret;
-	unsigned long starttime;
 
 	VERBOSE_TOROUT_STRING("torture_onoff task started");
 	for_each_online_cpu(cpu)
@@ -106,67 +201,12 @@ torture_onoff(void *arg)
 	}
 	while (!torture_must_stop()) {
 		cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
-		if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
-			if (verbose)
-				pr_alert("%s" TORTURE_FLAG
-					 "torture_onoff task: offlining %d\n",
-					 torture_type, cpu);
-			starttime = jiffies;
-			n_offline_attempts++;
-			ret = cpu_down(cpu);
-			if (ret) {
-				if (verbose)
-					pr_alert("%s" TORTURE_FLAG
-						 "torture_onoff task: offline %d failed: errno %d\n",
-						 torture_type, cpu, ret);
-			} else {
-				if (verbose)
-					pr_alert("%s" TORTURE_FLAG
-						 "torture_onoff task: offlined %d\n",
-						 torture_type, cpu);
-				n_offline_successes++;
-				delta = jiffies - starttime;
-				sum_offline += delta;
-				if (min_offline < 0) {
-					min_offline = delta;
-					max_offline = delta;
-				}
-				if (min_offline > delta)
-					min_offline = delta;
-				if (max_offline < delta)
-					max_offline = delta;
-			}
-		} else if (cpu_is_hotpluggable(cpu)) {
-			if (verbose)
-				pr_alert("%s" TORTURE_FLAG
-					 "torture_onoff task: onlining %d\n",
-					 torture_type, cpu);
-			starttime = jiffies;
-			n_online_attempts++;
-			ret = cpu_up(cpu);
-			if (ret) {
-				if (verbose)
-					pr_alert("%s" TORTURE_FLAG
-						 "torture_onoff task: online %d failed: errno %d\n",
-						 torture_type, cpu, ret);
-			} else {
-				if (verbose)
-					pr_alert("%s" TORTURE_FLAG
-						 "torture_onoff task: onlined %d\n",
-						 torture_type, cpu);
-				n_online_successes++;
-				delta = jiffies - starttime;
-				sum_online += delta;
-				if (min_online < 0) {
-					min_online = delta;
-					max_online = delta;
-				}
-				if (min_online > delta)
-					min_online = delta;
-				if (max_online < delta)
-					max_online = delta;
-			}
-		}
+		if (!torture_offline(cpu,
+				     &n_offline_attempts, &n_offline_successes,
+				     &sum_offline, &min_offline, &max_offline))
+			torture_online(cpu,
+				       &n_online_attempts, &n_online_successes,
+				       &sum_online, &min_online, &max_online);
 		schedule_timeout_interruptible(onoff_interval);
 	}
 	torture_kthread_stopping("torture_onoff");
-- 
cgit v1.3-8-gc7d7


From 750db0f5f7d0ff6b86158015f02c275702639b20 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Mon, 2 May 2016 10:30:00 +0800
Subject: torture: Stop onoff task if there is only one cpu

If the whole system has only one cpu, that cpu won't be able to be
offlined, so there is no need onoff task is stil running.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/torture.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index fb39a06bbef5..75961b3decfe 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -194,6 +194,12 @@ torture_onoff(void *arg)
 	for_each_online_cpu(cpu)
 		maxcpu = cpu;
 	WARN_ON(maxcpu < 0);
+
+	if (maxcpu == 0) {
+		VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
+		goto stop;
+	}
+
 	if (onoff_holdoff > 0) {
 		VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
 		schedule_timeout_interruptible(onoff_holdoff);
@@ -209,6 +215,8 @@ torture_onoff(void *arg)
 				       &sum_online, &min_online, &max_online);
 		schedule_timeout_interruptible(onoff_interval);
 	}
+
+stop:
 	torture_kthread_stopping("torture_onoff");
 	return 0;
 }
-- 
cgit v1.3-8-gc7d7


From af06d4f74a7d2132c805339bfd5ab771b5706f42 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Wed, 25 May 2016 09:25:33 +0800
Subject: rcuperf: Don't treat gp_exp mis-setting as a WARN

0day found a boot warning triggered in rcu_perf_writer() on !SMP kernel:

	WARN_ON(rcu_gp_is_normal() && gp_exp);

, the root cause of which is trying to measure expedited grace
periods(by setting gp_exp to true by default) when all the grace periods
are normal(TINY RCU only has normal grace periods).

However, such a mis-setting would only result in failing to measure the
performance for a specific kind of grace periods, therefore using a
WARN_ON to check this is a little overkilling. We could handle this
inside rcuperf module via some error messages to tell users about the
mis-settings.

Therefore this patch removes the WARN_ON in rcu_perf_writer() and
handles those checkings in rcu_perf_init() with plain if() code.

Moreover, this patch changes the default value of gp_exp to 1) align
with rcutorture tests and 2) make the default setting work for all RCU
implementations by default.

Suggested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Fixes: http://lkml.kernel.org/r/57411b10.mFvG0+AgcrMXGtcj%fengguang.wu@intel.com
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index afd174e901c3..7b2dbdffd791 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
 #define VERBOSE_PERFOUT_ERRSTRING(s) \
 	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
 
-torture_param(bool, gp_exp, true, "Use expedited GP wait primitives");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
 torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
 torture_param(int, nreaders, -1, "Number of RCU reader threads");
 torture_param(int, nwriters, -1, "Number of RCU updater threads");
@@ -358,8 +358,6 @@ rcu_perf_writer(void *arg)
 	u64 *wdpp = writer_durations[me];
 
 	VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
-	WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
-	WARN_ON(rcu_gp_is_normal() && gp_exp);
 	WARN_ON(!wdpp);
 	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
 	sp.sched_priority = 1;
@@ -626,6 +624,16 @@ rcu_perf_init(void)
 		firsterr = -ENOMEM;
 		goto unwind;
 	}
+	if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
+		VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+		firsterr = -EINVAL;
+		goto unwind;
+	}
+	if (rcu_gp_is_normal() && gp_exp) {
+		VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+		firsterr = -EINVAL;
+		goto unwind;
+	}
 	for (i = 0; i < nrealwriters; i++) {
 		writer_durations[i] =
 			kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
-- 
cgit v1.3-8-gc7d7


From 05dbbfe753792dcebb6b85d84fa5926f09723cfe Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 13 Jun 2016 15:20:39 +0000
Subject: rcutorture: Fix error return code in rcu_perf_init()

Fix to return a negative error code -ENOMEM from kcalloc() error
handling case instead of 0, as done elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7b2dbdffd791..d38ab08a3fe7 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -638,8 +638,10 @@ rcu_perf_init(void)
 		writer_durations[i] =
 			kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
 				GFP_KERNEL);
-		if (!writer_durations[i])
+		if (!writer_durations[i]) {
+			firsterr = -ENOMEM;
 			goto unwind;
+		}
 		firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
 						  writer_tasks[i]);
 		if (firsterr)
-- 
cgit v1.3-8-gc7d7


From ca5f2b4c4fb7bb7397317ee2ead83485aa295a3e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 14 Jun 2016 16:23:22 +0200
Subject: PM / sleep: Make pm_prepare_console() return void

Nothing is using its return value so change it to return void.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/suspend.h | 5 ++---
 kernel/power/console.c  | 8 ++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 8b6ec7ef0854..7693e39b14fe 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -18,12 +18,11 @@ static inline void pm_set_vt_switch(int do_switch)
 #endif
 
 #ifdef CONFIG_VT_CONSOLE_SLEEP
-extern int pm_prepare_console(void);
+extern void pm_prepare_console(void);
 extern void pm_restore_console(void);
 #else
-static inline int pm_prepare_console(void)
+static inline void pm_prepare_console(void)
 {
-	return 0;
 }
 
 static inline void pm_restore_console(void)
diff --git a/kernel/power/console.c b/kernel/power/console.c
index aba9c545a0e3..0e781798b0b3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -126,17 +126,17 @@ out:
 	return ret;
 }
 
-int pm_prepare_console(void)
+void pm_prepare_console(void)
 {
 	if (!pm_vt_switch())
-		return 0;
+		return;
 
 	orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
 	if (orig_fgconsole < 0)
-		return 1;
+		return;
 
 	orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
-	return 0;
+	return;
 }
 
 void pm_restore_console(void)
-- 
cgit v1.3-8-gc7d7


From 4929c913bda505dbe44bb42c00da06011fee6c9d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 2 May 2016 11:58:56 -0700
Subject: rcu: Make call_rcu_tasks() tolerate first call with irqs disabled

Currently, if the very first call to call_rcu_tasks() has irqs disabled,
it will create the rcu_tasks_kthread with irqs disabled, which will
result in a splat in the memory allocator, which kthread_run() invokes
with the expectation that irqs are enabled.

This commit fixes this problem by deferring kthread creation if called
with irqs disabled.  The first call to call_rcu_tasks() that has irqs
enabled will create the kthread.

This bug was detected by rcutorture changes that were motivated by
Iftekhar Ahmed's mutation-testing efforts.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 1 +
 kernel/rcu/update.c      | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a8af79738a0e..3bc5de08c0b7 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -45,6 +45,7 @@
 #include <linux/bug.h>
 #include <linux/compiler.h>
 #include <linux/ktime.h>
+#include <linux/irqflags.h>
 
 #include <asm/barrier.h>
 
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3e888cd5a594..f0d8322bc3ec 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
 module_param(rcu_task_stall_timeout, int, 0644);
 
 static void rcu_spawn_tasks_kthread(void);
+static struct task_struct *rcu_tasks_kthread_ptr;
 
 /*
  * Post an RCU-tasks callback.  First call must be from process context
@@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
 {
 	unsigned long flags;
 	bool needwake;
+	bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
 
 	rhp->next = NULL;
 	rhp->func = func;
@@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
 	*rcu_tasks_cbs_tail = rhp;
 	rcu_tasks_cbs_tail = &rhp->next;
 	raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
-	if (needwake) {
+	/* We can't create the thread unless interrupts are enabled. */
+	if ((needwake && havetask) ||
+	    (!havetask && !irqs_disabled_flags(flags))) {
 		rcu_spawn_tasks_kthread();
 		wake_up(&rcu_tasks_cbs_wq);
 	}
@@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 static void rcu_spawn_tasks_kthread(void)
 {
 	static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
-	static struct task_struct *rcu_tasks_kthread_ptr;
 	struct task_struct *t;
 
 	if (READ_ONCE(rcu_tasks_kthread_ptr)) {
-- 
cgit v1.3-8-gc7d7


From aab057382cb9b16249552684c1ebd270f070ec02 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 2 May 2016 12:20:51 -0700
Subject: rcu: Fix a typo in a comment

In the area in hot pursuit of a bug, so might as well clean it up.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 084a28a732eb..60bd533902f9 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1476,7 +1476,7 @@ static int rcu_torture_barrier_cbs(void *arg)
 			break;
 		/*
 		 * The above smp_load_acquire() ensures barrier_phase load
-		 * is ordered before the folloiwng ->call().
+		 * is ordered before the following ->call().
 		 */
 		local_irq_disable(); /* Just to test no-irq call_rcu(). */
 		cur_ops->call(&rcu, rcu_torture_barrier_cbf);
-- 
cgit v1.3-8-gc7d7


From 088e9d253d3a4ab7e058dd84bb532c32dadf1882 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Thu, 2 Jun 2016 13:51:41 -0300
Subject: rcu: sysctl: Panic on RCU Stall

It is not always easy to determine the cause of an RCU stall just by
analysing the RCU stall messages, mainly when the problem is caused
by the indirect starvation of rcu threads. For example, when preempt_rcu
is not awakened due to the starvation of a timer softirq.

We have been hard coding panic() in the RCU stall functions for
some time while testing the kernel-rt. But this is not possible in
some scenarios, like when supporting customers.

This patch implements the sysctl kernel.panic_on_rcu_stall. If
set to 1, the system will panic() when an RCU stall takes place,
enabling the capture of a vmcore. The vmcore provides a way to analyze
all kernel/tasks states, helping out to point to the culprit and the
solution for the stall.

The kernel.panic_on_rcu_stall sysctl is disabled by default.

Changes from v1:
- Fixed a typo in the git log
- The if(sysctl_panic_on_rcu_stall) panic() is in a static function
- Fixed the CONFIG_TINY_RCU compilation issue
- The var sysctl_panic_on_rcu_stall is now __read_mostly

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Tested-by: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/sysctl/kernel.txt | 12 ++++++++++++
 include/linux/kernel.h          |  1 +
 kernel/rcu/tree.c               | 12 ++++++++++++
 kernel/sysctl.c                 | 11 +++++++++++
 4 files changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a3683ce2a2f3..33204604de6c 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -58,6 +58,7 @@ show up in /proc/sys/kernel:
 - panic_on_stackoverflow
 - panic_on_unrecovered_nmi
 - panic_on_warn
+- panic_on_rcu_stall
 - perf_cpu_time_max_percent
 - perf_event_paranoid
 - perf_event_max_stack
@@ -618,6 +619,17 @@ a kernel rebuild when attempting to kdump at the location of a WARN().
 
 ==============================================================
 
+panic_on_rcu_stall:
+
+When set to 1, calls panic() after RCU stall detection messages. This
+is useful to define the root cause of RCU stalls using a vmcore.
+
+0: do not panic() when RCU stall takes place, default behavior.
+
+1: panic() after printing RCU stall messages.
+
+==============================================================
+
 perf_cpu_time_max_percent:
 
 Hints to the kernel how much CPU time it should be allowed to
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 94aa10ffe156..c42082112ec8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -451,6 +451,7 @@ extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
 extern int panic_on_warn;
+extern int sysctl_panic_on_rcu_stall;
 extern int sysctl_panic_on_stackoverflow;
 
 extern bool crash_kexec_post_notifiers;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c844b6142a86..e5ca15a461b9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -125,6 +125,8 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
 /* Number of rcu_nodes at specified level. */
 static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
+/* panic() on RCU Stall sysctl. */
+int sysctl_panic_on_rcu_stall __read_mostly;
 
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
@@ -1312,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
 	}
 }
 
+static inline void panic_on_rcu_stall(void)
+{
+	if (sysctl_panic_on_rcu_stall)
+		panic("RCU Stall\n");
+}
+
 static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 {
 	int cpu;
@@ -1391,6 +1399,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 
 	rcu_check_gp_kthread_starvation(rsp);
 
+	panic_on_rcu_stall();
+
 	force_quiescent_state(rsp);  /* Kick them all. */
 }
 
@@ -1431,6 +1441,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
 			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
+	panic_on_rcu_stall();
+
 	/*
 	 * Attempt to revive the RCU machinery by forcing a context switch.
 	 *
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87b2fc38398b..35f0dcb1cb4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1204,6 +1204,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 		.extra2		= &one,
 	},
+#endif
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+	{
+		.procname	= "panic_on_rcu_stall",
+		.data		= &sysctl_panic_on_rcu_stall,
+		.maxlen		= sizeof(sysctl_panic_on_rcu_stall),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 	{ }
 };
-- 
cgit v1.3-8-gc7d7


From bc75e99983df1efd977a5cd468893d55d52b8d70 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 3 Jun 2016 15:20:04 +0100
Subject: rcu: Correctly handle sparse possible cpus

In many cases in the RCU tree code, we iterate over the set of cpus for
a leaf node described by rcu_node::grplo and rcu_node::grphi, checking
per-cpu data for each cpu in this range. However, if the set of possible
cpus is sparse, some cpus described in this range are not possible, and
thus no per-cpu region will have been allocated (or initialised) for
them by the generic percpu code.

Erroneous accesses to a per-cpu area for these !possible cpus may fault
or may hit other data depending on the addressed generated when the
erroneous per cpu offset is applied. In practice, both cases have been
observed on arm64 hardware (the former being silent, but detectable with
additional patches).

To avoid issues resulting from this, we must iterate over the set of
*possible* cpus for a given leaf node. This patch add a new helper,
for_each_leaf_node_possible_cpu, to enable this. As iteration is often
intertwined with rcu_node local bitmask manipulation, a new
leaf_node_cpu_bit helper is added to make this simpler and more
consistent. The RCU tree code is made to use both of these where
appropriate.

Without this patch, running reboot at a shell can result in an oops
like:

[ 3369.075979] Unable to handle kernel paging request at virtual address ffffff8008b21b4c
[ 3369.083881] pgd = ffffffc3ecdda000
[ 3369.087270] [ffffff8008b21b4c] *pgd=00000083eca48003, *pud=00000083eca48003, *pmd=0000000000000000
[ 3369.096222] Internal error: Oops: 96000007 [#1] PREEMPT SMP
[ 3369.101781] Modules linked in:
[ 3369.104825] CPU: 2 PID: 1817 Comm: NetworkManager Tainted: G        W       4.6.0+ #3
[ 3369.121239] task: ffffffc0fa13e000 ti: ffffffc3eb940000 task.ti: ffffffc3eb940000
[ 3369.128708] PC is at sync_rcu_exp_select_cpus+0x188/0x510
[ 3369.134094] LR is at sync_rcu_exp_select_cpus+0x104/0x510
[ 3369.139479] pc : [<ffffff80081109a8>] lr : [<ffffff8008110924>] pstate: 200001c5
[ 3369.146860] sp : ffffffc3eb9435a0
[ 3369.150162] x29: ffffffc3eb9435a0 x28: ffffff8008be4f88
[ 3369.155465] x27: ffffff8008b66c80 x26: ffffffc3eceb2600
[ 3369.160767] x25: 0000000000000001 x24: ffffff8008be4f88
[ 3369.166070] x23: ffffff8008b51c3c x22: ffffff8008b66c80
[ 3369.171371] x21: 0000000000000001 x20: ffffff8008b21b40
[ 3369.176673] x19: ffffff8008b66c80 x18: 0000000000000000
[ 3369.181975] x17: 0000007fa951a010 x16: ffffff80086a30f0
[ 3369.187278] x15: 0000007fa9505590 x14: 0000000000000000
[ 3369.192580] x13: ffffff8008b51000 x12: ffffffc3eb940000
[ 3369.197882] x11: 0000000000000006 x10: ffffff8008b51b78
[ 3369.203184] x9 : 0000000000000001 x8 : ffffff8008be4000
[ 3369.208486] x7 : ffffff8008b21b40 x6 : 0000000000001003
[ 3369.213788] x5 : 0000000000000000 x4 : ffffff8008b27280
[ 3369.219090] x3 : ffffff8008b21b4c x2 : 0000000000000001
[ 3369.224406] x1 : 0000000000000001 x0 : 0000000000000140
...
[ 3369.972257] [<ffffff80081109a8>] sync_rcu_exp_select_cpus+0x188/0x510
[ 3369.978685] [<ffffff80081128b4>] synchronize_rcu_expedited+0x64/0xa8
[ 3369.985026] [<ffffff80086b987c>] synchronize_net+0x24/0x30
[ 3369.990499] [<ffffff80086ddb54>] dev_deactivate_many+0x28c/0x298
[ 3369.996493] [<ffffff80086b6bb8>] __dev_close_many+0x60/0xd0
[ 3370.002052] [<ffffff80086b6d48>] __dev_close+0x28/0x40
[ 3370.007178] [<ffffff80086bf62c>] __dev_change_flags+0x8c/0x158
[ 3370.012999] [<ffffff80086bf718>] dev_change_flags+0x20/0x60
[ 3370.018558] [<ffffff80086cf7f0>] do_setlink+0x288/0x918
[ 3370.023771] [<ffffff80086d0798>] rtnl_newlink+0x398/0x6a8
[ 3370.029158] [<ffffff80086cee84>] rtnetlink_rcv_msg+0xe4/0x220
[ 3370.034891] [<ffffff80086e274c>] netlink_rcv_skb+0xc4/0xf8
[ 3370.040364] [<ffffff80086ced8c>] rtnetlink_rcv+0x2c/0x40
[ 3370.045663] [<ffffff80086e1fe8>] netlink_unicast+0x160/0x238
[ 3370.051309] [<ffffff80086e24b8>] netlink_sendmsg+0x2f0/0x358
[ 3370.056956] [<ffffff80086a0070>] sock_sendmsg+0x18/0x30
[ 3370.062168] [<ffffff80086a21cc>] ___sys_sendmsg+0x26c/0x280
[ 3370.067728] [<ffffff80086a30ac>] __sys_sendmsg+0x44/0x88
[ 3370.073027] [<ffffff80086a3100>] SyS_sendmsg+0x10/0x20
[ 3370.078153] [<ffffff8008085e70>] el0_svc_naked+0x24/0x28

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reported-by: Dennis Chen <dennis.chen@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 21 +++++++++------------
 kernel/rcu/tree.h        | 15 +++++++++++++++
 kernel/rcu/tree_exp.h    | 16 +++++++---------
 kernel/rcu/tree_plugin.h |  5 +++--
 4 files changed, 34 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e5ca15a461b9..f433959e9322 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1287,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
 	rcu_for_each_leaf_node(rsp, rnp) {
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if (rnp->qsmask != 0) {
-			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
-				if (rnp->qsmask & (1UL << cpu))
-					dump_cpu_task(rnp->grplo + cpu);
+			for_each_leaf_node_possible_cpu(rnp, cpu)
+				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+					dump_cpu_task(cpu);
 		}
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
@@ -1360,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		ndetected += rcu_print_task_stall(rnp);
 		if (rnp->qsmask != 0) {
-			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
-				if (rnp->qsmask & (1UL << cpu)) {
-					print_cpu_stall_info(rsp,
-							     rnp->grplo + cpu);
+			for_each_leaf_node_possible_cpu(rnp, cpu)
+				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+					print_cpu_stall_info(rsp, cpu);
 					ndetected++;
 				}
 		}
@@ -2884,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
 				  unsigned long *maxj),
 			 bool *isidle, unsigned long *maxj)
 {
-	unsigned long bit;
 	int cpu;
 	unsigned long flags;
 	unsigned long mask;
@@ -2919,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp,
 				continue;
 			}
 		}
-		cpu = rnp->grplo;
-		bit = 1;
-		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+		for_each_leaf_node_possible_cpu(rnp, cpu) {
+			unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
 			if ((rnp->qsmask & bit) != 0) {
 				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
 					mask |= bit;
@@ -3750,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
 	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
 	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e3959f5e6ddf..f714f873bf9d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -253,6 +253,13 @@ struct rcu_node {
 	wait_queue_head_t exp_wq[4];
 } ____cacheline_internodealigned_in_smp;
 
+/*
+ * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
+ * are indexed relative to this interval rather than the global CPU ID space.
+ * This generates the bit for a CPU in node-local masks.
+ */
+#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
+
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
  * specified rcu_state structure.
@@ -280,6 +287,14 @@ struct rcu_node {
 	for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
 	     (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
 
+/*
+ * Iterate over all possible CPUs in a leaf RCU node.
+ */
+#define for_each_leaf_node_possible_cpu(rnp, cpu) \
+	for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
+	     cpu <= rnp->grphi; \
+	     cpu = cpumask_next((cpu), cpu_possible_mask))
+
 /*
  * Union to allow "aggregate OR" operation on the need for a quiescent
  * state by the normal and expedited grace periods.
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 00a02a231ada..d400434af6b2 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -344,7 +344,6 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 {
 	int cpu;
 	unsigned long flags;
-	unsigned long mask;
 	unsigned long mask_ofl_test;
 	unsigned long mask_ofl_ipi;
 	int ret;
@@ -356,7 +355,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 
 		/* Each pass checks a CPU for identity, offline, and idle. */
 		mask_ofl_test = 0;
-		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+		for_each_leaf_node_possible_cpu(rnp, cpu) {
 			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
@@ -376,8 +375,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
 		/* IPI the remaining CPUs for expedited quiescent state. */
-		mask = 1;
-		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+		for_each_leaf_node_possible_cpu(rnp, cpu) {
+			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 			if (!(mask_ofl_ipi & mask))
 				continue;
 retry_ipi:
@@ -440,10 +439,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 		ndetected = 0;
 		rcu_for_each_leaf_node(rsp, rnp) {
 			ndetected += rcu_print_task_exp_stall(rnp);
-			mask = 1;
-			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+			for_each_leaf_node_possible_cpu(rnp, cpu) {
 				struct rcu_data *rdp;
 
+				mask = leaf_node_cpu_bit(rnp, cpu);
 				if (!(rnp->expmask & mask))
 					continue;
 				ndetected++;
@@ -453,7 +452,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 					"o."[!!(rdp->grpmask & rnp->expmaskinit)],
 					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
 			}
-			mask <<= 1;
 		}
 		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
 			jiffies - jiffies_start, rsp->expedited_sequence,
@@ -473,8 +471,8 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 			pr_cont("\n");
 		}
 		rcu_for_each_leaf_node(rsp, rnp) {
-			mask = 1;
-			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+			for_each_leaf_node_possible_cpu(rnp, cpu) {
+				mask = leaf_node_cpu_bit(rnp, cpu);
 				if (!(rnp->expmask & mask))
 					continue;
 				dump_cpu_task(cpu);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 695071dd1e9c..534c590e8852 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1166,8 +1166,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 		return;
 	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
 		return;
-	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
-		if ((mask & 0x1) && cpu != outgoingcpu)
+	for_each_leaf_node_possible_cpu(rnp, cpu)
+		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
+		    cpu != outgoingcpu)
 			cpumask_set_cpu(cpu, cm);
 	if (cpumask_weight(cm) == 0)
 		cpumask_setall(cm);
-- 
cgit v1.3-8-gc7d7


From 61d1b6a42fec61c5065f54cc62cef02b483c69fb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Jun 2016 22:47:12 +0200
Subject: bpf, maps: add release callback

Add a release callback for maps that is invoked when the last
reference to its struct file is gone and the struct file about
to be released by vfs. The handler will be used by fd array maps.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h  | 3 ++-
 kernel/bpf/syscall.c | 7 ++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1bcae82c6cb1..29b5a1ae22cb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -19,7 +19,8 @@ struct bpf_map;
 struct bpf_map_ops {
 	/* funcs callable from userspace (via syscall) */
 	struct bpf_map *(*map_alloc)(union bpf_attr *attr);
-	void (*map_free)(struct bpf_map *);
+	void (*map_release)(struct bpf_map *map, struct file *map_file);
+	void (*map_free)(struct bpf_map *map);
 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
 
 	/* funcs callable from userspace and from eBPF programs */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..fc3adcd064b1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
 
 static int bpf_map_release(struct inode *inode, struct file *filp)
 {
-	bpf_map_put_with_uref(filp->private_data);
+	struct bpf_map *map = filp->private_data;
+
+	if (map->ops->map_release)
+		map->ops->map_release(map, filp);
+
+	bpf_map_put_with_uref(map);
 	return 0;
 }
 
-- 
cgit v1.3-8-gc7d7


From d056a788765e67773124f520159185bc89f5d1ad Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Jun 2016 22:47:13 +0200
Subject: bpf, maps: extend map_fd_get_ptr arguments

This patch extends map_fd_get_ptr() callback that is used by fd array
maps, so that struct file pointer from the related map can be passed
in. It's safe to remove map_update_elem() callback for the two maps since
this is only allowed from syscall side, but not from eBPF programs for these
two map types. Like in per-cpu map case, bpf_fd_array_map_update_elem()
needs to be called directly here due to the extra argument.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   | 12 +++++++++---
 kernel/bpf/arraymap.c | 16 +++++++++-------
 kernel/bpf/syscall.c  |  6 ++++++
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 29b5a1ae22cb..d7b43e73fe87 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -29,8 +29,9 @@ struct bpf_map_ops {
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
 
 	/* funcs called by prog_array and perf_event_array map */
-	void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
-	void (*map_fd_put_ptr) (void *ptr);
+	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
+				int fd);
+	void (*map_fd_put_ptr)(void *ptr);
 };
 
 struct bpf_map {
@@ -169,7 +170,7 @@ struct bpf_array {
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
-void bpf_fd_array_map_clear(struct bpf_map *map);
+
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
@@ -207,8 +208,13 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 			   u64 flags);
 int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 			    u64 flags);
+
 int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
 
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+				 void *key, void *value, u64 map_flags);
+void bpf_fd_array_map_clear(struct bpf_map *map);
+
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
  * Best-effort only.  No barriers here, since it _will_ race with concurrent
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..bfedcbdb4d84 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
-				    void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+				 void *key, void *value, u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *new_ptr, *old_ptr;
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
 		return -E2BIG;
 
 	ufd = *(u32 *)value;
-	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);
 
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	}
 }
 
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+				   struct file *map_file, int fd)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct bpf_prog *prog = bpf_prog_get(fd);
+
 	if (IS_ERR(prog))
 		return prog;
 
@@ -382,6 +384,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
 		bpf_prog_put(prog);
 		return ERR_PTR(-EINVAL);
 	}
+
 	return prog;
 }
 
@@ -407,7 +410,6 @@ static const struct bpf_map_ops prog_array_ops = {
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
-	.map_update_elem = fd_array_map_update_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -431,7 +433,8 @@ static void perf_event_array_map_free(struct bpf_map *map)
 	fd_array_map_free(map);
 }
 
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+					 struct file *map_file, int fd)
 {
 	struct perf_event *event;
 	const struct perf_event_attr *attr;
@@ -474,7 +477,6 @@ static const struct bpf_map_ops perf_event_array_ops = {
 	.map_free = perf_event_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
-	.map_update_elem = fd_array_map_update_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fc3adcd064b1..c23a4e9311b3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -392,6 +392,12 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+		rcu_read_lock();
+		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+						   attr->flags);
+		rcu_read_unlock();
 	} else {
 		rcu_read_lock();
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
-- 
cgit v1.3-8-gc7d7


From 3b1efb196eee45b2f0c4994e0c43edb5e367f620 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Jun 2016 22:47:14 +0200
Subject: bpf, maps: flush own entries on perf map release

The behavior of perf event arrays are quite different from all
others as they are tightly coupled to perf event fds, f.e. shown
recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array
to use struct file") to make refcounting on perf event more robust.
A remaining issue that the current code still has is that since
additions to the perf event array take a reference on the struct
file via perf_event_get() and are only released via fput() (that
cleans up the perf event eventually via perf_event_release_kernel())
when the element is either manually removed from the map from user
space or automatically when the last reference on the perf event
map is dropped. However, this leads us to dangling struct file's
when the map gets pinned after the application owning the perf
event descriptor exits, and since the struct file reference will
in such case only be manually dropped or via pinned file removal,
it leads to the perf event living longer than necessary, consuming
needlessly resources for that time.

Relations between perf event fds and bpf perf event map fds can be
rather complex. F.e. maps can act as demuxers among different perf
event fds that can possibly be owned by different threads and based
on the index selection from the program, events get dispatched to
one of the per-cpu fd endpoints. One perf event fd (or, rather a
per-cpu set of them) can also live in multiple perf event maps at
the same time, listening for events. Also, another requirement is
that perf event fds can get closed from application side after they
have been attached to the perf event map, so that on exit perf event
map will take care of dropping their references eventually. Likewise,
when such maps are pinned, the intended behavior is that a user
application does bpf_obj_get(), puts its fds in there and on exit
when fd is released, they are dropped from the map again, so the map
acts rather as connector endpoint. This also makes perf event maps
inherently different from program arrays as described in more detail
in commit c9da161c6517 ("bpf: fix clearing on persistent program
array maps").

To tackle this, map entries are marked by the map struct file that
added the element to the map. And when the last reference to that map
struct file is released from user space, then the tracked entries
are purged from the map. This is okay, because new map struct files
instances resp. frontends to the anon inode are provided via
bpf_map_new_fd() that is called when we invoke bpf_obj_get_user()
for retrieving a pinned map, but also when an initial instance is
created via map_create(). The rest is resolved by the vfs layer
automatically for us by keeping reference count on the map's struct
file. Any concurrent updates on the map slot are fine as well, it
just means that perf_event_fd_array_release() needs to delete less
of its own entires.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   9 +++++
 kernel/bpf/arraymap.c    | 102 ++++++++++++++++++++++++++++++++++-------------
 kernel/trace/bpf_trace.c |  18 ++++-----
 3 files changed, 91 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d7b43e73fe87..9adfef694a25 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -13,6 +13,7 @@
 #include <linux/percpu.h>
 #include <linux/err.h>
 
+struct perf_event;
 struct bpf_map;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
@@ -166,8 +167,16 @@ struct bpf_array {
 		void __percpu *pptrs[0] __aligned(8);
 	};
 };
+
 #define MAX_TAIL_CALL_CNT 32
 
+struct bpf_event_entry {
+	struct perf_event *event;
+	struct file *perf_file;
+	struct file *map_file;
+	struct rcu_head rcu;
+};
+
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index bfedcbdb4d84..5af30732697b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -427,59 +427,105 @@ static int __init register_prog_array_map(void)
 }
 late_initcall(register_prog_array_map);
 
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+						   struct file *map_file)
 {
-	bpf_fd_array_map_clear(map);
-	fd_array_map_free(map);
+	struct bpf_event_entry *ee;
+
+	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+	if (ee) {
+		ee->event = perf_file->private_data;
+		ee->perf_file = perf_file;
+		ee->map_file = map_file;
+	}
+
+	return ee;
+}
+
+static void __bpf_event_entry_free(struct rcu_head *rcu)
+{
+	struct bpf_event_entry *ee;
+
+	ee = container_of(rcu, struct bpf_event_entry, rcu);
+	fput(ee->perf_file);
+	kfree(ee);
+}
+
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+	call_rcu(&ee->rcu, __bpf_event_entry_free);
 }
 
 static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 					 struct file *map_file, int fd)
 {
-	struct perf_event *event;
 	const struct perf_event_attr *attr;
-	struct file *file;
+	struct bpf_event_entry *ee;
+	struct perf_event *event;
+	struct file *perf_file;
 
-	file = perf_event_get(fd);
-	if (IS_ERR(file))
-		return file;
+	perf_file = perf_event_get(fd);
+	if (IS_ERR(perf_file))
+		return perf_file;
 
-	event = file->private_data;
+	event = perf_file->private_data;
+	ee = ERR_PTR(-EINVAL);
 
 	attr = perf_event_attrs(event);
-	if (IS_ERR(attr))
-		goto err;
-
-	if (attr->inherit)
-		goto err;
-
-	if (attr->type == PERF_TYPE_RAW)
-		return file;
-
-	if (attr->type == PERF_TYPE_HARDWARE)
-		return file;
+	if (IS_ERR(attr) || attr->inherit)
+		goto err_out;
+
+	switch (attr->type) {
+	case PERF_TYPE_SOFTWARE:
+		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+			goto err_out;
+		/* fall-through */
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+		ee = bpf_event_entry_gen(perf_file, map_file);
+		if (ee)
+			return ee;
+		ee = ERR_PTR(-ENOMEM);
+		/* fall-through */
+	default:
+		break;
+	}
 
-	if (attr->type == PERF_TYPE_SOFTWARE &&
-	    attr->config == PERF_COUNT_SW_BPF_OUTPUT)
-		return file;
-err:
-	fput(file);
-	return ERR_PTR(-EINVAL);
+err_out:
+	fput(perf_file);
+	return ee;
 }
 
 static void perf_event_fd_array_put_ptr(void *ptr)
 {
-	fput((struct file *)ptr);
+	bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+					struct file *map_file)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_event_entry *ee;
+	int i;
+
+	rcu_read_lock();
+	for (i = 0; i < array->map.max_entries; i++) {
+		ee = READ_ONCE(array->ptrs[i]);
+		if (ee && ee->map_file == map_file)
+			fd_array_map_delete_elem(map, &i);
+	}
+	rcu_read_unlock();
 }
 
 static const struct bpf_map_ops perf_event_array_ops = {
 	.map_alloc = fd_array_map_alloc,
-	.map_free = perf_event_array_map_free,
+	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+	.map_release = perf_event_fd_array_release,
 };
 
 static struct bpf_map_type_list perf_event_array_type __read_mostly = {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 720b7bb01d43..037ea6ea3cb2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -192,18 +192,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 {
 	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_event_entry *ee;
 	struct perf_event *event;
-	struct file *file;
 
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-	file = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!file))
+	ee = READ_ONCE(array->ptrs[index]);
+	if (unlikely(!ee))
 		return -ENOENT;
 
-	event = file->private_data;
-
+	event = ee->event;
 	/* make sure event is local and doesn't have pmu::count */
 	if (event->oncpu != smp_processor_id() ||
 	    event->pmu->count)
@@ -233,8 +232,8 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	u64 index = flags & BPF_F_INDEX_MASK;
 	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
+	struct bpf_event_entry *ee;
 	struct perf_event *event;
-	struct file *file;
 	struct perf_raw_record raw = {
 		.size = size,
 		.data = data,
@@ -247,12 +246,11 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-	file = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!file))
+	ee = READ_ONCE(array->ptrs[index]);
+	if (unlikely(!ee))
 		return -ENOENT;
 
-	event = file->private_data;
-
+	event = ee->event;
 	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
 		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
 		return -EINVAL;
-- 
cgit v1.3-8-gc7d7


From e37837fb62f95a81bdcefa86ceea043df84937d7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 18 Apr 2016 01:01:27 +0200
Subject: locking/atomic: Remove the deprecated atomic_{set,clear}_mask()
 functions

These functions have been deprecated for a while and there is only the
one user left, convert and kill.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/atomic.h              | 10 ----------
 kernel/locking/qspinlock_paravirt.h |  4 ++--
 2 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 0b3802d33125..12d910d61b83 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -480,16 +480,6 @@ static inline int atomic_fetch_andnot_release(int i, atomic_t *v)
 }
 #endif
 
-static inline __deprecated void atomic_clear_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_andnot(mask, v);
-}
-
-static inline __deprecated void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	atomic_or(mask, v);
-}
-
 /**
  * atomic_inc_not_zero_hint - increment if not null
  * @v: pointer of type atomic_t
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 21ede57f68b3..37649e69056c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 #else /* _Q_PENDING_BITS == 8 */
 static __always_inline void set_pending(struct qspinlock *lock)
 {
-	atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+	atomic_or(_Q_PENDING_VAL, &lock->val);
 }
 
 static __always_inline void clear_pending(struct qspinlock *lock)
 {
-	atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+	atomic_andnot(_Q_PENDING_VAL, &lock->val);
 }
 
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
-- 
cgit v1.3-8-gc7d7


From f9852b74bec0117b888da39d070c323ea1cb7f4c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 18 Apr 2016 01:27:03 +0200
Subject: locking/atomic, arch/qrwlock: Employ atomic_fetch_add_acquire()

The only reason for the current code is to make GCC emit only the
"LOCK XADD" instruction on x86 (and not do a pointless extra ADD on
the result), do so nicer.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Waiman Long <waiman.long@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qrwlock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fec082338668..19248ddf37ce 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 	 * that accesses can't leak upwards out of our subsequent critical
 	 * section in the case that the lock is currently held for write.
 	 */
-	cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+	cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
 	rspin_until_writer_unlock(lock, cnts);
 
 	/*
-- 
cgit v1.3-8-gc7d7


From 86a3b5f34fc1fb307abef4fde76bebd3edce0324 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 18 May 2016 12:42:21 +0200
Subject: locking/atomic, arch/rwsem: Employ atomic_long_fetch_add()

Now that we have fetch_add() we can stop using add_return() - val.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jason Low <jason.low2@hpe.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <waiman.long@hpe.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2031281bb940..447e08de1fab 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -153,7 +153,7 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
 	if (wake_type != RWSEM_WAKE_READ_OWNED) {
 		adjustment = RWSEM_ACTIVE_READ_BIAS;
  try_reader_grant:
-		oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment;
+		oldcount = atomic_long_fetch_add(adjustment, &sem->count);
 
 		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
 			/*
-- 
cgit v1.3-8-gc7d7


From 66b12abc846d31e75fa5f2f31db1396ddfa8ee4a Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Jun 2016 17:08:19 -0400
Subject: audit: fix some horrible switch statement style crimes

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditfilter.c | 8 ++++++--
 kernel/auditsc.c     | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8a8aa3fbc8d8..ff59a5eed691 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1343,8 +1343,12 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type,
 			return result;
 	}
 	switch (rule->action) {
-	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	case AUDIT_NEVER:
+		*state = AUDIT_DISABLED;
+		break;
+	case AUDIT_ALWAYS:
+		*state = AUDIT_RECORD_CONTEXT;
+		break;
 	}
 	return 1;
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d0e3cf8abe1..ec4c552876a7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -695,8 +695,12 @@ static int audit_filter_rules(struct task_struct *tsk,
 		ctx->prio = rule->prio;
 	}
 	switch (rule->action) {
-	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	case AUDIT_NEVER:
+		*state = AUDIT_DISABLED;
+		break;
+	case AUDIT_ALWAYS:
+		*state = AUDIT_RECORD_CONTEXT;
+		break;
 	}
 	return 1;
 }
-- 
cgit v1.3-8-gc7d7


From 8c8a5502183c724854afd2f143a72f7eb71b6fea Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 17 Jun 2016 12:23:59 -0400
Subject: cgroup: fix idr leak for the first cgroup root

The valid cgroup hierarchy ID range includes 0, so we can't filter for
positive numbers when freeing it, or it'll leak the first ID. No big
deal, just disruptive when reading the code.

The ID is freed during error handling and when the reference count
hits zero, so the double-free test is not necessary; remove it.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6e8932..36fc0ff506c3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1158,18 +1158,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 {
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (root->hierarchy_id) {
-		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
-		root->hierarchy_id = 0;
-	}
+	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 }
 
 static void cgroup_free_root(struct cgroup_root *root)
 {
 	if (root) {
-		/* hierarchy ID should already have been released */
-		WARN_ON_ONCE(root->hierarchy_id);
-
 		idr_destroy(&root->cgroup_idr);
 		kfree(root);
 	}
-- 
cgit v1.3-8-gc7d7


From d6ccc55e66ccdbc8ad0eeda14419f8eaccbc246b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 17 Jun 2016 12:24:27 -0400
Subject: cgroup: remove unnecessary 0 check from css_from_id()

css_idr allocation starts at 1, so index 0 will never point to an
item. css_from_id() currently filters that before asking idr_find(),
but idr_find() would also just return NULL, so this is not needed.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 36fc0ff506c3..78f6d18ff0af 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6162,7 +6162,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
+	return idr_find(&ss->css_idr, id);
 }
 
 /**
-- 
cgit v1.3-8-gc7d7


From 59a37f8baeb2c9d97f316584c90892d18bf846d4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Jun 2016 16:58:26 +0200
Subject: blktrace: avoid using timespec

The blktrace code stores the current time in a 32-bit word in its
user interface. This is a bad idea because 32-bit seconds overflow
at some point.

We probably have until 2106 before this one overflows, as it seems
to use an 'unsigned' variable, but we should confirm that user
space treats it the same way.

Aside from this, we want to stop using 'struct timespec' here,
so I'm adding a comment about the overflow and change the code
to use timespec64 instead to make the loss of range more obvious.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 03b0dd98ff0e..bedb84d168d1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
 
 static void trace_note_time(struct blk_trace *bt)
 {
-	struct timespec now;
+	struct timespec64 now;
 	unsigned long flags;
 	u32 words[2];
 
-	getnstimeofday(&now);
-	words[0] = now.tv_sec;
+	/* need to check user space to see if this breaks in y2038 or y2106 */
+	ktime_get_real_ts64(&now);
+	words[0] = (u32)now.tv_sec;
 	words[1] = now.tv_nsec;
 
 	local_irq_save(flags);
-- 
cgit v1.3-8-gc7d7


From edd14cfebc4404698544d407ecf8eda6e19aa19e Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Fri, 17 Jun 2016 16:00:20 -0600
Subject: genirq: Add untracked irq handler

This adds a software irq handler for controllers that multiplex
interrupts from multiple devices, but don't know which device generated
the interrupt. For these devices, the irq handler that demuxes must
check every action for every software irq using the same h/w irq in order
to find out which device generated the interrupt. This will inevitably
trigger spurious interrupt detection if we are noting the irq.

The new irq handler does not track the handling for spurious interrupt
detection. An irq that uses this also won't get stats tracked since it
didn't generate the interrupt, nor added to randomness since they are
not random.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Cc: Jon Derrick <jonathan.derrick@intel.com>
Link: http://lkml.kernel.org/r/1466200821-29159-1-git-send-email-keith.busch@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h    |  1 +
 kernel/irq/chip.c      | 43 +++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/handle.c    | 18 ++++++++++++++----
 kernel/irq/internals.h |  1 +
 4 files changed, 59 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 6c92a847394d..562cef010aa8 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -484,6 +484,7 @@ extern void handle_fasteoi_irq(struct irq_desc *desc);
 extern void handle_edge_irq(struct irq_desc *desc);
 extern void handle_edge_eoi_irq(struct irq_desc *desc);
 extern void handle_simple_irq(struct irq_desc *desc);
+extern void handle_untracked_irq(struct irq_desc *desc);
 extern void handle_percpu_irq(struct irq_desc *desc);
 extern void handle_percpu_devid_irq(struct irq_desc *desc);
 extern void handle_bad_irq(struct irq_desc *desc);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ad8131473774..b4c1bc7c9ca2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -426,6 +426,49 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(handle_simple_irq);
 
+/**
+ *	handle_untracked_irq - Simple and software-decoded IRQs.
+ *	@desc:	the interrupt description structure for this irq
+ *
+ *	Untracked interrupts are sent from a demultiplexing interrupt
+ *	handler when the demultiplexer does not know which device it its
+ *	multiplexed irq domain generated the interrupt. IRQ's handled
+ *	through here are not subjected to stats tracking, randomness, or
+ *	spurious interrupt detection.
+ *
+ *	Note: Like handle_simple_irq, the caller is expected to handle
+ *	the ack, clear, mask and unmask issues if necessary.
+ */
+void handle_untracked_irq(struct irq_desc *desc)
+{
+	unsigned int flags = 0;
+
+	raw_spin_lock(&desc->lock);
+
+	if (!irq_may_run(desc))
+		goto out_unlock;
+
+	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+		desc->istate |= IRQS_PENDING;
+		goto out_unlock;
+	}
+
+	desc->istate &= ~IRQS_PENDING;
+	irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+	raw_spin_unlock(&desc->lock);
+
+	__handle_irq_event_percpu(desc, &flags);
+
+	raw_spin_lock(&desc->lock);
+	irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+
+out_unlock:
+	raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_untracked_irq);
+
 /*
  * Called unconditionally from handle_level_irq() and only for oneshot
  * interrupts from handle_fasteoi_irq()
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a15b5485b446..d3f24905852c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
 	wake_up_process(action->thread);
 }
 
-irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
 {
 	irqreturn_t retval = IRQ_NONE;
-	unsigned int flags = 0, irq = desc->irq_data.irq;
+	unsigned int irq = desc->irq_data.irq;
 	struct irqaction *action;
 
 	for_each_action_of_desc(desc, action) {
@@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
 
 			/* Fall through to add to randomness */
 		case IRQ_HANDLED:
-			flags |= action->flags;
+			*flags |= action->flags;
 			break;
 
 		default:
@@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
 		retval |= res;
 	}
 
-	add_interrupt_randomness(irq, flags);
+	return retval;
+}
+
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+{
+	irqreturn_t retval;
+	unsigned int flags = 0;
+
+	retval = __handle_irq_event_percpu(desc, &flags);
+
+	add_interrupt_randomness(desc->irq_data.irq, flags);
 
 	if (!noirqdebug)
 		note_interrupt(desc, retval);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index d5edcdc9382a..0c6f35ba9cc0 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -84,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq);
 
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
 irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
 irqreturn_t handle_irq_event(struct irq_desc *desc);
 
-- 
cgit v1.3-8-gc7d7


From 4e267db135c44d0b18e553899fe7df32b89211a5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Apr 2016 07:38:13 -0400
Subject: tracing: Make the pid filtering helper functions global

Make the functions used for pid filtering global for tracing, such that the
function tracer can use the pid code as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        |  9 +++++++++
 kernel/trace/trace_events.c | 34 +++++++++++++++++-----------------
 2 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5167c366d6b7..172330891c6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -628,6 +628,15 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 extern unsigned long tracing_thresh;
 
+/* PID filtering */
+bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
+			     pid_t search_pid);
+bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+			    struct task_struct *task);
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+				  struct task_struct *self,
+				  struct task_struct *task);
+
 #ifdef CONFIG_TRACER_MAX_TRACE
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3d4155892a1e..b5e514c4dada 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -503,8 +503,8 @@ static void ftrace_clear_events(struct trace_array *tr)
 extern int pid_max;
 
 /* Returns true if found in filter */
-static bool
-find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 {
 	/*
 	 * If pid_max changed after filtered_pids was created, we
@@ -516,8 +516,8 @@ find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 	return test_bit(search_pid, filtered_pids->pids);
 }
 
-static bool
-ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
 	/*
 	 * Return false, because if filtered_pids does not exist,
@@ -526,19 +526,19 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 	if (!filtered_pids)
 		return false;
 
-	return !find_filtered_pid(filtered_pids, task->pid);
+	return !trace_find_filtered_pid(filtered_pids, task->pid);
 }
 
-static void filter_add_remove_task(struct trace_pid_list *pid_list,
-				   struct task_struct *self,
-				   struct task_struct *task)
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+				  struct task_struct *self,
+				  struct task_struct *task)
 {
 	if (!pid_list)
 		return;
 
 	/* For forks, we only add if the forking task is listed */
 	if (self) {
-		if (!find_filtered_pid(pid_list, self->pid))
+		if (!trace_find_filtered_pid(pid_list, self->pid))
 			return;
 	}
 
@@ -560,7 +560,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
 	struct trace_array *tr = data;
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
-	filter_add_remove_task(pid_list, NULL, task);
+	trace_filter_add_remove_task(pid_list, NULL, task);
 }
 
 static void
@@ -572,7 +572,7 @@ event_filter_pid_sched_process_fork(void *data,
 	struct trace_array *tr = data;
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
-	filter_add_remove_task(pid_list, self, task);
+	trace_filter_add_remove_task(pid_list, self, task);
 }
 
 void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -600,8 +600,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, prev) &&
-		       ignore_this_task(pid_list, next));
+		       trace_ignore_this_task(pid_list, prev) &&
+		       trace_ignore_this_task(pid_list, next));
 }
 
 static void
@@ -614,7 +614,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, next));
+		       trace_ignore_this_task(pid_list, next));
 }
 
 static void
@@ -630,7 +630,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, task));
+		       trace_ignore_this_task(pid_list, task));
 }
 
 static void
@@ -647,7 +647,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 
 	/* Set tracing if current is enabled */
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, current));
+		       trace_ignore_this_task(pid_list, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -1654,7 +1654,7 @@ static void ignore_task_cpu(void *data)
 					     mutex_is_locked(&event_mutex));
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, current));
+		       trace_ignore_this_task(pid_list, current));
 }
 
 static ssize_t
-- 
cgit v1.3-8-gc7d7


From d8275c454dcdba296675221b4c12f19d1b6e0ee8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Apr 2016 12:15:22 -0400
Subject: tracing: Move filtered_pid helper functions into trace.c

As the filtered_pid functions are going to be used by function tracer as
well as trace_events, move the code into the generic trace.c file.

The functions moved are:

 trace_find_filtered_pid()
 trace_ignore_this_task()
 trace_filter_add_remove_task()

Kernel Doc text was also added.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 78 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c | 51 -----------------------------
 2 files changed, 78 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4bd6b68a0b..0b87fe8e6d0b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -319,6 +319,84 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 	return 0;
 }
 
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (search_pid >= filtered_pids->pid_max)
+		return false;
+
+	return test_bit(search_pid, filtered_pids->pids);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+	/*
+	 * Return false, because if filtered_pids does not exist,
+	 * all pids are good to trace.
+	 */
+	if (!filtered_pids)
+		return false;
+
+	return !trace_find_filtered_pid(filtered_pids, task->pid);
+}
+
+/**
+ * trace_pid_filter_add_remove - Add or remove a task from a pid_list
+ * @pid_list: The list to modify
+ * @self: The current task for fork or NULL for exit
+ * @task: The task to add or remove
+ *
+ * If adding a task, if @self is defined, the task is only added if @self
+ * is also included in @pid_list. This happens on fork and tasks should
+ * only be added when the parent is listed. If @self is NULL, then the
+ * @task pid will be removed from the list, which would happen on exit
+ * of a task.
+ */
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+				  struct task_struct *self,
+				  struct task_struct *task)
+{
+	if (!pid_list)
+		return;
+
+	/* For forks, we only add if the forking task is listed */
+	if (self) {
+		if (!trace_find_filtered_pid(pid_list, self->pid))
+			return;
+	}
+
+	/* Sorry, but we don't support pid_max changing after setting */
+	if (task->pid >= pid_list->pid_max)
+		return;
+
+	/* "self" is set for forks, and NULL for exits */
+	if (self)
+		set_bit(task->pid, pid_list->pids);
+	else
+		clear_bit(task->pid, pid_list->pids);
+}
+
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
 	u64 ts;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b5e514c4dada..a11e6d9a3841 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -502,57 +502,6 @@ static void ftrace_clear_events(struct trace_array *tr)
 /* Shouldn't this be in a header? */
 extern int pid_max;
 
-/* Returns true if found in filter */
-bool
-trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
-{
-	/*
-	 * If pid_max changed after filtered_pids was created, we
-	 * by default ignore all pids greater than the previous pid_max.
-	 */
-	if (search_pid >= filtered_pids->pid_max)
-		return false;
-
-	return test_bit(search_pid, filtered_pids->pids);
-}
-
-bool
-trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
-{
-	/*
-	 * Return false, because if filtered_pids does not exist,
-	 * all pids are good to trace.
-	 */
-	if (!filtered_pids)
-		return false;
-
-	return !trace_find_filtered_pid(filtered_pids, task->pid);
-}
-
-void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
-				  struct task_struct *self,
-				  struct task_struct *task)
-{
-	if (!pid_list)
-		return;
-
-	/* For forks, we only add if the forking task is listed */
-	if (self) {
-		if (!trace_find_filtered_pid(pid_list, self->pid))
-			return;
-	}
-
-	/* Sorry, but we don't support pid_max changing after setting */
-	if (task->pid >= pid_list->pid_max)
-		return;
-
-	/* "self" is set for forks, and NULL for exits */
-	if (self)
-		set_bit(task->pid, pid_list->pids);
-	else
-		clear_bit(task->pid, pid_list->pids);
-}
-
 static void
 event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
 {
-- 
cgit v1.3-8-gc7d7


From 5cc8976bd52153678ca37cc1e3000833b20276f3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 20 Apr 2016 15:19:54 -0400
Subject: tracing: Move the pid_list seq_file functions to be global

To allow other aspects of ftrace to use the pid_list logic, we need to reuse
the seq_file functions. Making the generic part into functions that can be
called by other files will help in this regard.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 71 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h        |  3 ++
 kernel/trace/trace_events.c | 34 ++--------------------
 3 files changed, 77 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0b87fe8e6d0b..7943e306cc7f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -397,6 +397,77 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
 		clear_bit(task->pid, pid_list->pids);
 }
 
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+	unsigned long pid = (unsigned long)v;
+
+	(*pos)++;
+
+	/* pid already is +1 of the actual prevous bit */
+	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+	/* Return pid + 1 to allow zero to be represented */
+	if (pid < pid_list->pid_max)
+		return (void *)(pid + 1);
+
+	return NULL;
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+	unsigned long pid;
+	loff_t l = 0;
+
+	pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+	if (pid >= pid_list->pid_max)
+		return NULL;
+
+	/* Return pid + 1 so that zero can be the exit value */
+	for (pid++; pid && l < *pos;
+	     pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+		;
+	return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */
+int trace_pid_show(struct seq_file *m, void *v)
+{
+	unsigned long pid = (unsigned long)v - 1;
+
+	seq_printf(m, "%lu\n", pid);
+	return 0;
+}
+
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
 	u64 ts;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 172330891c6d..45442d5842f2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -636,6 +636,9 @@ bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
 void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
 				  struct task_struct *self,
 				  struct task_struct *task);
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
+int trace_pid_show(struct seq_file *m, void *v);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a11e6d9a3841..fd831a972bae 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -983,18 +983,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct trace_array *tr = m->private;
 	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
-	unsigned long pid = (unsigned long)v;
 
-	(*pos)++;
-
-	/* pid already is +1 of the actual prevous bit */
-	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
-
-	/* Return pid + 1 to allow zero to be represented */
-	if (pid < pid_list->pid_max)
-		return (void *)(pid + 1);
-
-	return NULL;
+	return trace_pid_next(pid_list, v, pos);
 }
 
 static void *p_start(struct seq_file *m, loff_t *pos)
@@ -1002,8 +992,6 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 {
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = m->private;
-	unsigned long pid;
-	loff_t l = 0;
 
 	/*
 	 * Grab the mutex, to keep calls to p_next() having the same
@@ -1019,15 +1007,7 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 	if (!pid_list)
 		return NULL;
 
-	pid = find_first_bit(pid_list->pids, pid_list->pid_max);
-	if (pid >= pid_list->pid_max)
-		return NULL;
-
-	/* Return pid + 1 so that zero can be the exit value */
-	for (pid++; pid && l < *pos;
-	     pid = (unsigned long)p_next(m, (void *)pid, &l))
-		;
-	return (void *)pid;
+	return trace_pid_start(pid_list, pos);
 }
 
 static void p_stop(struct seq_file *m, void *p)
@@ -1037,14 +1017,6 @@ static void p_stop(struct seq_file *m, void *p)
 	mutex_unlock(&event_mutex);
 }
 
-static int p_show(struct seq_file *m, void *v)
-{
-	unsigned long pid = (unsigned long)v - 1;
-
-	seq_printf(m, "%lu\n", pid);
-	return 0;
-}
-
 static ssize_t
 event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
@@ -1795,7 +1767,7 @@ static const struct seq_operations show_set_event_seq_ops = {
 static const struct seq_operations show_set_pid_seq_ops = {
 	.start = p_start,
 	.next = p_next,
-	.show = p_show,
+	.show = trace_pid_show,
 	.stop = p_stop,
 };
 
-- 
cgit v1.3-8-gc7d7


From 76c813e26606d35ea9d8d6f96e646b3944c730a9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 21 Apr 2016 11:35:30 -0400
Subject: tracing: Move pid_list write processing into its own function

The addition of PIDs into a pid_list via the write operation of
set_event_pid is a bit complex. The same operation will be needed for
function tracing pids. Move the code into its own generic function in
trace.c, so that we can avoid duplication of this code.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 109 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h        |   7 +++
 kernel/trace/trace_events.c | 110 ++++----------------------------------------
 3 files changed, 124 insertions(+), 102 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7943e306cc7f..a8bb7485fd1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -25,7 +25,7 @@
 #include <linux/hardirq.h>
 #include <linux/linkage.h>
 #include <linux/uaccess.h>
-#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
@@ -319,6 +319,12 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 	return 0;
 }
 
+void trace_free_pid_list(struct trace_pid_list *pid_list)
+{
+	vfree(pid_list->pids);
+	kfree(pid_list);
+}
+
 /**
  * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
  * @filtered_pids: The list of pids to check
@@ -468,6 +474,107 @@ int trace_pid_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+/* 128 should be much more than enough */
+#define PID_BUF_SIZE		127
+
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+		    struct trace_pid_list **new_pid_list,
+		    const char __user *ubuf, size_t cnt)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_parser parser;
+	unsigned long val;
+	int nr_pids = 0;
+	ssize_t read = 0;
+	ssize_t ret = 0;
+	loff_t pos;
+	pid_t pid;
+
+	if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
+		return -ENOMEM;
+
+	/*
+	 * Always recreate a new array. The write is an all or nothing
+	 * operation. Always create a new array when adding new pids by
+	 * the user. If the operation fails, then the current list is
+	 * not modified.
+	 */
+	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+	if (!pid_list)
+		return -ENOMEM;
+
+	pid_list->pid_max = READ_ONCE(pid_max);
+
+	/* Only truncating will shrink pid_max */
+	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+		pid_list->pid_max = filtered_pids->pid_max;
+
+	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+	if (!pid_list->pids) {
+		kfree(pid_list);
+		return -ENOMEM;
+	}
+
+	if (filtered_pids) {
+		/* copy the current bits to the new max */
+		pid = find_first_bit(filtered_pids->pids,
+				     filtered_pids->pid_max);
+		while (pid < filtered_pids->pid_max) {
+			set_bit(pid, pid_list->pids);
+			pid = find_next_bit(filtered_pids->pids,
+					    filtered_pids->pid_max,
+					    pid + 1);
+			nr_pids++;
+		}
+	}
+
+	while (cnt > 0) {
+
+		pos = 0;
+
+		ret = trace_get_user(&parser, ubuf, cnt, &pos);
+		if (ret < 0 || !trace_parser_loaded(&parser))
+			break;
+
+		read += ret;
+		ubuf += ret;
+		cnt -= ret;
+
+		parser.buffer[parser.idx] = 0;
+
+		ret = -EINVAL;
+		if (kstrtoul(parser.buffer, 0, &val))
+			break;
+		if (val >= pid_list->pid_max)
+			break;
+
+		pid = (pid_t)val;
+
+		set_bit(pid, pid_list->pids);
+		nr_pids++;
+
+		trace_parser_clear(&parser);
+		ret = 0;
+	}
+	trace_parser_put(&parser);
+
+	if (ret < 0) {
+		trace_free_pid_list(pid_list);
+		return ret;
+	}
+
+	if (!nr_pids) {
+		/* Cleared the list of pids */
+		trace_free_pid_list(pid_list);
+		read = ret;
+		pid_list = NULL;
+	}
+
+	*new_pid_list = pid_list;
+
+	return read;
+}
+
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
 	u64 ts;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 45442d5842f2..a4dce1ef9e03 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -629,6 +629,9 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 extern unsigned long tracing_thresh;
 
 /* PID filtering */
+
+extern int pid_max;
+
 bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
 			     pid_t search_pid);
 bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
@@ -639,6 +642,10 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
 void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
 void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
 int trace_pid_show(struct seq_file *m, void *v);
+void trace_free_pid_list(struct trace_pid_list *pid_list);
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+		    struct trace_pid_list **new_pid_list,
+		    const char __user *ubuf, size_t cnt);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fd831a972bae..fd449eb138cf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,6 @@
 #include <linux/kthread.h>
 #include <linux/tracefs.h>
 #include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/sort.h>
@@ -499,9 +498,6 @@ static void ftrace_clear_events(struct trace_array *tr)
 	mutex_unlock(&event_mutex);
 }
 
-/* Shouldn't this be in a header? */
-extern int pid_max;
-
 static void
 event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
 {
@@ -634,8 +630,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
 	/* Wait till all users are no longer using pid filtering */
 	synchronize_sched();
 
-	vfree(pid_list->pids);
-	kfree(pid_list);
+	trace_free_pid_list(pid_list);
 }
 
 static void ftrace_clear_event_pids(struct trace_array *tr)
@@ -1587,13 +1582,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	struct trace_pid_list *filtered_pids = NULL;
 	struct trace_pid_list *pid_list;
 	struct trace_event_file *file;
-	struct trace_parser parser;
-	unsigned long val;
-	loff_t this_pos;
-	ssize_t read = 0;
-	ssize_t ret = 0;
-	pid_t pid;
-	int nr_pids = 0;
+	ssize_t ret;
 
 	if (!cnt)
 		return 0;
@@ -1602,93 +1591,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	if (ret < 0)
 		return ret;
 
-	if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
-		return -ENOMEM;
-
 	mutex_lock(&event_mutex);
+
 	filtered_pids = rcu_dereference_protected(tr->filtered_pids,
 					     lockdep_is_held(&event_mutex));
 
-	/*
-	 * Always recreate a new array. The write is an all or nothing
-	 * operation. Always create a new array when adding new pids by
-	 * the user. If the operation fails, then the current list is
-	 * not modified.
-	 */
-	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
-	if (!pid_list) {
-		read = -ENOMEM;
-		goto out;
-	}
-	pid_list->pid_max = READ_ONCE(pid_max);
-	/* Only truncating will shrink pid_max */
-	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
-		pid_list->pid_max = filtered_pids->pid_max;
-	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
-	if (!pid_list->pids) {
-		kfree(pid_list);
-		read = -ENOMEM;
-		goto out;
-	}
-	if (filtered_pids) {
-		/* copy the current bits to the new max */
-		pid = find_first_bit(filtered_pids->pids,
-				     filtered_pids->pid_max);
-		while (pid < filtered_pids->pid_max) {
-			set_bit(pid, pid_list->pids);
-			pid = find_next_bit(filtered_pids->pids,
-					    filtered_pids->pid_max,
-					    pid + 1);
-			nr_pids++;
-		}
-	}
-
-	while (cnt > 0) {
-
-		this_pos = 0;
-
-		ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
-		if (ret < 0 || !trace_parser_loaded(&parser))
-			break;
-
-		read += ret;
-		ubuf += ret;
-		cnt -= ret;
-
-		parser.buffer[parser.idx] = 0;
-
-		ret = -EINVAL;
-		if (kstrtoul(parser.buffer, 0, &val))
-			break;
-		if (val >= pid_list->pid_max)
-			break;
-
-		pid = (pid_t)val;
-
-		set_bit(pid, pid_list->pids);
-		nr_pids++;
-
-		trace_parser_clear(&parser);
-		ret = 0;
-	}
-	trace_parser_put(&parser);
-
-	if (ret < 0) {
-		vfree(pid_list->pids);
-		kfree(pid_list);
-		read = ret;
+	ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+	if (ret < 0)
 		goto out;
-	}
 
-	if (!nr_pids) {
-		/* Cleared the list of pids */
-		vfree(pid_list->pids);
-		kfree(pid_list);
-		read = ret;
-		if (!filtered_pids)
-			goto out;
-		pid_list = NULL;
-	}
 	rcu_assign_pointer(tr->filtered_pids, pid_list);
 
 	list_for_each_entry(file, &tr->events, list) {
@@ -1697,10 +1608,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 
 	if (filtered_pids) {
 		synchronize_sched();
-
-		vfree(filtered_pids->pids);
-		kfree(filtered_pids);
-	} else {
+		trace_free_pid_list(filtered_pids);
+	} else if (pid_list) {
 		/*
 		 * Register a probe that is called before all other probes
 		 * to set ignore_pid if next or prev do not match.
@@ -1738,9 +1647,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
  out:
 	mutex_unlock(&event_mutex);
 
-	ret = read;
-	if (read > 0)
-		*ppos += read;
+	if (ret > 0)
+		*ppos += ret;
 
 	return ret;
 }
-- 
cgit v1.3-8-gc7d7


From 345ddcc882d8896dcbdcb3e0ee4a415fc23ec8b0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 22 Apr 2016 18:11:33 -0400
Subject: ftrace: Have set_ftrace_pid use the bitmap like events do

Convert set_ftrace_pid to use the bitmap like set_event_pid does. This
allows for instances to use the pid filtering as well, and will allow for
function-fork option to set if the children of a traced function should be
traced or not.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c                | 313 +++++++++++++++--------------------
 kernel/trace/trace.c                 |   1 +
 kernel/trace/trace.h                 |  15 +-
 kernel/trace/trace_functions.c       |   2 +-
 kernel/trace/trace_functions_graph.c |   2 +-
 5 files changed, 148 insertions(+), 185 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900dbb1efff2..8b488f4dd8e8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
 /* What to set function_trace_op to */
 static struct ftrace_ops *set_function_trace_op;
 
-/* List for set_ftrace_pid's pids. */
-LIST_HEAD(ftrace_pids);
-struct ftrace_pid {
-	struct list_head list;
-	struct pid *pid;
-};
-
-static bool ftrace_pids_enabled(void)
+static bool ftrace_pids_enabled(struct ftrace_ops *ops)
 {
-	return !list_empty(&ftrace_pids);
+	struct trace_array *tr;
+
+	if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private)
+		return false;
+
+	tr = ops->private;
+
+	return tr->function_pids != NULL;
 }
 
 static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)
 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
 			    struct ftrace_ops *op, struct pt_regs *regs)
 {
-	if (!test_tsk_trace_trace(current))
+	struct trace_array *tr = op->private;
+
+	if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
 		return;
 
 	op->saved_func(ip, parent_ip, op, regs);
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	/* Always save the function, and reset at unregistering */
 	ops->saved_func = ops->func;
 
-	if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+	if (ftrace_pids_enabled(ops))
 		ops->func = ftrace_pid_func;
 
 	ftrace_update_trampoline(ops);
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 
 static void ftrace_update_pid_func(void)
 {
-	bool enabled = ftrace_pids_enabled();
 	struct ftrace_ops *op;
 
 	/* Only do something if we are tracing something */
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)
 
 	do_for_each_ftrace_op(op, ftrace_ops_list) {
 		if (op->flags & FTRACE_OPS_FL_PID) {
-			op->func = enabled ? ftrace_pid_func :
-				op->saved_func;
+			op->func = ftrace_pids_enabled(op) ?
+				ftrace_pid_func : op->saved_func;
 			ftrace_update_trampoline(op);
 		}
 	} while_for_each_ftrace_op(op);
@@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 	return ops->func;
 }
 
-static void clear_ftrace_swapper(void)
+static void
+ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
+		    struct task_struct *prev, struct task_struct *next)
 {
-	struct task_struct *p;
-	int cpu;
+	struct trace_array *tr = data;
+	struct trace_pid_list *pid_list;
 
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		p = idle_task(cpu);
-		clear_tsk_trace_trace(p);
-	}
-	put_online_cpus();
-}
-
-static void set_ftrace_swapper(void)
-{
-	struct task_struct *p;
-	int cpu;
+	pid_list = rcu_dereference_sched(tr->function_pids);
 
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		p = idle_task(cpu);
-		set_tsk_trace_trace(p);
-	}
-	put_online_cpus();
+	this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+		       trace_ignore_this_task(pid_list, next));
 }
 
-static void clear_ftrace_pid(struct pid *pid)
+static void clear_ftrace_pids(struct trace_array *tr)
 {
-	struct task_struct *p;
+	struct trace_pid_list *pid_list;
+	int cpu;
 
-	rcu_read_lock();
-	do_each_pid_task(pid, PIDTYPE_PID, p) {
-		clear_tsk_trace_trace(p);
-	} while_each_pid_task(pid, PIDTYPE_PID, p);
-	rcu_read_unlock();
+	pid_list = rcu_dereference_protected(tr->function_pids,
+					     lockdep_is_held(&ftrace_lock));
+	if (!pid_list)
+		return;
 
-	put_pid(pid);
-}
+	unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
 
-static void set_ftrace_pid(struct pid *pid)
-{
-	struct task_struct *p;
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
 
-	rcu_read_lock();
-	do_each_pid_task(pid, PIDTYPE_PID, p) {
-		set_tsk_trace_trace(p);
-	} while_each_pid_task(pid, PIDTYPE_PID, p);
-	rcu_read_unlock();
-}
+	rcu_assign_pointer(tr->function_pids, NULL);
 
-static void clear_ftrace_pid_task(struct pid *pid)
-{
-	if (pid == ftrace_swapper_pid)
-		clear_ftrace_swapper();
-	else
-		clear_ftrace_pid(pid);
-}
+	/* Wait till all users are no longer using pid filtering */
+	synchronize_sched();
 
-static void set_ftrace_pid_task(struct pid *pid)
-{
-	if (pid == ftrace_swapper_pid)
-		set_ftrace_swapper();
-	else
-		set_ftrace_pid(pid);
+	trace_free_pid_list(pid_list);
 }
 
-static int ftrace_pid_add(int p)
+static void ftrace_pid_reset(struct trace_array *tr)
 {
-	struct pid *pid;
-	struct ftrace_pid *fpid;
-	int ret = -EINVAL;
-
 	mutex_lock(&ftrace_lock);
-
-	if (!p)
-		pid = ftrace_swapper_pid;
-	else
-		pid = find_get_pid(p);
-
-	if (!pid)
-		goto out;
-
-	ret = 0;
-
-	list_for_each_entry(fpid, &ftrace_pids, list)
-		if (fpid->pid == pid)
-			goto out_put;
-
-	ret = -ENOMEM;
-
-	fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
-	if (!fpid)
-		goto out_put;
-
-	list_add(&fpid->list, &ftrace_pids);
-	fpid->pid = pid;
-
-	set_ftrace_pid_task(pid);
+	clear_ftrace_pids(tr);
 
 	ftrace_update_pid_func();
-
 	ftrace_startup_all(0);
 
 	mutex_unlock(&ftrace_lock);
-	return 0;
-
-out_put:
-	if (pid != ftrace_swapper_pid)
-		put_pid(pid);
-
-out:
-	mutex_unlock(&ftrace_lock);
-	return ret;
 }
 
-static void ftrace_pid_reset(void)
-{
-	struct ftrace_pid *fpid, *safe;
-
-	mutex_lock(&ftrace_lock);
-	list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
-		struct pid *pid = fpid->pid;
-
-		clear_ftrace_pid_task(pid);
-
-		list_del(&fpid->list);
-		kfree(fpid);
-	}
-
-	ftrace_update_pid_func();
-	ftrace_startup_all(0);
-
-	mutex_unlock(&ftrace_lock);
-}
+/* Greater than any max PID */
+#define FTRACE_NO_PIDS		(void *)(PID_MAX_LIMIT + 1)
 
 static void *fpid_start(struct seq_file *m, loff_t *pos)
+	__acquires(RCU)
 {
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = m->private;
+
 	mutex_lock(&ftrace_lock);
+	rcu_read_lock_sched();
 
-	if (!ftrace_pids_enabled() && (!*pos))
-		return (void *) 1;
+	pid_list = rcu_dereference_sched(tr->function_pids);
 
-	return seq_list_start(&ftrace_pids, *pos);
+	if (!pid_list)
+		return !(*pos) ? FTRACE_NO_PIDS : NULL;
+
+	return trace_pid_start(pid_list, pos);
 }
 
 static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	if (v == (void *)1)
+	struct trace_array *tr = m->private;
+	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
+
+	if (v == FTRACE_NO_PIDS)
 		return NULL;
 
-	return seq_list_next(v, &ftrace_pids, pos);
+	return trace_pid_next(pid_list, v, pos);
 }
 
 static void fpid_stop(struct seq_file *m, void *p)
+	__releases(RCU)
 {
+	rcu_read_unlock_sched();
 	mutex_unlock(&ftrace_lock);
 }
 
 static int fpid_show(struct seq_file *m, void *v)
 {
-	const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
-
-	if (v == (void *)1) {
+	if (v == FTRACE_NO_PIDS) {
 		seq_puts(m, "no pid\n");
 		return 0;
 	}
 
-	if (fpid->pid == ftrace_swapper_pid)
-		seq_puts(m, "swapper tasks\n");
-	else
-		seq_printf(m, "%u\n", pid_vnr(fpid->pid));
-
-	return 0;
+	return trace_pid_show(m, v);
 }
 
 static const struct seq_operations ftrace_pid_sops = {
@@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = {
 static int
 ftrace_pid_open(struct inode *inode, struct file *file)
 {
+	struct trace_array *tr = inode->i_private;
+	struct seq_file *m;
 	int ret = 0;
 
+	if (trace_array_get(tr) < 0)
+		return -ENODEV;
+
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC))
-		ftrace_pid_reset();
+		ftrace_pid_reset(tr);
 
-	if (file->f_mode & FMODE_READ)
-		ret = seq_open(file, &ftrace_pid_sops);
+	ret = seq_open(file, &ftrace_pid_sops);
+	if (ret < 0) {
+		trace_array_put(tr);
+	} else {
+		m = file->private_data;
+		/* copy tr over to seq ops */
+		m->private = tr;
+	}
 
 	return ret;
 }
 
+static void ignore_task_cpu(void *data)
+{
+	struct trace_array *tr = data;
+	struct trace_pid_list *pid_list;
+
+	/*
+	 * This function is called by on_each_cpu() while the
+	 * event_mutex is held.
+	 */
+	pid_list = rcu_dereference_protected(tr->function_pids,
+					     mutex_is_locked(&ftrace_lock));
+
+	this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+		       trace_ignore_this_task(pid_list, current));
+}
+
 static ssize_t
 ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
-	char buf[64], *tmp;
-	long val;
-	int ret;
+	struct seq_file *m = filp->private_data;
+	struct trace_array *tr = m->private;
+	struct trace_pid_list *filtered_pids = NULL;
+	struct trace_pid_list *pid_list;
+	ssize_t ret;
 
-	if (cnt >= sizeof(buf))
-		return -EINVAL;
+	if (!cnt)
+		return 0;
+
+	mutex_lock(&ftrace_lock);
+
+	filtered_pids = rcu_dereference_protected(tr->function_pids,
+					     lockdep_is_held(&ftrace_lock));
+
+	ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+	if (ret < 0)
+		goto out;
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
+	rcu_assign_pointer(tr->function_pids, pid_list);
 
-	buf[cnt] = 0;
+	if (filtered_pids) {
+		synchronize_sched();
+		trace_free_pid_list(filtered_pids);
+	} else if (pid_list) {
+		/* Register a probe to set whether to ignore the tracing of a task */
+		register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+	}
 
 	/*
-	 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
-	 * to clean the filter quietly.
+	 * Ignoring of pids is done at task switch. But we have to
+	 * check for those tasks that are currently running.
+	 * Always do this in case a pid was appended or removed.
 	 */
-	tmp = strstrip(buf);
-	if (strlen(tmp) == 0)
-		return 1;
+	on_each_cpu(ignore_task_cpu, tr, 1);
 
-	ret = kstrtol(tmp, 10, &val);
-	if (ret < 0)
-		return ret;
+	ftrace_update_pid_func();
+	ftrace_startup_all(0);
+ out:
+	mutex_unlock(&ftrace_lock);
 
-	ret = ftrace_pid_add(val);
+	if (ret > 0)
+		*ppos += ret;
 
-	return ret ? ret : cnt;
+	return ret;
 }
 
 static int
 ftrace_pid_release(struct inode *inode, struct file *file)
 {
-	if (file->f_mode & FMODE_READ)
-		seq_release(inode, file);
+	struct trace_array *tr = inode->i_private;
 
-	return 0;
+	trace_array_put(tr);
+
+	return seq_release(inode, file);
 }
 
 static const struct file_operations ftrace_pid_fops = {
@@ -5571,24 +5537,17 @@ static const struct file_operations ftrace_pid_fops = {
 	.release	= ftrace_pid_release,
 };
 
-static __init int ftrace_init_tracefs(void)
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
-	struct dentry *d_tracer;
-
-	d_tracer = tracing_init_dentry();
-	if (IS_ERR(d_tracer))
-		return 0;
-
-	ftrace_init_dyn_tracefs(d_tracer);
+	/* Only the top level directory has the dyn_tracefs and profile */
+	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
+		ftrace_init_dyn_tracefs(d_tracer);
+		ftrace_profile_tracefs(d_tracer);
+	}
 
 	trace_create_file("set_ftrace_pid", 0644, d_tracer,
-			    NULL, &ftrace_pid_fops);
-
-	ftrace_profile_tracefs(d_tracer);
-
-	return 0;
+			    tr, &ftrace_pid_fops);
 }
-fs_initcall(ftrace_init_tracefs);
 
 /**
  * ftrace_kill - kill ftrace
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a8bb7485fd1d..aa240551fc5d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7233,6 +7233,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	for_each_tracing_cpu(cpu)
 		tracing_init_tracefs_percpu(tr, cpu);
 
+	ftrace_init_tracefs(tr, d_tracer);
 }
 
 static struct vfsmount *trace_automount(void *ingore)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a4dce1ef9e03..eaee458755a4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -156,6 +156,9 @@ struct trace_array_cpu {
 	char			comm[TASK_COMM_LEN];
 
 	bool			ignore_pid;
+#ifdef CONFIG_FUNCTION_TRACER
+	bool			ftrace_ignore_pid;
+#endif
 };
 
 struct tracer;
@@ -247,6 +250,7 @@ struct trace_array {
 	int			ref;
 #ifdef CONFIG_FUNCTION_TRACER
 	struct ftrace_ops	*ops;
+	struct trace_pid_list	__rcu *function_pids;
 	/* function tracing enabled */
 	int			function_enabled;
 #endif
@@ -840,12 +844,9 @@ extern struct list_head ftrace_pids;
 
 #ifdef CONFIG_FUNCTION_TRACER
 extern bool ftrace_filter_param __initdata;
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
 {
-	if (list_empty(&ftrace_pids))
-		return 1;
-
-	return test_tsk_trace_trace(task);
+	return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
 }
 extern int ftrace_is_dead(void);
 int ftrace_create_function_files(struct trace_array *tr,
@@ -855,8 +856,9 @@ void ftrace_init_global_array_ops(struct trace_array *tr);
 void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
 void ftrace_reset_array_ops(struct trace_array *tr);
 int using_ftrace_ops_list_func(void);
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
 #else
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
 {
 	return 1;
 }
@@ -871,6 +873,7 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
 static inline __init void
 ftrace_init_global_array_ops(struct trace_array *tr) { }
 static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5a095c2e4b69..0efa00d80623 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
 
 	/* Currently only the non stack verision is supported */
 	ops->func = function_trace_call;
-	ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
+	ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
 
 	tr->ops = ops;
 	ops->private = tr;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3a0244ff7ea8..67cce7896aeb 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	int cpu;
 	int pc;
 
-	if (!ftrace_trace_task(current))
+	if (!ftrace_trace_task(tr))
 		return 0;
 
 	/* trace it when it is-nested-in or is a function enabled. */
-- 
cgit v1.3-8-gc7d7


From 35abb67de744b5dbaec54381f2f9e0246089331d Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 8 Jun 2016 18:38:02 -0700
Subject: tracing: expose current->comm to [ku]probe events

ftrace is very quick to give up on saving the task command line (see
`trace_save_cmdline()`). The workaround for events which really care
about the command line is to explicitly assign it as part of the entry.
However, this doesn't work for kprobe events, as there's no
straightforward way to get access to current->comm. Add a kprobe/uprobe
event variable $comm which provides exactly that.

Link: http://lkml.kernel.org/r/f59b472033b943a370f5f48d0af37698f409108f.1465435894.git.osandov@fb.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/trace/kprobetrace.txt  |  3 +++
 Documentation/trace/uprobetracer.txt |  3 +++
 kernel/trace/trace_kprobe.c          |  1 +
 kernel/trace/trace_probe.c           | 33 +++++++++++++++++++++++++++++++++
 kernel/trace/trace_probe.h           | 10 ++++++++++
 5 files changed, 50 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index d68ea5fc812b..ea52ec1f8484 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -40,6 +40,7 @@ Synopsis of kprobe_events
   $stackN	: Fetch Nth entry of stack (N >= 0)
   $stack	: Fetch stack address.
   $retval	: Fetch return value.(*)
+  $comm		: Fetch current task comm.
   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
@@ -62,6 +63,8 @@ offset, and container-size (usually 32). The syntax is;
 
  b<bit-width>@<bit-offset>/<container-size>
 
+For $comm, the default type is "string"; any other type is invalid.
+
 
 Per-Probe Event Filtering
 -------------------------
diff --git a/Documentation/trace/uprobetracer.txt b/Documentation/trace/uprobetracer.txt
index f1cf9a34ad9d..72d1cd4f7bf3 100644
--- a/Documentation/trace/uprobetracer.txt
+++ b/Documentation/trace/uprobetracer.txt
@@ -36,6 +36,7 @@ Synopsis of uprobe_tracer
    $stackN	: Fetch Nth entry of stack (N >= 0)
    $stack	: Fetch stack address.
    $retval	: Fetch return value.(*)
+   $comm	: Fetch current task comm.
    +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
    NAME=FETCHARG     : Set NAME as the argument name of FETCHARG.
    FETCHARG:TYPE     : Set TYPE as the type of FETCHARG. Currently, basic types
@@ -57,6 +58,8 @@ offset, and container-size (usually 32). The syntax is;
 
  b<bit-width>@<bit-offset>/<container-size>
 
+For $comm, the default type is "string"; any other type is invalid.
+
 
 Event Profiling
 ---------------
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5546eec0505f..9aedb0b06683 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv)
 	 *  $retval	: fetch return value
 	 *  $stack	: fetch stack address
 	 *  $stackN	: fetch Nth of stack (N:0-)
+	 *  $comm       : fetch current task comm
 	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
 	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
 	 *  %REG	: fetch register REG
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1d372fa6fefb..74e80a582c28 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
 	kfree(data);
 }
 
+void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
+					  void *data, void *dest)
+{
+	int maxlen = get_rloc_len(*(u32 *)dest);
+	u8 *dst = get_rloc_data(dest);
+	long ret;
+
+	if (!maxlen)
+		return;
+
+	ret = strlcpy(dst, current->comm, maxlen);
+	*(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
+
+void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
+					       void *data, void *dest)
+{
+	*(u32 *)dest = strlen(current->comm) + 1;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
+
 static const struct fetch_type *find_fetch_type(const char *type,
 						const struct fetch_type *ftbl)
 {
@@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
 			}
 		} else
 			ret = -EINVAL;
+	} else if (strcmp(arg, "comm") == 0) {
+		if (strcmp(t->name, "string") != 0 &&
+		    strcmp(t->name, "string_size") != 0)
+			return -EINVAL;
+		f->fn = t->fetch[FETCH_MTD_comm];
 	} else
 		ret = -EINVAL;
 
@@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
 		arg[t - parg->comm] = '\0';
 		t++;
 	}
+	/*
+	 * The default type of $comm should be "string", and it can't be
+	 * dereferenced.
+	 */
+	if (!t && strcmp(arg, "$comm") == 0)
+		t = "string";
 	parg->type = find_fetch_type(t, ftbl);
 	if (!parg->type) {
 		pr_info("Unsupported type: %s\n", t);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f6398db09114..45400ca5ded1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -102,6 +102,7 @@ enum {
 	FETCH_MTD_reg = 0,
 	FETCH_MTD_stack,
 	FETCH_MTD_retval,
+	FETCH_MTD_comm,
 	FETCH_MTD_memory,
 	FETCH_MTD_symbol,
 	FETCH_MTD_deref,
@@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);
 #define fetch_bitfield_string			NULL
 #define fetch_bitfield_string_size		NULL
 
+/* comm only makes sense as a string */
+#define fetch_comm_u8		NULL
+#define fetch_comm_u16		NULL
+#define fetch_comm_u32		NULL
+#define fetch_comm_u64		NULL
+DECLARE_FETCH_FUNC(comm, string);
+DECLARE_FETCH_FUNC(comm, string_size);
+
 /*
  * Define macro for basic types - we don't need to define s* types, because
  * we have to care only about bitwidth at recording time.
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64)
 ASSIGN_FETCH_FUNC(reg, ftype),				\
 ASSIGN_FETCH_FUNC(stack, ftype),			\
 ASSIGN_FETCH_FUNC(retval, ftype),			\
+ASSIGN_FETCH_FUNC(comm, ftype),				\
 ASSIGN_FETCH_FUNC(memory, ftype),			\
 ASSIGN_FETCH_FUNC(symbol, ftype),			\
 ASSIGN_FETCH_FUNC(deref, ftype),			\
-- 
cgit v1.3-8-gc7d7


From e2ace001176dc9745a472fe8bda1f0b28a4d7351 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 26 May 2016 12:00:33 -0700
Subject: tracing: Choose static tp_printk buffer by explicit nesting count

Currently, the trace_printk code chooses which static buffer to use based
on what type of atomic context (NMI, IRQ, etc) it's in.  Simplify the
code and make it more robust: simply count the nesting depth and choose
a buffer based on the current nesting depth.

The new code will only drop an event if we nest more than 4 deep,
and the old code was guaranteed to malfunction if that happened.

Link: http://lkml.kernel.org/r/07ab03aecfba25fcce8f9a211b14c9c5e2865c58.1464289095.git.luto@kernel.org

Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 83 +++++++++++++++-------------------------------------
 1 file changed, 24 insertions(+), 59 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aa240551fc5d..45e6747589c6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2339,83 +2339,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
 
 /* created for use with alloc_percpu */
 struct trace_buffer_struct {
-	char buffer[TRACE_BUF_SIZE];
+	int nesting;
+	char buffer[4][TRACE_BUF_SIZE];
 };
 
 static struct trace_buffer_struct *trace_percpu_buffer;
-static struct trace_buffer_struct *trace_percpu_sirq_buffer;
-static struct trace_buffer_struct *trace_percpu_irq_buffer;
-static struct trace_buffer_struct *trace_percpu_nmi_buffer;
 
 /*
- * The buffer used is dependent on the context. There is a per cpu
- * buffer for normal context, softirq contex, hard irq context and
- * for NMI context. Thise allows for lockless recording.
- *
- * Note, if the buffers failed to be allocated, then this returns NULL
+ * Thise allows for lockless recording.  If we're nested too deeply, then
+ * this returns NULL.
  */
 static char *get_trace_buf(void)
 {
-	struct trace_buffer_struct *percpu_buffer;
-
-	/*
-	 * If we have allocated per cpu buffers, then we do not
-	 * need to do any locking.
-	 */
-	if (in_nmi())
-		percpu_buffer = trace_percpu_nmi_buffer;
-	else if (in_irq())
-		percpu_buffer = trace_percpu_irq_buffer;
-	else if (in_softirq())
-		percpu_buffer = trace_percpu_sirq_buffer;
-	else
-		percpu_buffer = trace_percpu_buffer;
+	struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
 
-	if (!percpu_buffer)
+	if (!buffer || buffer->nesting >= 4)
 		return NULL;
 
-	return this_cpu_ptr(&percpu_buffer->buffer[0]);
+	return &buffer->buffer[buffer->nesting++][0];
+}
+
+static void put_trace_buf(void)
+{
+	this_cpu_dec(trace_percpu_buffer->nesting);
 }
 
 static int alloc_percpu_trace_buffer(void)
 {
 	struct trace_buffer_struct *buffers;
-	struct trace_buffer_struct *sirq_buffers;
-	struct trace_buffer_struct *irq_buffers;
-	struct trace_buffer_struct *nmi_buffers;
 
 	buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!buffers)
-		goto err_warn;
-
-	sirq_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!sirq_buffers)
-		goto err_sirq;
-
-	irq_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!irq_buffers)
-		goto err_irq;
-
-	nmi_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!nmi_buffers)
-		goto err_nmi;
+	if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+		return -ENOMEM;
 
 	trace_percpu_buffer = buffers;
-	trace_percpu_sirq_buffer = sirq_buffers;
-	trace_percpu_irq_buffer = irq_buffers;
-	trace_percpu_nmi_buffer = nmi_buffers;
-
 	return 0;
-
- err_nmi:
-	free_percpu(irq_buffers);
- err_irq:
-	free_percpu(sirq_buffers);
- err_sirq:
-	free_percpu(buffers);
- err_warn:
-	WARN(1, "Could not allocate percpu trace_printk buffer");
-	return -ENOMEM;
 }
 
 static int buffers_allocated;
@@ -2506,7 +2464,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	tbuffer = get_trace_buf();
 	if (!tbuffer) {
 		len = 0;
-		goto out;
+		goto out_nobuffer;
 	}
 
 	len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2532,6 +2490,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	}
 
 out:
+	put_trace_buf();
+
+out_nobuffer:
 	preempt_enable_notrace();
 	unpause_graph_tracing();
 
@@ -2563,7 +2524,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 	tbuffer = get_trace_buf();
 	if (!tbuffer) {
 		len = 0;
-		goto out;
+		goto out_nobuffer;
 	}
 
 	len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2582,7 +2543,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 		__buffer_unlock_commit(buffer, event);
 		ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
 	}
- out:
+
+out:
+	put_trace_buf();
+
+out_nobuffer:
 	preempt_enable_notrace();
 	unpause_graph_tracing();
 
-- 
cgit v1.3-8-gc7d7


From e947841c0dce9db675a957182214ef8091ac3d61 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 17 Jun 2016 17:40:58 -0400
Subject: tracing: Show the preempt count of when the event was called

Because tracepoint callbacks are done with preemption enabled, the trace
events are always called with preempt disable due to the
rcu_read_lock_sched_notrace() in __DO_TRACE(). This causes the preempt count
shown in the recorded trace event to be inaccurate. It is always one more
that what the preempt_count was when the tracepoint was called.

If CONFIG_PREEMPT is enabled, subtract 1 from the preempt_count before
recording it in the trace buffer.

Link: http://lkml.kernel.org/r/20160525132537.GA10808@linutronix.de

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fd449eb138cf..03c0a48c3ac4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -261,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
 
 	local_save_flags(fbuffer->flags);
 	fbuffer->pc = preempt_count();
+	/*
+	 * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+	 * preemption (adding one to the preempt_count). Since we are
+	 * interested in the preempt_count at the time the tracepoint was
+	 * hit, we need to subtract one to offset the increment.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT))
+		fbuffer->pc--;
 	fbuffer->trace_file = trace_file;
 
 	fbuffer->event =
-- 
cgit v1.3-8-gc7d7


From 9a51933e360897d9b3867c9b09dd5ccf7493e97e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 11 May 2016 14:06:57 -0500
Subject: tracing: Expose CPU physical addresses (resource values) for PCI
 devices

Previously, mmio_print_pcidev() put "user" addresses in the trace buffer.
On most architectures, these are the same as CPU physical addresses, but on
microblaze, mips, powerpc, and sparc, they may be something else, typically
a raw BAR value (a bus address as opposed to a CPU address).

Always expose the CPU physical address to avoid this arch-dependent
behavior.

This change should have no user-visible effect because this file currently
depends on CONFIG_HAVE_MMIOTRACE_SUPPORT, which is only defined for x86,
and pci_resource_to_user() is a no-op on x86.

Link: http://lkml.kernel.org/r/20160511190657.5898.4248.stgit@bhelgaas-glaptop2.roam.corp.google.com

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_mmiotrace.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 68f376ca6d3f..cd7480d0a201 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
 	trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
 			 dev->bus->number, dev->devfn,
 			 dev->vendor, dev->device, dev->irq);
-	/*
-	 * XXX: is pci_resource_to_user() appropriate, since we are
-	 * supposed to interpret the __ioremap() phys_addr argument based on
-	 * these printed values?
-	 */
 	for (i = 0; i < 7; i++) {
-		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		start = dev->resource[i].start;
 		trace_seq_printf(s, " %llx",
 			(unsigned long long)(start |
 			(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
 	}
 	for (i = 0; i < 7; i++) {
-		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		start = dev->resource[i].start;
+		end = dev->resource[i].end;
 		trace_seq_printf(s, " %llx",
 			dev->resource[i].start < dev->resource[i].end ?
 			(unsigned long long)(end - start) + 1 : 0);
-- 
cgit v1.3-8-gc7d7


From 0fb71d340d355156818bb53eb36ae79a3f88bda9 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnghuan@gmail.com>
Date: Mon, 25 Apr 2016 17:20:28 +0800
Subject: clocksource: Make clocksource insert entry more efficient

In clocksource_enqueue(), it is unnecessary to continue looping
the list, if we find there is an entry that the value of rating
is smaller than the new one. It is safe to be out the loop,
because all of entry are inserted in descending order.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Minfei Huang <mnghuan@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/clocksource.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 56ece145a814..6a5a310a1a53 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs)
 	struct list_head *entry = &clocksource_list;
 	struct clocksource *tmp;
 
-	list_for_each_entry(tmp, &clocksource_list, list)
+	list_for_each_entry(tmp, &clocksource_list, list) {
 		/* Keep track of the place, where to insert */
-		if (tmp->rating >= cs->rating)
-			entry = &tmp->list;
+		if (tmp->rating < cs->rating)
+			break;
+		entry = &tmp->list;
+	}
 	list_add(&cs->list, entry);
 }
 
-- 
cgit v1.3-8-gc7d7


From 0209b937569a133dedfe930cdfff3a0d1d68c9e9 Mon Sep 17 00:00:00 2001
From: Thomas Graziadei <thomas.graziadei@omicronenergy.com>
Date: Tue, 31 May 2016 15:06:06 +0200
Subject: timekeeping: Fix 1ns/tick drift with GENERIC_TIME_VSYSCALL_OLD

The user notices the problem in a raw and real time drift, calling
clock_gettime with CLOCK_REALTIME / CLOCK_MONOTONIC_RAW on a system
with no ntp correction taking place (no ntpd or ptp stuff running).

The problem is, that old_vsyscall_fixup adds an extra 1ns even though
xtime_nsec is already held in full nsecs and the remainder in this
case is 0. Do the rounding up buisness only if needed.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Graziadei <thomas.graziadei@omicronenergy.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/timekeeping.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 479d25cd3d4f..a196e08324e7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
 	* users are removed, this can be killed.
 	*/
 	remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
-	tk->tkr_mono.xtime_nsec -= remainder;
-	tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
-	tk->ntp_error += remainder << tk->ntp_error_shift;
-	tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
+	if (remainder != 0) {
+		tk->tkr_mono.xtime_nsec -= remainder;
+		tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
+		tk->ntp_error += remainder << tk->ntp_error_shift;
+		tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
+	}
 }
 #else
 #define old_vsyscall_fixup(tk)
-- 
cgit v1.3-8-gc7d7


From af4afb40085f04f8b2ea8d28df878f7f0be02f89 Mon Sep 17 00:00:00 2001
From: Pratyush Patel <pratyushpatel.1995@gmail.com>
Date: Tue, 14 Jun 2016 11:00:42 +0200
Subject: alarmtimer: Fix comments describing structure fields

Updated struct alarm and struct alarm_timer descriptions.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Pratyush Patel <pratyushpatel.1995@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/alarmtimer.h | 6 +++---
 kernel/time/alarmtimer.c   | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h
index 52f3b7da4f2d..9d8031257a90 100644
--- a/include/linux/alarmtimer.h
+++ b/include/linux/alarmtimer.h
@@ -26,10 +26,10 @@ enum alarmtimer_restart {
  * struct alarm - Alarm timer structure
  * @node:	timerqueue node for adding to the event list this value
  *		also includes the expiration time.
- * @period:	Period for recuring alarms
+ * @timer:	hrtimer used to schedule events while running
  * @function:	Function pointer to be executed when the timer fires.
- * @type:	Alarm type (BOOTTIME/REALTIME)
- * @enabled:	Flag that represents if the alarm is set to fire or not
+ * @type:	Alarm type (BOOTTIME/REALTIME).
+ * @state:	Flag that represents if the alarm is set to fire or not.
  * @data:	Internal data value.
  */
 struct alarm {
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e840ed867a5d..c3aad685bbc0 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -30,7 +30,6 @@
  * struct alarm_base - Alarm timer bases
  * @lock:		Lock for syncrhonized access to the base
  * @timerqueue:		Timerqueue head managing the list of events
- * @timer: 		hrtimer used to schedule events while running
  * @gettime:		Function to read the time correlating to the base
  * @base_clockid:	clockid for the base
  */
-- 
cgit v1.3-8-gc7d7


From e6c2682a1da36a2e79d9bab470412374434ce89e Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Wed, 8 Jun 2016 22:04:59 -0700
Subject: time: Add time64_to_tm()

time_to_tm() takes time_t as an argument.
time_t is not y2038 safe.
Add time64_to_tm() that takes time64_t as an argument
which is y2038 safe.
The plan is to eventually replace all calls to time_to_tm()
by time64_to_tm().

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/time.h   | 15 ++++++++++++++-
 kernel/time/timeconv.c | 11 ++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time.h b/include/linux/time.h
index 297f09f23896..4cea09d94208 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -205,7 +205,20 @@ struct tm {
 	int tm_yday;
 };
 
-void time_to_tm(time_t totalsecs, int offset, struct tm *result);
+void time64_to_tm(time64_t totalsecs, int offset, struct tm *result);
+
+/**
+ * time_to_tm - converts the calendar time to local broken-down time
+ *
+ * @totalsecs	the number of seconds elapsed since 00:00:00 on January 1, 1970,
+ *		Coordinated Universal Time (UTC).
+ * @offset	offset seconds adding to totalsecs.
+ * @result	pointer to struct tm variable to receive broken-down time
+ */
+static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+{
+	time64_to_tm(totalsecs, offset, result);
+}
 
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 86628e755f38..7142580ad94f 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = {
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
 
 /**
- * time_to_tm - converts the calendar time to local broken-down time
+ * time64_to_tm - converts the calendar time to local broken-down time
  *
  * @totalsecs	the number of seconds elapsed since 00:00:00 on January 1, 1970,
  *		Coordinated Universal Time (UTC).
  * @offset	offset seconds adding to totalsecs.
  * @result	pointer to struct tm variable to receive broken-down time
  */
-void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
 {
 	long days, rem, y;
+	int remainder;
 	const unsigned short *ip;
 
-	days = totalsecs / SECS_PER_DAY;
-	rem = totalsecs % SECS_PER_DAY;
+	days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder);
+	rem = remainder;
 	rem += offset;
 	while (rem < 0) {
 		rem += SECS_PER_DAY;
@@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result)
 	result->tm_mon = y;
 	result->tm_mday = days + 1;
 }
-EXPORT_SYMBOL(time_to_tm);
+EXPORT_SYMBOL(time64_to_tm);
-- 
cgit v1.3-8-gc7d7


From 4a19bd3d22d51a0c89db10879dacaffa0f52aecf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Jun 2016 18:03:02 +0200
Subject: time: Avoid timespec in udelay_test

udelay_test_single() uses ktime_get_ts() to get two timespec values
and calculate the difference between them, while udelay_test_show()
uses the same to printk() the current monotonic time.

Both of these are y2038 safe on all machines, but we want to
get rid of struct timespec anyway, so this converts the code to
use ktime_get_ns() and ktime_get_ts64() respectively.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/test_udelay.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index e622ba365a13..b0928ab3270f 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
 	int allowed_error_ns = usecs * 5;
 
 	for (i = 0; i < iters; ++i) {
-		struct timespec ts1, ts2;
+		s64 kt1, kt2;
 		int time_passed;
 
-		ktime_get_ts(&ts1);
+		kt1 = ktime_get_ns();
 		udelay(usecs);
-		ktime_get_ts(&ts2);
-		time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
+		kt2 = ktime_get_ns();
+		time_passed = kt2 - kt1;
 
 		if (i == 0 || time_passed < min)
 			min = time_passed;
@@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v)
 	if (usecs > 0 && iters > 0) {
 		return udelay_test_single(s, usecs, iters);
 	} else if (usecs == 0) {
-		struct timespec ts;
+		struct timespec64 ts;
 
-		ktime_get_ts(&ts);
-		seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
-				loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
+		ktime_get_ts64(&ts);
+		seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n",
+				loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec);
 		seq_puts(s, "usage:\n");
 		seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
 		seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
-- 
cgit v1.3-8-gc7d7


From 7c71feb0a6766c7c3a262e3cc33ae231f3953cb6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Jun 2016 17:30:47 +0200
Subject: timer: Avoid using timespec

The tstats_show() function prints a ktime_t variable by converting
it to struct timespec first. The algorithm is ok, but we want to
stop using timespec in general because of the 32-bit time_t
overflow problem.

This changes the code to use struct timespec64, without any
functional change.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/timer_stats.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1adecb4b87c8..087204c733eb 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr)
 
 static int tstats_show(struct seq_file *m, void *v)
 {
-	struct timespec period;
+	struct timespec64 period;
 	struct entry *entry;
 	unsigned long ms;
 	long events = 0;
@@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v)
 
 	time = ktime_sub(time_stop, time_start);
 
-	period = ktime_to_timespec(time);
+	period = ktime_to_timespec64(time);
 	ms = period.tv_nsec / 1000000;
 
 	seq_puts(m, "Timer Stats Version: v0.3\n");
-	seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
+	seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);
 	if (atomic_read(&overflow_count))
 		seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
 	seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
-- 
cgit v1.3-8-gc7d7


From e7e15b87f86d4a48c270b81cf027eafd801e5b89 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Jun 2016 13:06:24 -0400
Subject: cgroup: allow NULL return from ss->css_alloc()

cgroup core expected css_alloc to return an ERR_PTR value on failure
and caused NULL deref if it returned NULL.  It's an easy mistake to
make from an alloc function and there's no ambiguity in what's being
indicated.  Update css_create() so that it interprets NULL return from
css_alloc as -ENOMEM.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 78f6d18ff0af..dd26e1bb7222 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5133,6 +5133,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 	lockdep_assert_held(&cgroup_mutex);
 
 	css = ss->css_alloc(parent_css);
+	if (!css)
+		css = ERR_PTR(-ENOMEM);
 	if (IS_ERR(css))
 		return css;
 
-- 
cgit v1.3-8-gc7d7


From 135b8b37bd91cc82f83e98fca109b80375f5317e Mon Sep 17 00:00:00 2001
From: Kenny Yu <kennyyu@fb.com>
Date: Tue, 21 Jun 2016 14:04:36 -0400
Subject: cgroup: Add pids controller event when fork fails because of pid
 limit

This patch adds more visibility into the pids controller when the controller
rejects a fork request. Whenever fork fails because the limit on the number of
pids in the cgroup is reached, the controller will log this and also notify the
newly added cgroups events file. The `max` key in the events file represents
the number of times fork failed because of the pids controller.

This change also logs only the first time the `max` event counter is
incremented. This is to provide a hint to the user to understand why fork
failed, as users are not yet used to seeing fork failures because of the
pids controller.

Signed-off-by: Kenny Yu <kennyyu@fb.com>
Acked-by: Johannes Weiner <hannes <at> cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup_pids.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 303097b37429..9740ea6762de 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -49,6 +49,12 @@ struct pids_cgroup {
 	 */
 	atomic64_t			counter;
 	int64_t				limit;
+
+	/* Handle for "pids.events" */
+	struct cgroup_file		events_file;
+
+	/* Number of times fork failed because limit was hit. */
+	atomic64_t			events_limit;
 };
 
 static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
 
 	pids->limit = PIDS_MAX;
 	atomic64_set(&pids->counter, 0);
+	atomic64_set(&pids->events_limit, 0);
 	return &pids->css;
 }
 
@@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task)
 {
 	struct cgroup_subsys_state *css;
 	struct pids_cgroup *pids;
+	int err;
 
 	css = task_css_check(current, pids_cgrp_id, true);
 	pids = css_pids(css);
-	return pids_try_charge(pids, 1);
+	err = pids_try_charge(pids, 1);
+	if (err) {
+		/* Only log the first time events_limit is incremented. */
+		if (atomic64_inc_return(&pids->events_limit) == 1) {
+			pr_info("cgroup: fork rejected by pids controller in ");
+			pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
+			pr_cont("\n");
+		}
+		cgroup_file_notify(&pids->events_file);
+	}
+	return err;
 }
 
 static void pids_cancel_fork(struct task_struct *task)
@@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
 	return atomic64_read(&pids->counter);
 }
 
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+	struct pids_cgroup *pids = css_pids(seq_css(sf));
+
+	seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit));
+	return 0;
+}
+
 static struct cftype pids_files[] = {
 	{
 		.name = "max",
@@ -300,6 +326,12 @@ static struct cftype pids_files[] = {
 		.read_s64 = pids_current_read,
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
+	{
+		.name = "events",
+		.seq_show = pids_events_show,
+		.file_offset = offsetof(struct pids_cgroup, events_file),
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{ }	/* terminate */
 };
 
-- 
cgit v1.3-8-gc7d7


From 9f6870dd9790dd87da1d0cf9e43e60113f3a278d Mon Sep 17 00:00:00 2001
From: Kenny Yu <kennyyu@fb.com>
Date: Tue, 21 Jun 2016 11:55:35 -0700
Subject: cgroup: Use lld instead of ld when printing pids controller
 events_limit

The `events_limit` variable needs to be formatted with %lld and not %ld.
This fixes the following warning discovered by kbuild test robot:

   kernel/cgroup_pids.c: In function 'pids_events_show':
   kernel/cgroup_pids.c:313:24: warning: format '%ld' expects argument of type
   'long int', but argument 3 has type 'long long int' [-Wformat=]
        seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit));
                                   ^

tj: Added explicit (s64) cast as atomic64 switches between long long
    and long depending on 32 or 64.

Signed-off-by: Kenny Yu <kennyyu@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup_pids.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 9740ea6762de..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -310,7 +310,7 @@ static int pids_events_show(struct seq_file *sf, void *v)
 {
 	struct pids_cgroup *pids = css_pids(seq_css(sf));
 
-	seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit));
+	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
 	return 0;
 }
 
-- 
cgit v1.3-8-gc7d7


From d16dcd3d18759eb955e0325572d07457f93494f5 Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Tue, 21 Jun 2016 10:23:22 +0100
Subject: irqdomain: Fix disposal of mappings for interrupt hierarchies

The function irq_create_of_mapping() is used to create an interrupt
mapping. However, depending on whether the irqdomain, to which the
interrupt belongs, is part of a hierarchy, determines whether the
mapping is created via calling irq_domain_alloc_irqs() or
irq_create_mapping().

To dispose of the interrupt mapping, drivers call irq_dispose_mapping().
However, this function does not check to see if the irqdomain is part
of a hierarchy or not and simply assumes that it was mapped via calling
irq_create_mapping() so calls irq_domain_disassociate() to unmap the
interrupt.

Fix this by checking to see if the irqdomain is part of a hierarchy and
if so call irq_domain_free_irqs() to free/unmap the interrupt.

Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Link: http://lkml.kernel.org/r/1466501002-16368-1-git-send-email-jonathanh@nvidia.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdomain.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index caa6a63d26f0..5d89d724a02a 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -680,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq)
 	if (WARN_ON(domain == NULL))
 		return;
 
-	irq_domain_disassociate(domain, virq);
-	irq_free_desc(virq);
+	if (irq_domain_is_hierarchy(domain)) {
+		irq_domain_free_irqs(virq, 1);
+	} else {
+		irq_domain_disassociate(domain, virq);
+		irq_free_desc(virq);
+	}
 }
 EXPORT_SYMBOL_GPL(irq_dispose_mapping);
 
-- 
cgit v1.3-8-gc7d7


From be54f69c26193de31053190761e521903b89d098 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 23 Jun 2016 14:03:47 -0400
Subject: tracing: Skip more functions when doing stack tracing of events

 # echo 1 > options/stacktrace
 # echo 1 > events/sched/sched_switch/enable
 # cat trace
          <idle>-0     [002] d..2  1982.525169: <stack trace>
 => save_stack_trace
 => __ftrace_trace_stack
 => trace_buffer_unlock_commit_regs
 => event_trigger_unlock_commit
 => trace_event_buffer_commit
 => trace_event_raw_event_sched_switch
 => __schedule
 => schedule
 => schedule_preempt_disabled
 => cpu_startup_entry
 => start_secondary

The above shows that we are seeing 6 functions before ever making it to the
caller of the sched_switch event.

 # echo stacktrace > events/sched/sched_switch/trigger
 # cat trace
          <idle>-0     [002] d..3  2146.335208: <stack trace>
 => trace_event_buffer_commit
 => trace_event_raw_event_sched_switch
 => __schedule
 => schedule
 => schedule_preempt_disabled
 => cpu_startup_entry
 => start_secondary

The stacktrace trigger isn't as bad, because it adds its own skip to the
stacktracing, but still has two events extra.

One issue is that if the stacktrace passes its own "regs" then there should
be no addition to the skip, as the regs will not include the functions being
called. This was an issue that was fixed by commit 7717c6be6999 ("tracing:
Fix stacktrace skip depth in trace_buffer_unlock_commit_regs()" as adding
the skip number for kprobes made the probes not have any stack at all.

But since this is only an issue when regs is being used, a skip should be
added if regs is NULL. Now we have:

 # echo 1 > options/stacktrace
 # echo 1 > events/sched/sched_switch/enable
 # cat trace
          <idle>-0     [000] d..2  1297.676333: <stack trace>
 => __schedule
 => schedule
 => schedule_preempt_disabled
 => cpu_startup_entry
 => rest_init
 => start_kernel
 => x86_64_start_reservations
 => x86_64_start_kernel

 # echo stacktrace > events/sched/sched_switch/trigger
 # cat trace
          <idle>-0     [002] d..3  1370.759745: <stack trace>
 => __schedule
 => schedule
 => schedule_preempt_disabled
 => cpu_startup_entry
 => start_secondary

And kprobes are not touched.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45e6747589c6..3d9f31b576f3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2118,7 +2118,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 {
 	__buffer_unlock_commit(buffer, event);
 
-	ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
+	/*
+	 * If regs is not set, then skip the following callers:
+	 *   trace_buffer_unlock_commit_regs
+	 *   event_trigger_unlock_commit
+	 *   trace_event_buffer_commit
+	 *   trace_event_raw_event_sched_switch
+	 * Note, we can still get here via blktrace, wakeup tracer
+	 * and mmiotrace, but that's ok if they lose a function or
+	 * two. They are that meaningful.
+	 */
+	ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
 
@@ -2168,6 +2178,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	trace.nr_entries	= 0;
 	trace.skip		= skip;
 
+	/*
+	 * Add two, for this function and the call to save_stack_trace()
+	 * If regs is set, then these functions will not be in the way.
+	 */
+	if (!regs)
+		trace.skip += 2;
+
 	/*
 	 * Since events can happen in NMIs there's no safe way to
 	 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
-- 
cgit v1.3-8-gc7d7


From d07b846f6200454c50d791796edb82660192513d Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Wed, 23 Sep 2015 15:16:04 -0500
Subject: fs: Limit file caps to the user namespace of the super block

Capability sets attached to files must be ignored except in the
user namespaces where the mounter is privileged, i.e. s_user_ns
and its descendants. Otherwise a vector exists for gaining
privileges in namespaces where a user is not already privileged.

Add a new helper function, current_in_user_ns(), to test whether a user
namespace is the same as or a descendant of another namespace.
Use this helper to determine whether a file's capability set
should be applied to the caps constructed during exec.

--EWB Replaced in_userns with the simpler current_in_userns.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h |  6 ++++++
 kernel/user_namespace.c        | 14 ++++++++++++++
 security/commoncap.c           |  2 ++
 3 files changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8297e5b341d8..9217169c64cb 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -72,6 +72,7 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t,
 extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
 extern int proc_setgroups_show(struct seq_file *m, void *v);
 extern bool userns_may_setgroups(const struct user_namespace *ns);
+extern bool current_in_userns(const struct user_namespace *target_ns);
 #else
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -100,6 +101,11 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns)
 {
 	return true;
 }
+
+static inline bool current_in_userns(const struct user_namespace *target_ns)
+{
+	return true;
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc211930c..68f594212759 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns)
 	return allowed;
 }
 
+/*
+ * Returns true if @ns is the same namespace as or a descendant of
+ * @target_ns.
+ */
+bool current_in_userns(const struct user_namespace *target_ns)
+{
+	struct user_namespace *ns;
+	for (ns = current_user_ns(); ns; ns = ns->parent) {
+		if (ns == target_ns)
+			return true;
+	}
+	return false;
+}
+
 static inline struct user_namespace *to_user_ns(struct ns_common *ns)
 {
 	return container_of(ns, struct user_namespace, ns);
diff --git a/security/commoncap.c b/security/commoncap.c
index e7fadde737f4..e109e6dac858 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -455,6 +455,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
 
 	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
 		return 0;
+	if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns))
+		return 0;
 
 	rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 	if (rc < 0) {
-- 
cgit v1.3-8-gc7d7


From f295e53b60eb93ee53ed5ac610374ed293caa57b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 17 Jun 2016 11:08:06 -0700
Subject: libnvdimm, pmem: allow nfit_test to override pmem_direct_access()

Currently phys_to_pfn_t() is an exported symbol to allow nfit_test to
override it and indicate that nfit_test-pmem is not device-mapped.  Now,
we want to enable nfit_test to operate without DMA_CMA and the pmem it
provides will no longer be physically contiguous, i.e. won't be capable
of supporting direct_access requests larger than a page.  Make
pmem_direct_access() a weak symbol so that it can be replaced by the
tools/testing/nvdimm/ version, and move phys_to_pfn_t() to a static
inline now that it no longer needs to be overridden.

Acked-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/pmem.c                 | 18 +++------------
 drivers/nvdimm/pmem.h                 | 24 ++++++++++++++++++++
 include/linux/pfn_t.h                 |  5 ++++-
 kernel/memremap.c                     |  6 -----
 tools/testing/nvdimm/Kbuild           |  3 ++-
 tools/testing/nvdimm/pmem-dax.c       | 42 +++++++++++++++++++++++++++++++++++
 tools/testing/nvdimm/test/iomap.c     |  3 ++-
 tools/testing/nvdimm/test/nfit_test.h |  2 ++
 8 files changed, 79 insertions(+), 24 deletions(-)
 create mode 100644 drivers/nvdimm/pmem.h
 create mode 100644 tools/testing/nvdimm/pmem-dax.c

(limited to 'kernel')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ba253df6233f..b6fcb97a601c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -29,23 +29,10 @@
 #include <linux/slab.h>
 #include <linux/pmem.h>
 #include <linux/nd.h>
+#include "pmem.h"
 #include "pfn.h"
 #include "nd.h"
 
-struct pmem_device {
-	/* One contiguous memory region per device */
-	phys_addr_t		phys_addr;
-	/* when non-zero this device is hosting a 'pfn' instance */
-	phys_addr_t		data_offset;
-	u64			pfn_flags;
-	void __pmem		*virt_addr;
-	/* immutable base size of the namespace */
-	size_t			size;
-	/* trim size when namespace capacity has been section aligned */
-	u32			pfn_pad;
-	struct badblocks	bb;
-};
-
 static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
 		unsigned int len)
 {
@@ -163,7 +150,8 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 	return rc;
 }
 
-static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
+__weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
 		      void __pmem **kaddr, pfn_t *pfn, long size)
 {
 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
new file mode 100644
index 000000000000..c48d4e3aa346
--- /dev/null
+++ b/drivers/nvdimm/pmem.h
@@ -0,0 +1,24 @@
+#ifndef __NVDIMM_PMEM_H__
+#define __NVDIMM_PMEM_H__
+#include <linux/badblocks.h>
+#include <linux/types.h>
+#include <linux/pfn_t.h>
+#include <linux/fs.h>
+
+long pmem_direct_access(struct block_device *bdev, sector_t sector,
+		      void __pmem **kaddr, pfn_t *pfn, long size);
+/* this definition is in it's own header for tools/testing/nvdimm to consume */
+struct pmem_device {
+	/* One contiguous memory region per device */
+	phys_addr_t		phys_addr;
+	/* when non-zero this device is hosting a 'pfn' instance */
+	phys_addr_t		data_offset;
+	u64			pfn_flags;
+	void __pmem		*virt_addr;
+	/* immutable base size of the namespace */
+	size_t			size;
+	/* trim size when namespace capacity has been section aligned */
+	u32			pfn_pad;
+	struct badblocks	bb;
+};
+#endif /* __NVDIMM_PMEM_H__ */
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 94994810c7c0..a3d90b9da18d 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -28,7 +28,10 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
 	return __pfn_to_pfn_t(pfn, 0);
 }
 
-extern pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags);
+static inline pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
+{
+	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
+}
 
 static inline bool pfn_t_has_page(pfn_t pfn)
 {
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..852e5266124a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)
 }
 EXPORT_SYMBOL(devm_memunmap);
 
-pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
-{
-	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
-}
-EXPORT_SYMBOL(phys_to_pfn_t);
-
 #ifdef CONFIG_ZONE_DEVICE
 static DEFINE_MUTEX(pgmap_lock);
 static RADIX_TREE(pgmap_radix, GFP_KERNEL);
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 785985677159..4adfa9cdf26f 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -11,12 +11,12 @@ ldflags-y += --wrap=__devm_release_region
 ldflags-y += --wrap=__request_region
 ldflags-y += --wrap=__release_region
 ldflags-y += --wrap=devm_memremap_pages
-ldflags-y += --wrap=phys_to_pfn_t
 
 DRIVERS := ../../../drivers
 NVDIMM_SRC := $(DRIVERS)/nvdimm
 ACPI_SRC := $(DRIVERS)/acpi
 DAX_SRC := $(DRIVERS)/dax
+ccflags-y := -I$(src)/$(NVDIMM_SRC)/
 
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
@@ -31,6 +31,7 @@ nfit-y := $(ACPI_SRC)/nfit.o
 nfit-y += config_check.o
 
 nd_pmem-y := $(NVDIMM_SRC)/pmem.o
+nd_pmem-y += pmem-dax.o
 nd_pmem-y += config_check.o
 
 nd_btt-y := $(NVDIMM_SRC)/btt.o
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
new file mode 100644
index 000000000000..fdba77f2bb06
--- /dev/null
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014-2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include "test/nfit_test.h"
+#include <linux/blkdev.h>
+#include <pmem.h>
+#include <nd.h>
+
+long pmem_direct_access(struct block_device *bdev, sector_t sector,
+		void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	struct pmem_device *pmem = bdev->bd_queue->queuedata;
+	resource_size_t offset = sector * 512 + pmem->data_offset;
+
+	/* disable DAX for nfit_test pmem devices */
+	if (get_nfit_res(pmem->phys_addr + offset)) {
+		dev_info_once(pmem->bb.dev, "dax is disabled for nfit_test\n");
+		return -EIO;
+	}
+
+	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+		return -EIO;
+	*kaddr = pmem->virt_addr + offset;
+	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+
+	/*
+	 * If badblocks are present, limit known good range to the
+	 * requested range.
+	 */
+	if (unlikely(pmem->bb.count))
+		return size;
+	return pmem->size - pmem->pfn_pad - offset;
+}
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index c842095f2801..9966f093b4ff 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -52,7 +52,7 @@ static struct nfit_test_resource *__get_nfit_res(resource_size_t resource)
 	return NULL;
 }
 
-static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
+struct nfit_test_resource *get_nfit_res(resource_size_t resource)
 {
 	struct nfit_test_resource *res;
 
@@ -62,6 +62,7 @@ static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
 
 	return res;
 }
+EXPORT_SYMBOL(get_nfit_res);
 
 void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
 		void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index 96c5e16d7db9..9f18e2a4a862 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -12,6 +12,7 @@
  */
 #ifndef __NFIT_TEST_H__
 #define __NFIT_TEST_H__
+#include <linux/list.h>
 
 struct nfit_test_resource {
 	struct list_head list;
@@ -26,4 +27,5 @@ void __iomem *__wrap_ioremap_nocache(resource_size_t offset,
 void __wrap_iounmap(volatile void __iomem *addr);
 void nfit_test_setup(nfit_test_lookup_fn lookup);
 void nfit_test_teardown(void);
+struct nfit_test_resource *get_nfit_res(resource_size_t resource);
 #endif
-- 
cgit v1.3-8-gc7d7


From 65fe935dd2387a4faf15314c73f5e6d31ef0217e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 13 Jun 2016 15:10:02 -0700
Subject: x86/KASLR, x86/power: Remove x86 hibernation restrictions

With the following fix:

  70595b479ce1 ("x86/power/64: Fix crash whan the hibernation code passes control to the image kernel")

... there is no longer a problem with hibernation resuming a
KASLR-booted kernel image, so remove the restriction.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Len Brown <len.brown@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux PM list <linux-pm@vger.kernel.org>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: linux-doc@vger.kernel.org
Link: http://lkml.kernel.org/r/20160613221002.GA29719@www.outflux.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/kernel-parameters.txt | 10 ++++------
 arch/x86/boot/compressed/kaslr.c    |  7 -------
 kernel/power/hibernate.c            |  6 ------
 3 files changed, 4 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 82b42c958d1c..fa8c6d470ad2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1803,12 +1803,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	js=		[HW,JOY] Analog joystick
 			See Documentation/input/joystick.txt.
 
-	kaslr/nokaslr	[X86]
-			Enable/disable kernel and module base offset ASLR
-			(Address Space Layout Randomization) if built into
-			the kernel. When CONFIG_HIBERNATION is selected,
-			kASLR is disabled by default. When kASLR is enabled,
-			hibernation will be disabled.
+	nokaslr		[KNL]
+			When CONFIG_RANDOMIZE_BASE is set, this disables
+			kernel and module base offset ASLR (Address Space
+			Layout Randomization).
 
 	keepinitrd	[HW,ARM]
 
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index cfeb0259ed81..dff42177cb0c 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -471,17 +471,10 @@ unsigned char *choose_random_location(unsigned long input,
 	unsigned long choice = output;
 	unsigned long random_addr;
 
-#ifdef CONFIG_HIBERNATION
-	if (!cmdline_find_option_bool("kaslr")) {
-		warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected).");
-		goto out;
-	}
-#else
 	if (cmdline_find_option_bool("nokaslr")) {
 		warn("KASLR disabled: 'nokaslr' on cmdline.");
 		goto out;
 	}
-#endif
 
 	boot_params->hdr.loadflags |= KASLR_FLAG;
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fca9254280ee..9021387c6ff4 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1154,11 +1154,6 @@ static int __init nohibernate_setup(char *str)
 	return 1;
 }
 
-static int __init kaslr_nohibernate_setup(char *str)
-{
-	return nohibernate_setup(str);
-}
-
 static int __init page_poison_nohibernate_setup(char *str)
 {
 #ifdef CONFIG_PAGE_POISONING_ZERO
@@ -1182,5 +1177,4 @@ __setup("hibernate=", hibernate_setup);
 __setup("resumewait", resumewait_setup);
 __setup("resumedelay=", resumedelay_setup);
 __setup("nohibernate", nohibernate_setup);
-__setup("kaslr", kaslr_nohibernate_setup);
 __setup("page_poison=", page_poison_nohibernate_setup);
-- 
cgit v1.3-8-gc7d7


From 0dceeaf599e6d9b8bd908ba4bd3dfee84aa26be2 Mon Sep 17 00:00:00 2001
From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Date: Tue, 14 Jun 2016 14:37:27 +0800
Subject: locking/qspinlock: Use __this_cpu_dec() instead of full-blown
 this_cpu_dec()

queued_spin_lock_slowpath() should not worry about another
queued_spin_lock_slowpath() running in interrupt context and
changing node->count by accident, because node->count keeps
the same value every time we enter/leave queued_spin_lock_slowpath().

On some architectures this_cpu_dec() will save/restore irq flags,
which has high overhead. Use the much cheaper __this_cpu_dec() instead.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman.Long@hpe.com
Link: http://lkml.kernel.org/r/1465886247-3773-1-git-send-email-xinhui.pan@linux.vnet.ibm.com
[ Rewrote changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 730655533440..b2caec7315af 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -619,7 +619,7 @@ release:
 	/*
 	 * release the node
 	 */
-	this_cpu_dec(mcs_nodes[0].count);
+	__this_cpu_dec(mcs_nodes[0].count);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
-- 
cgit v1.3-8-gc7d7


From e210bffd39d01b649c94b820c28ff112673266dd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jun 2016 18:51:48 +0200
Subject: sched/fair: Fix and optimize the fork() path

The task_fork_fair() callback already calls __set_task_cpu() and takes
rq->lock.

If we move the sched_class::task_fork callback in sched_fork() under
the existing p->pi_lock, right after its set_task_cpu() call, we can
avoid doing two such calls and omit the IRQ disabling on the rq->lock.

Change to __set_task_cpu() to skip the migration bits, this is a new
task, not a migration. Similarly, make wake_up_new_task() use
__set_task_cpu() for the same reason, the task hasn't actually
migrated as it hasn't ever ran.

This cures the problem of calling migrate_task_rq_fair(), which does
remove_entity_from_load_avg() on tasks that have never been added to
the load avg to begin with.

This bug would result in transiently messed up load_avg values, averaged
out after a few dozen milliseconds. This is probably the reason why
this bug was not found for such a long time.

Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 16 +++++++++++-----
 kernel/sched/fair.c | 27 ++++++---------------------
 2 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e406ba0c6891..fa3434dffbbb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2383,9 +2383,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 		p->sched_class = &fair_sched_class;
 	}
 
-	if (p->sched_class->task_fork)
-		p->sched_class->task_fork(p);
-
 	/*
 	 * The child is not yet in the pid-hash so no cgroup attach races,
 	 * and the cgroup is pinned to this child due to cgroup_fork()
@@ -2394,7 +2391,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 * Silence PROVE_RCU.
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	set_task_cpu(p, cpu);
+	/*
+	 * We're setting the cpu for the first time, we don't migrate,
+	 * so use __set_task_cpu().
+	 */
+	__set_task_cpu(p, cpu);
+	if (p->sched_class->task_fork)
+		p->sched_class->task_fork(p);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #ifdef CONFIG_SCHED_INFO
@@ -2534,8 +2537,11 @@ void wake_up_new_task(struct task_struct *p)
 	 * Fork balancing, do it here and not earlier because:
 	 *  - cpus_allowed can change in the fork path
 	 *  - any previously selected cpu might disappear through hotplug
+	 *
+	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+	 * as we're not fully set-up yet.
 	 */
-	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 	rq = __task_rq_lock(p, &rf);
 	post_init_entity_util_avg(&p->se);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 73063560b9ec..994f5493ee5b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4448,7 +4448,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running increment below.
-		*/
+		 */
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 		cfs_rq->h_nr_running++;
@@ -8289,31 +8289,17 @@ static void task_fork_fair(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se, *curr;
-	int this_cpu = smp_processor_id();
 	struct rq *rq = this_rq();
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
 
+	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 
 	cfs_rq = task_cfs_rq(current);
 	curr = cfs_rq->curr;
-
-	/*
-	 * Not only the cpu but also the task_group of the parent might have
-	 * been changed after parent->se.parent,cfs_rq were copied to
-	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
-	 * of child point to valid ones.
-	 */
-	rcu_read_lock();
-	__set_task_cpu(p, this_cpu);
-	rcu_read_unlock();
-
-	update_curr(cfs_rq);
-
-	if (curr)
+	if (curr) {
+		update_curr(cfs_rq);
 		se->vruntime = curr->vruntime;
+	}
 	place_entity(cfs_rq, se, 1);
 
 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8326,8 +8312,7 @@ static void task_fork_fair(struct task_struct *p)
 	}
 
 	se->vruntime -= cfs_rq->min_vruntime;
-
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
 }
 
 /*
-- 
cgit v1.3-8-gc7d7


From 010114739d294c474764c94196d32fb92e233657 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 17 Jun 2016 11:20:46 +0200
Subject: sched/fair: Fix PELT integrity for new groups

Vincent reported that when a new task is moved into a new cgroup it
gets attached twice to the load tracking:

  sched_move_task()
    task_move_group_fair()
      detach_task_cfs_rq()
      set_task_rq()
      attach_task_cfs_rq()
        attach_entity_load_avg()
          se->avg.last_load_update = cfs_rq->avg.last_load_update // == 0

  enqueue_entity()
    enqueue_entity_load_avg()
      update_cfs_rq_load_avg()
        now = clock()
        __update_load_avg(&cfs_rq->avg)
          cfs_rq->avg.last_load_update = now
          // ages load/util for: now - 0, load/util -> 0
      if (migrated)
        attach_entity_load_avg()
          se->avg.last_load_update = cfs_rq->avg.last_load_update; // now != 0

The problem is that we don't update cfs_rq load_avg before all
entity attach/detach operations. Only enqueue_task() and migrate_task()
do this.

By fixing this, the above will not happen, because the
sched_move_task() attach will have updated cfs_rq's last_load_update
time before attach, and in turn the attach will have set the entity's
last_load_update stamp.

Note that there is a further problem with sched_move_task() calling
detach on a task that hasn't yet been attached; this will be taken
care of in a subsequent patch.

Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by:  Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 994f5493ee5b..dda0ccbd0f3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3083,6 +3083,12 @@ static int idle_balance(struct rq *this_rq);
 
 #else /* CONFIG_SMP */
 
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+	return 0;
+}
+
 static inline void update_load_avg(struct sched_entity *se, int not_used)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -8368,6 +8374,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 now = cfs_rq_clock_task(cfs_rq);
 
 	if (!vruntime_normalized(p)) {
 		/*
@@ -8379,6 +8386,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
 	}
 
 	/* Catch up with the cfs_rq and remove our load when we leave */
+	update_cfs_rq_load_avg(now, cfs_rq, false);
 	detach_entity_load_avg(cfs_rq, se);
 }
 
@@ -8386,6 +8394,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 now = cfs_rq_clock_task(cfs_rq);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
@@ -8396,6 +8405,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
 #endif
 
 	/* Synchronize task with its cfs_rq */
+	update_cfs_rq_load_avg(now, cfs_rq, false);
 	attach_entity_load_avg(cfs_rq, se);
 
 	if (!vruntime_normalized(p))
-- 
cgit v1.3-8-gc7d7


From ea86cb4b7621e1298a37197005bf0abcc86348d4 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 17 Jun 2016 13:38:55 +0200
Subject: sched/cgroup: Fix cpu_cgroup_fork() handling

A new fair task is detached and attached from/to task_group with:

  cgroup_post_fork()
    ss->fork(child) := cpu_cgroup_fork()
      sched_move_task()
        task_move_group_fair()

Which is wrong, because at this point in fork() the task isn't fully
initialized and it cannot 'move' to another group, because its not
attached to any group as yet.

In fact, cpu_cgroup_fork() needs a small part of sched_move_task() so we
can just call this small part directly instead sched_move_task(). And
the task doesn't really migrate because it is not yet attached so we
need the following sequence:

  do_fork()
    sched_fork()
      __set_task_cpu()

    cgroup_post_fork()
      set_task_rq() # set task group and runqueue

    wake_up_new_task()
      select_task_rq() can select a new cpu
      __set_task_cpu
      post_init_entity_util_avg
        attach_task_cfs_rq()
      activate_task
        enqueue_task

This patch makes that happen.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
[ Added TASK_SET_GROUP to set depth properly. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 63 ++++++++++++++++++++++++++++++++++------------------
 kernel/sched/fair.c  | 23 ++++++++++++++++++-
 kernel/sched/sched.h |  5 ++++-
 3 files changed, 67 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa3434dffbbb..3d856c46f6d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7744,27 +7744,9 @@ void sched_offline_group(struct task_group *tg)
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
-/* change task's runqueue when it moves between groups.
- *	The caller of this function should have put the task in its new group
- *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- *	reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
 {
 	struct task_group *tg;
-	int queued, running;
-	struct rq_flags rf;
-	struct rq *rq;
-
-	rq = task_rq_lock(tsk, &rf);
-
-	running = task_current(rq, tsk);
-	queued = task_on_rq_queued(tsk);
-
-	if (queued)
-		dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
-	if (unlikely(running))
-		put_prev_task(rq, tsk);
 
 	/*
 	 * All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7777,11 +7759,37 @@ void sched_move_task(struct task_struct *tsk)
 	tsk->sched_task_group = tg;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (tsk->sched_class->task_move_group)
-		tsk->sched_class->task_move_group(tsk);
+	if (tsk->sched_class->task_change_group)
+		tsk->sched_class->task_change_group(tsk, type);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+	int queued, running;
+	struct rq_flags rf;
+	struct rq *rq;
+
+	rq = task_rq_lock(tsk, &rf);
+
+	running = task_current(rq, tsk);
+	queued = task_on_rq_queued(tsk);
+
+	if (queued)
+		dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+	if (unlikely(running))
+		put_prev_task(rq, tsk);
+
+	sched_change_group(tsk, TASK_MOVE_GROUP);
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
@@ -8209,9 +8217,20 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 	sched_free_group(tg);
 }
 
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
 static void cpu_cgroup_fork(struct task_struct *task)
 {
-	sched_move_task(task);
+	struct rq_flags rf;
+	struct rq *rq;
+
+	rq = task_rq_lock(task, &rf);
+
+	sched_change_group(task, TASK_SET_GROUP);
+
+	task_rq_unlock(rq, task, &rf);
 }
 
 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dda0ccbd0f3d..64f26bc436eb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8466,6 +8466,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+
+	set_task_rq(p, task_cpu(p));
+	se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
 static void task_move_group_fair(struct task_struct *p)
 {
 	detach_task_cfs_rq(p);
@@ -8478,6 +8486,19 @@ static void task_move_group_fair(struct task_struct *p)
 	attach_task_cfs_rq(p);
 }
 
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+	switch (type) {
+	case TASK_SET_GROUP:
+		task_set_group_fair(p);
+		break;
+
+	case TASK_MOVE_GROUP:
+		task_move_group_fair(p);
+		break;
+	}
+}
+
 void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
@@ -8706,7 +8727,7 @@ const struct sched_class fair_sched_class = {
 	.update_curr		= update_curr_fair,
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	.task_move_group	= task_move_group_fair,
+	.task_change_group	= task_change_group_fair,
 #endif
 };
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 71ce9862abc3..307bd0418095 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1246,8 +1246,11 @@ struct sched_class {
 
 	void (*update_curr) (struct rq *rq);
 
+#define TASK_SET_GROUP  0
+#define TASK_MOVE_GROUP	1
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	void (*task_move_group) (struct task_struct *p);
+	void (*task_change_group) (struct task_struct *p, int type);
 #endif
 };
 
-- 
cgit v1.3-8-gc7d7


From 7dc603c9028ea5d4354e0e317e8481df99b06d7e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jun 2016 13:29:28 +0200
Subject: sched/fair: Fix PELT integrity for new tasks

Vincent and Yuyang found another few scenarios in which entity
tracking goes wobbly.

The scenarios are basically due to the fact that new tasks are not
immediately attached and thereby differ from the normal situation -- a
task is always attached to a cfs_rq load average (such that it
includes its blocked contribution) and are explicitly
detached/attached on migration to another cfs_rq.

Scenario 1: switch to fair class

  p->sched_class = fair_class;
  if (queued)
    enqueue_task(p);
      ...
        enqueue_entity()
	  enqueue_entity_load_avg()
	    migrated = !sa->last_update_time (true)
	    if (migrated)
	      attach_entity_load_avg()
  check_class_changed()
    switched_from() (!fair)
    switched_to()   (fair)
      switched_to_fair()
        attach_entity_load_avg()

If @p is a new task that hasn't been fair before, it will have
!last_update_time and, per the above, end up in
attach_entity_load_avg() _twice_.

Scenario 2: change between cgroups

  sched_move_group(p)
    if (queued)
      dequeue_task()
    task_move_group_fair()
      detach_task_cfs_rq()
        detach_entity_load_avg()
      set_task_rq()
      attach_task_cfs_rq()
        attach_entity_load_avg()
    if (queued)
      enqueue_task();
        ...
          enqueue_entity()
	    enqueue_entity_load_avg()
	      migrated = !sa->last_update_time (true)
	      if (migrated)
	        attach_entity_load_avg()

Similar as with scenario 1, if @p is a new task, it will have
!load_update_time and we'll end up in attach_entity_load_avg()
_twice_.

Furthermore, notice how we do a detach_entity_load_avg() on something
that wasn't attached to begin with.

As stated above; the problem is that the new task isn't yet attached
to the load tracking and thereby violates the invariant assumption.

This patch remedies this by ensuring a new task is indeed properly
attached to the load tracking on creation, through
post_init_entity_util_avg().

Of course, this isn't entirely as straightforward as one might think,
since the task is hashed before we call wake_up_new_task() and thus
can be poked at. We avoid this by adding TASK_NEW and teaching
cpu_cgroup_can_attach() to refuse such tasks.

Reported-by: Yuyang Du <yuyang.du@intel.com>
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  5 +++--
 kernel/sched/core.c   | 28 +++++++++++++++++++++++-----
 kernel/sched/fair.c   | 45 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 63 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b45acfd18f4e..d99218a1e043 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p);
 #define TASK_WAKING		256
 #define TASK_PARKED		512
 #define TASK_NOLOAD		1024
-#define TASK_STATE_MAX		2048
+#define TASK_NEW		2048
+#define TASK_STATE_MAX		4096
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d856c46f6d8..14afa518948c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 
 	__sched_fork(clone_flags, p);
 	/*
-	 * We mark the process as running here. This guarantees that
+	 * We mark the process as NEW here. This guarantees that
 	 * nobody will actually run it, and a signal or other external
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
-	p->state = TASK_RUNNING;
+	p->state = TASK_NEW;
 
 	/*
 	 * Make sure we do not leak PI boosting priority to the child.
@@ -2383,6 +2383,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 		p->sched_class = &fair_sched_class;
 	}
 
+	init_entity_runnable_average(&p->se);
+
 	/*
 	 * The child is not yet in the pid-hash so no cgroup attach races,
 	 * and the cgroup is pinned to this child due to cgroup_fork()
@@ -2529,9 +2531,8 @@ void wake_up_new_task(struct task_struct *p)
 	struct rq_flags rf;
 	struct rq *rq;
 
-	/* Initialize new task's runnable average */
-	init_entity_runnable_average(&p->se);
 	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
@@ -8237,6 +8238,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct cgroup_subsys_state *css;
+	int ret = 0;
 
 	cgroup_taskset_for_each(task, css, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8247,8 +8249,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 		if (task->sched_class != &fair_sched_class)
 			return -EINVAL;
 #endif
+		/*
+		 * Serialize against wake_up_new_task() such that if its
+		 * running, we're sure to observe its full state.
+		 */
+		raw_spin_lock_irq(&task->pi_lock);
+		/*
+		 * Avoid calling sched_move_task() before wake_up_new_task()
+		 * has happened. This would lead to problems with PELT, due to
+		 * move wanting to detach+attach while we're not attached yet.
+		 */
+		if (task->state == TASK_NEW)
+			ret = -EINVAL;
+		raw_spin_unlock_irq(&task->pi_lock);
+
+		if (ret)
+			break;
 	}
-	return 0;
+	return ret;
 }
 
 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 64f26bc436eb..0c21a12c0205 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,10 @@ void init_entity_runnable_average(struct sched_entity *se)
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
 /*
  * With new tasks being created, their initial util_avgs are extrapolated
  * based on the cfs_rq's current util_avg:
@@ -720,6 +724,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	struct sched_avg *sa = &se->avg;
 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+	u64 now = cfs_rq_clock_task(cfs_rq);
 
 	if (cap > 0) {
 		if (cfs_rq->avg.util_avg != 0) {
@@ -733,16 +738,37 @@ void post_init_entity_util_avg(struct sched_entity *se)
 		}
 		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 	}
+
+	if (entity_is_task(se)) {
+		struct task_struct *p = task_of(se);
+		if (p->sched_class != &fair_sched_class) {
+			/*
+			 * For !fair tasks do:
+			 *
+			update_cfs_rq_load_avg(now, cfs_rq, false);
+			attach_entity_load_avg(cfs_rq, se);
+			switched_from_fair(rq, p);
+			 *
+			 * such that the next switched_to_fair() has the
+			 * expected state.
+			 */
+			se->avg.last_update_time = now;
+			return;
+		}
+	}
+
+	update_cfs_rq_load_avg(now, cfs_rq, false);
+	attach_entity_load_avg(cfs_rq, se);
 }
 
-#else
+#else /* !CONFIG_SMP */
 void init_entity_runnable_average(struct sched_entity *se)
 {
 }
 void post_init_entity_util_avg(struct sched_entity *se)
 {
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * Update the current task's runtime statistics.
@@ -2840,8 +2866,6 @@ void set_task_rq_fair(struct sched_entity *se,
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
@@ -2951,6 +2975,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	/*
 	 * If we got migrated (either between CPUs or between cgroups) we'll
 	 * have aged the average right before clearing @last_update_time.
+	 *
+	 * Or we're fresh through post_init_entity_util_avg().
 	 */
 	if (se->avg.last_update_time) {
 		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3056,11 +3082,14 @@ void remove_entity_load_avg(struct sched_entity *se)
 	u64 last_update_time;
 
 	/*
-	 * Newly created task or never used group entity should not be removed
-	 * from its (source) cfs_rq
+	 * tasks cannot exit without having gone through wake_up_new_task() ->
+	 * post_init_entity_util_avg() which will have added things to the
+	 * cfs_rq, so we can remove unconditionally.
+	 *
+	 * Similarly for groups, they will have passed through
+	 * post_init_entity_util_avg() before unregister_sched_fair_group()
+	 * calls this.
 	 */
-	if (se->avg.last_update_time == 0)
-		return;
 
 	last_update_time = cfs_rq_last_update_time(cfs_rq);
 
-- 
cgit v1.3-8-gc7d7


From 3d30544f02120b884bba2a9466c87dba980e3be5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Jun 2016 14:27:50 +0200
Subject: sched/fair: Apply more PELT fixes

One additional 'rule' for using update_cfs_rq_load_avg() is that one
should call update_tg_load_avg() if it returns true.

Add a bunch of comments to hopefully clarify some of the rules:

 o  You need to update cfs_rq _before_ any entity attach/detach,
    this is important, because while for mathmatical consisency this
    isn't strictly needed, it is required for the physical
    interpretation of the model, you attach/detach _now_.

 o  When you modify the cfs_rq avg, you have to then call
    update_tg_load_avg() in order to propagate changes upwards.

 o  (Fair) entities are always attached, switched_{to,from}_fair()
    deal with !fair. This directly follows from the definition of the
    cfs_rq averages, namely that they are a direct sum of all
    (runnable or blocked) entities on that rq.

It is the second rule that this patch enforces, but it adds comments
pertaining to all of them.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c21a12c0205..781788d54736 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -692,6 +692,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
 
 /*
@@ -725,6 +726,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 	struct sched_avg *sa = &se->avg;
 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
 	u64 now = cfs_rq_clock_task(cfs_rq);
+	int tg_update;
 
 	if (cap > 0) {
 		if (cfs_rq->avg.util_avg != 0) {
@@ -757,8 +759,10 @@ void post_init_entity_util_avg(struct sched_entity *se)
 		}
 	}
 
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
 	attach_entity_load_avg(cfs_rq, se);
+	if (tg_update)
+		update_tg_load_avg(cfs_rq, false);
 }
 
 #else /* !CONFIG_SMP */
@@ -768,6 +772,9 @@ void init_entity_runnable_average(struct sched_entity *se)
 void post_init_entity_util_avg(struct sched_entity *se)
 {
 }
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -2912,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	WRITE_ONCE(*ptr, res);					\
 } while (0)
 
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed utilization. It is expected
+ * that one calls update_tg_load_avg() on this condition, but after you've
+ * modified the cfs_rq avg (attach/detach), such that we propagate the new
+ * avg up.
+ */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 {
@@ -2967,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 		update_tg_load_avg(cfs_rq, 0);
 }
 
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (!sched_feat(ATTACH_AGE_LOAD))
@@ -2998,6 +3029,14 @@ skip_aging:
 	cfs_rq_util_change(cfs_rq);
 }
 
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -8404,6 +8443,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 now = cfs_rq_clock_task(cfs_rq);
+	int tg_update;
 
 	if (!vruntime_normalized(p)) {
 		/*
@@ -8415,8 +8455,10 @@ static void detach_task_cfs_rq(struct task_struct *p)
 	}
 
 	/* Catch up with the cfs_rq and remove our load when we leave */
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
 	detach_entity_load_avg(cfs_rq, se);
+	if (tg_update)
+		update_tg_load_avg(cfs_rq, false);
 }
 
 static void attach_task_cfs_rq(struct task_struct *p)
@@ -8424,6 +8466,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 now = cfs_rq_clock_task(cfs_rq);
+	int tg_update;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
@@ -8434,8 +8477,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
 #endif
 
 	/* Synchronize task with its cfs_rq */
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
 	attach_entity_load_avg(cfs_rq, se);
+	if (tg_update)
+		update_tg_load_avg(cfs_rq, false);
 
 	if (!vruntime_normalized(p))
 		se->vruntime += cfs_rq->min_vruntime;
-- 
cgit v1.3-8-gc7d7


From 8663e24d56dc1f093232783c23ea17f2a6f61c03 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Jun 2016 14:58:02 +0200
Subject: sched/fair: Reorder cgroup creation code

A future patch needs rq->lock held _after_ we link the task_group into
the hierarchy. In order to avoid taking every rq->lock twice, reorder
things a little and create online_fair_sched_group() to be called
after we link the task_group.

All this code is still ran from css_alloc() so css_online() isn't in
fact used for this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  2 ++
 kernel/sched/fair.c  | 22 ++++++++++++++++++----
 kernel/sched/sched.h |  1 +
 3 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 14afa518948c..4ede4fc65653 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7717,6 +7717,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
+
+	online_fair_sched_group(tg);
 }
 
 /* rcu callback to free various structures associated with a task group */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 781788d54736..62d5e7dcc7f8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8624,10 +8624,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
-
-		raw_spin_lock_irq(&rq->lock);
-		post_init_entity_util_avg(se);
-		raw_spin_unlock_irq(&rq->lock);
 	}
 
 	return 1;
@@ -8638,6 +8634,22 @@ err:
 	return 0;
 }
 
+void online_fair_sched_group(struct task_group *tg)
+{
+	struct sched_entity *se;
+	struct rq *rq;
+	int i;
+
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		se = tg->se[i];
+
+		raw_spin_lock_irq(&rq->lock);
+		post_init_entity_util_avg(se);
+		raw_spin_unlock_irq(&rq->lock);
+	}
+}
+
 void unregister_fair_sched_group(struct task_group *tg)
 {
 	unsigned long flags;
@@ -8742,6 +8754,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	return 1;
 }
 
+void online_fair_sched_group(struct task_group *tg) { }
+
 void unregister_fair_sched_group(struct task_group *tg) { }
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 307bd0418095..28c42b789f70 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
 
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void online_fair_sched_group(struct task_group *tg);
 extern void unregister_fair_sched_group(struct task_group *tg);
 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 			struct sched_entity *se, int cpu,
-- 
cgit v1.3-8-gc7d7


From 599b4840b0ea453c7d11e1798dcc8f494dcfd58a Mon Sep 17 00:00:00 2001
From: Zev Weiss <zev@bewilderbeest.net>
Date: Sun, 26 Jun 2016 16:13:23 -0500
Subject: sched/core: Fix sched_getaffinity() return value kerneldoc comment

Previous version was probably written referencing the man page for
glibc's wrapper, but the wrapper's behavior differs from that of the
syscall itself in this case.

Signed-off-by: Zev Weiss <zev@bewilderbeest.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1466975603-25408-1-git-send-email-zev@bewilderbeest.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ede4fc65653..28da50a5bc76 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4759,7 +4759,8 @@ out_unlock:
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  *
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
  */
 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
 		unsigned long __user *, user_mask_ptr)
-- 
cgit v1.3-8-gc7d7


From 55e16d30bd99510900caec913c90f53bc2b35cba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Jun 2016 15:14:26 +0200
Subject: sched/fair: Rework throttle_count sync

Since we already take rq->lock when creating a cgroup, use it to also
sync the throttle_count and avoid the extra state and enqueue path
branch.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: linux-kernel@vger.kernel.org
[ Fixed build warning. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 39 +++++++++++++++++++--------------------
 kernel/sched/sched.h |  2 +-
 2 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 62d5e7dcc7f8..4088eedea763 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4241,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 	if (!cfs_bandwidth_used())
 		return;
 
-	/* Synchronize hierarchical throttle counter: */
-	if (unlikely(!cfs_rq->throttle_uptodate)) {
-		struct rq *rq = rq_of(cfs_rq);
-		struct cfs_rq *pcfs_rq;
-		struct task_group *tg;
-
-		cfs_rq->throttle_uptodate = 1;
-
-		/* Get closest up-to-date node, because leaves go first: */
-		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
-			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
-			if (pcfs_rq->throttle_uptodate)
-				break;
-		}
-		if (tg) {
-			cfs_rq->throttle_count = pcfs_rq->throttle_count;
-			cfs_rq->throttled_clock_task = rq_clock_task(rq);
-		}
-	}
-
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
@@ -4275,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 		throttle_cfs_rq(cfs_rq);
 }
 
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *pcfs_rq, *cfs_rq;
+
+	if (!cfs_bandwidth_used())
+		return;
+
+	if (!tg->parent)
+		return;
+
+	cfs_rq = tg->cfs_rq[cpu];
+	pcfs_rq = tg->parent->cfs_rq[cpu];
+
+	cfs_rq->throttle_count = pcfs_rq->throttle_count;
+	pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+}
+
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -4414,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -8646,6 +8644,7 @@ void online_fair_sched_group(struct task_group *tg)
 
 		raw_spin_lock_irq(&rq->lock);
 		post_init_entity_util_avg(se);
+		sync_throttle(tg, i);
 		raw_spin_unlock_irq(&rq->lock);
 	}
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 28c42b789f70..f44da95c70cd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -438,7 +438,7 @@ struct cfs_rq {
 
 	u64 throttled_clock, throttled_clock_task;
 	u64 throttled_clock_task_time;
-	int throttled, throttle_count, throttle_uptodate;
+	int throttled, throttle_count;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
-- 
cgit v1.3-8-gc7d7


From 86b2efbe3a390e07dbba725ef700b0d143e9a385 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 24 Jun 2016 16:35:46 -0400
Subject: audit: add fields to exclude filter by reusing user filter

RFE: add additional fields for use in audit filter exclude rules
https://github.com/linux-audit/audit-kernel/issues/5

Re-factor and combine audit_filter_type() with audit_filter_user() to
use audit_filter_user_rules() to enable the exclude filter to
additionally filter on PID, UID, GID, AUID, LOGINUID_SET, SUBJ_*.

The process of combining the similar audit_filter_user() and
audit_filter_type() functions, required inverting the meaning and
including the ALWAYS action of the latter.

Include audit_filter_user_rules() into audit_filter(), removing
unneeded logic in the process.

Keep the check to quit early if the list is empty.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: checkpatch.pl fixes - whitespace damage, wrapped description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h |   2 -
 kernel/audit.c        |   4 +-
 kernel/audit.h        |   2 +
 kernel/auditfilter.c  | 151 ++++++++++++++++++--------------------------------
 4 files changed, 57 insertions(+), 102 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index e38e3fc13ea8..9d4443f93db6 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -163,8 +163,6 @@ extern void audit_log_task_info(struct audit_buffer *ab,
 extern int		    audit_update_lsm_rules(void);
 
 				/* Private API (for audit.c only) */
-extern int audit_filter_user(int type);
-extern int audit_filter_type(int type);
 extern int audit_rule_change(int type, __u32 portid, int seq,
 				void *data, size_t datasz);
 extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);
diff --git a/kernel/audit.c b/kernel/audit.c
index 678c3f000191..994588ef9489 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -934,7 +934,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
 			return 0;
 
-		err = audit_filter_user(msg_type);
+		err = audit_filter(msg_type, AUDIT_FILTER_USER);
 		if (err == 1) { /* match or error */
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
@@ -1382,7 +1382,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 	if (audit_initialized != AUDIT_INITIALIZED)
 		return NULL;
 
-	if (unlikely(audit_filter_type(type)))
+	if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
 		return NULL;
 
 	if (gfp_mask & __GFP_DIRECT_RECLAIM) {
diff --git a/kernel/audit.h b/kernel/audit.h
index cbbe6bb6496e..1879f02cb2c3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -327,6 +327,8 @@ extern pid_t audit_sig_pid;
 extern kuid_t audit_sig_uid;
 extern u32 audit_sig_sid;
 
+extern int audit_filter(int msgtype, unsigned int listtype);
+
 #ifdef CONFIG_AUDITSYSCALL
 extern int __audit_signal_info(int sig, struct task_struct *t);
 static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ff59a5eed691..85d9cac497e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1290,117 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
 	return strncmp(p, dname, dlen);
 }
 
-static int audit_filter_user_rules(struct audit_krule *rule, int type,
-				   enum audit_state *state)
-{
-	int i;
-
-	for (i = 0; i < rule->field_count; i++) {
-		struct audit_field *f = &rule->fields[i];
-		pid_t pid;
-		int result = 0;
-		u32 sid;
-
-		switch (f->type) {
-		case AUDIT_PID:
-			pid = task_pid_nr(current);
-			result = audit_comparator(pid, f->op, f->val);
-			break;
-		case AUDIT_UID:
-			result = audit_uid_comparator(current_uid(), f->op, f->uid);
-			break;
-		case AUDIT_GID:
-			result = audit_gid_comparator(current_gid(), f->op, f->gid);
-			break;
-		case AUDIT_LOGINUID:
-			result = audit_uid_comparator(audit_get_loginuid(current),
-						  f->op, f->uid);
-			break;
-		case AUDIT_LOGINUID_SET:
-			result = audit_comparator(audit_loginuid_set(current),
-						  f->op, f->val);
-			break;
-		case AUDIT_MSGTYPE:
-			result = audit_comparator(type, f->op, f->val);
-			break;
-		case AUDIT_SUBJ_USER:
-		case AUDIT_SUBJ_ROLE:
-		case AUDIT_SUBJ_TYPE:
-		case AUDIT_SUBJ_SEN:
-		case AUDIT_SUBJ_CLR:
-			if (f->lsm_rule) {
-				security_task_getsecid(current, &sid);
-				result = security_audit_rule_match(sid,
-								   f->type,
-								   f->op,
-								   f->lsm_rule,
-								   NULL);
-			}
-			break;
-		}
-
-		if (result <= 0)
-			return result;
-	}
-	switch (rule->action) {
-	case AUDIT_NEVER:
-		*state = AUDIT_DISABLED;
-		break;
-	case AUDIT_ALWAYS:
-		*state = AUDIT_RECORD_CONTEXT;
-		break;
-	}
-	return 1;
-}
-
-int audit_filter_user(int type)
-{
-	enum audit_state state = AUDIT_DISABLED;
-	struct audit_entry *e;
-	int rc, ret;
-
-	ret = 1; /* Audit by default */
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
-		rc = audit_filter_user_rules(&e->rule, type, &state);
-		if (rc) {
-			if (rc > 0 && state == AUDIT_DISABLED)
-				ret = 0;
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	return ret;
-}
-
-int audit_filter_type(int type)
+int audit_filter(int msgtype, unsigned int listtype)
 {
 	struct audit_entry *e;
-	int result = 0;
+	int ret = 1; /* Audit by default */
 
 	rcu_read_lock();
-	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+	if (list_empty(&audit_filter_list[listtype]))
 		goto unlock_and_return;
+	list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {
+		int i, result = 0;
 
-	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
-				list) {
-		int i;
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
-			if (f->type == AUDIT_MSGTYPE) {
-				result = audit_comparator(type, f->op, f->val);
-				if (!result)
-					break;
+			pid_t pid;
+			u32 sid;
+
+			switch (f->type) {
+			case AUDIT_PID:
+				pid = task_pid_nr(current);
+				result = audit_comparator(pid, f->op, f->val);
+				break;
+			case AUDIT_UID:
+				result = audit_uid_comparator(current_uid(), f->op, f->uid);
+				break;
+			case AUDIT_GID:
+				result = audit_gid_comparator(current_gid(), f->op, f->gid);
+				break;
+			case AUDIT_LOGINUID:
+				result = audit_uid_comparator(audit_get_loginuid(current),
+							      f->op, f->uid);
+				break;
+			case AUDIT_LOGINUID_SET:
+				result = audit_comparator(audit_loginuid_set(current),
+							  f->op, f->val);
+				break;
+			case AUDIT_MSGTYPE:
+				result = audit_comparator(msgtype, f->op, f->val);
+				break;
+			case AUDIT_SUBJ_USER:
+			case AUDIT_SUBJ_ROLE:
+			case AUDIT_SUBJ_TYPE:
+			case AUDIT_SUBJ_SEN:
+			case AUDIT_SUBJ_CLR:
+				if (f->lsm_rule) {
+					security_task_getsecid(current, &sid);
+					result = security_audit_rule_match(sid,
+							f->type, f->op, f->lsm_rule, NULL);
+				}
+				break;
+			default:
+				goto unlock_and_return;
 			}
+			if (result < 0) /* error */
+				goto unlock_and_return;
+			if (!result)
+				break;
+		}
+		if (result > 0) {
+			if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE)
+				ret = 0;
+			break;
 		}
-		if (result)
-			goto unlock_and_return;
 	}
 unlock_and_return:
 	rcu_read_unlock();
-	return result;
+	return ret;
 }
 
 static int update_lsm_rule(struct audit_krule *r)
-- 
cgit v1.3-8-gc7d7


From 7fa8b7171a638ad896acabd9a17183b75b70aeb4 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <agnel.joel@gmail.com>
Date: Fri, 17 Jun 2016 22:44:54 -0700
Subject: tracing/function_graph: Fix filters for function_graph threshold

Function graph tracer currently ignores filters if tracing_thresh is set.
For example, even if set_ftrace_pid is set, then its ignored if tracing_thresh
set, resulting in all processes being traced.

To fix this, we reuse the same entry function as when tracing_thresh is not
set and do everything as in the regular case except for writing the function entry
to the ring buffer.

Link: http://lkml.kernel.org/r/1466228694-2677-1-git-send-email-agnel.joel@gmail.com

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Joel Fernandes <agnel.joel@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 67cce7896aeb..7363ccf79512 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	if (ftrace_graph_notrace_addr(trace->func))
 		return 1;
 
+	/*
+	 * Stop here if tracing_threshold is set. We only write function return
+	 * events to the ring buffer.
+	 */
+	if (tracing_thresh)
+		return 1;
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	return ret;
 }
 
-static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
-{
-	if (tracing_thresh)
-		return 1;
-	else
-		return trace_graph_entry(trace);
-}
-
 static void
 __trace_graph_function(struct trace_array *tr,
 		unsigned long ip, unsigned long flags, int pc)
@@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr)
 	set_graph_array(tr);
 	if (tracing_thresh)
 		ret = register_ftrace_graph(&trace_graph_thresh_return,
-					    &trace_graph_thresh_entry);
+					    &trace_graph_entry);
 	else
 		ret = register_ftrace_graph(&trace_graph_return,
 					    &trace_graph_entry);
-- 
cgit v1.3-8-gc7d7


From ea00f4f4f00cc2bc3b63ad512a4e6df3b20832b9 Mon Sep 17 00:00:00 2001
From: Lianwei Wang <lianwei.wang@gmail.com>
Date: Sun, 19 Jun 2016 23:52:27 -0700
Subject: PM / sleep: make PM notifiers called symmetrically

This makes pm notifier PREPARE/POST symmetrical: if PREPARE
fails, we will only undo what ever happened on PREPARE.

It fixes the unbalanced CPU hotplug enable in CPU PM notifier.

Signed-off-by: Lianwei Wang <lianwei.wang@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 20 ++++++++++++--------
 kernel/power/main.c      | 11 +++++++++--
 kernel/power/power.h     |  2 ++
 kernel/power/suspend.c   | 10 ++++++----
 kernel/power/user.c      | 14 ++++++++------
 5 files changed, 37 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fca9254280ee..126e24caa82e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -647,7 +647,7 @@ static void power_down(void)
  */
 int hibernate(void)
 {
-	int error;
+	int error, nr_calls = 0;
 
 	if (!hibernation_available()) {
 		pr_debug("PM: Hibernation not available.\n");
@@ -662,9 +662,11 @@ int hibernate(void)
 	}
 
 	pm_prepare_console();
-	error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
-	if (error)
+	error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+	if (error) {
+		nr_calls--;
 		goto Exit;
+	}
 
 	printk(KERN_INFO "PM: Syncing filesystems ... ");
 	sys_sync();
@@ -714,7 +716,7 @@ int hibernate(void)
 	/* Don't bother checking whether freezer_test_done is true */
 	freezer_test_done = false;
  Exit:
-	pm_notifier_call_chain(PM_POST_HIBERNATION);
+	__pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
 	pm_restore_console();
 	atomic_inc(&snapshot_device_available);
  Unlock:
@@ -740,7 +742,7 @@ int hibernate(void)
  */
 static int software_resume(void)
 {
-	int error;
+	int error, nr_calls = 0;
 	unsigned int flags;
 
 	/*
@@ -827,9 +829,11 @@ static int software_resume(void)
 	}
 
 	pm_prepare_console();
-	error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
-	if (error)
+	error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+	if (error) {
+		nr_calls--;
 		goto Close_Finish;
+	}
 
 	pr_debug("PM: Preparing processes for restore.\n");
 	error = freeze_processes();
@@ -855,7 +859,7 @@ static int software_resume(void)
 	unlock_device_hotplug();
 	thaw_processes();
  Finish:
-	pm_notifier_call_chain(PM_POST_RESTORE);
+	__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
 	pm_restore_console();
 	atomic_inc(&snapshot_device_available);
 	/* For success case, the suspend path will release the lock */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 27946975eff0..5ea50b1b7595 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_pm_notifier);
 
-int pm_notifier_call_chain(unsigned long val)
+int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
 {
-	int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+	int ret;
+
+	ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
+						nr_to_call, nr_calls);
 
 	return notifier_to_errno(ret);
 }
+int pm_notifier_call_chain(unsigned long val)
+{
+	return __pm_notifier_call_chain(val, -1, NULL);
+}
 
 /* If set, devices may be suspended and resumed asynchronously. */
 int pm_async_enabled = 1;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index efe1b3b17c88..51f02ecaf125 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -200,6 +200,8 @@ static inline void suspend_test_finish(const char *label) {}
 
 #ifdef CONFIG_PM_SLEEP
 /* kernel/power/main.c */
+extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
+				    int *nr_calls);
 extern int pm_notifier_call_chain(unsigned long val);
 #endif
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5b70d64b871e..0acab9d7f96f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -266,16 +266,18 @@ static int suspend_test(int level)
  */
 static int suspend_prepare(suspend_state_t state)
 {
-	int error;
+	int error, nr_calls = 0;
 
 	if (!sleep_state_supported(state))
 		return -EPERM;
 
 	pm_prepare_console();
 
-	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
-	if (error)
+	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
+	if (error) {
+		nr_calls--;
 		goto Finish;
+	}
 
 	trace_suspend_resume(TPS("freeze_processes"), 0, true);
 	error = suspend_freeze_processes();
@@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state)
 	suspend_stats.failed_freeze++;
 	dpm_save_failed_step(SUSPEND_FREEZE);
  Finish:
-	pm_notifier_call_chain(PM_POST_SUSPEND);
+	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
 	pm_restore_console();
 	return error;
 }
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e8911460a..35310b627388 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
 static int snapshot_open(struct inode *inode, struct file *filp)
 {
 	struct snapshot_data *data;
-	int error;
+	int error, nr_calls = 0;
 
 	if (!hibernation_available())
 		return -EPERM;
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 			swap_type_of(swsusp_resume_device, 0, NULL) : -1;
 		data->mode = O_RDONLY;
 		data->free_bitmaps = false;
-		error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+		error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
 		if (error)
-			pm_notifier_call_chain(PM_POST_HIBERNATION);
+			__pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
 	} else {
 		/*
 		 * Resuming.  We may need to wait for the image device to
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 
 		data->swap = -1;
 		data->mode = O_WRONLY;
-		error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+		error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
 		if (!error) {
 			error = create_basic_memory_bitmaps();
 			data->free_bitmaps = !error;
-		}
+		} else
+			nr_calls--;
+
 		if (error)
-			pm_notifier_call_chain(PM_POST_RESTORE);
+			__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
 	}
 	if (error)
 		atomic_inc(&snapshot_device_available);
-- 
cgit v1.3-8-gc7d7


From 1ca1cc98bf7418c680415bfce05699f67510a7fd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Jun 2016 12:18:23 +0200
Subject: bpf: minor cleanups on fd maps and helpers

Some minor cleanups: i) Remove the unlikely() from fd array map lookups
and let the CPU branch predictor do its job, scenarios where there is not
always a map entry are very well valid. ii) Move the attribute type check
in the bpf_perf_event_read() helper a bit earlier so it's consistent wrt
checks with bpf_perf_event_output() helper as well. iii) remove some
comments that are self-documenting in kprobe_prog_is_valid_access() and
therefore make it consistent to tp_prog_is_valid_access() as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c        |  3 +--
 kernel/trace/bpf_trace.c | 18 ++++++------------
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550591..d638062f66d6 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn:
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-
 		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
 			goto out;
 
 		tail_call_cnt++;
 
 		prog = READ_ONCE(array->ptrs[index]);
-		if (unlikely(!prog))
+		if (!prog)
 			goto out;
 
 		/* ARG1 at this point is guaranteed to point to CTX from
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3de25fbed785..4e61f74a5d73 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -199,19 +199,19 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 		return -E2BIG;
 
 	ee = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!ee))
+	if (!ee)
 		return -ENOENT;
 
 	event = ee->event;
+	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
+		     event->attr.type != PERF_TYPE_RAW))
+		return -EINVAL;
+
 	/* make sure event is local and doesn't have pmu::count */
 	if (event->oncpu != smp_processor_id() ||
 	    event->pmu->count)
 		return -EINVAL;
 
-	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
-		     event->attr.type != PERF_TYPE_RAW))
-		return -EINVAL;
-
 	/*
 	 * we don't know if the function is run successfully by the
 	 * return value. It can be judged in other places, such as
@@ -251,7 +251,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 		return -E2BIG;
 
 	ee = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!ee))
+	if (!ee)
 		return -ENOENT;
 
 	event = ee->event;
@@ -354,18 +354,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 					enum bpf_reg_type *reg_type)
 {
-	/* check bounds */
 	if (off < 0 || off >= sizeof(struct pt_regs))
 		return false;
-
-	/* only read is allowed */
 	if (type != BPF_READ)
 		return false;
-
-	/* disallow misaligned access */
 	if (off % size != 0)
 		return false;
-
 	return true;
 }
 
-- 
cgit v1.3-8-gc7d7


From d79313303181d357d293453fb8486bdc87bfd2f4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Jun 2016 12:18:24 +0200
Subject: bpf, trace: fetch current cpu only once

We currently have two invocations, which is unnecessary. Fetch it only
once and use the smp_processor_id() variant, so we also get preemption
checks along with it when DEBUG_PREEMPT is set.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4e61f74a5d73..505f9e9cdb3b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -233,6 +233,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
@@ -246,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
 	if (index == BPF_F_CURRENT_CPU)
-		index = raw_smp_processor_id();
+		index = cpu;
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
@@ -259,7 +260,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
 		return -EINVAL;
 
-	if (unlikely(event->oncpu != smp_processor_id()))
+	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
 	perf_sample_data_init(&sample_data, 0, 0);
-- 
cgit v1.3-8-gc7d7


From 6816a7ffce32e999601825ddfd887f36d3052932 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Jun 2016 12:18:25 +0200
Subject: bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_read

Follow-up commit to 1e33759c788c ("bpf, trace: add BPF_F_CURRENT_CPU
flag for bpf_perf_event_output") to add the same functionality into
bpf_perf_event_read() helper. The split of index into flags and index
component is also safe here, since such large maps are rejected during
map allocation time.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  2 +-
 kernel/trace/bpf_trace.c | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 406459b935a2..58df2da3e9bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -347,7 +347,7 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
-/* BPF_FUNC_perf_event_output flags. */
+/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 505f9e9cdb3b..19c5b4a5c3eb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -188,13 +188,19 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
-static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
 {
 	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	unsigned int cpu = smp_processor_id();
+	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 
+	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+		return -EINVAL;
+	if (index == BPF_F_CURRENT_CPU)
+		index = cpu;
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
@@ -208,8 +214,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 		return -EINVAL;
 
 	/* make sure event is local and doesn't have pmu::count */
-	if (event->oncpu != smp_processor_id() ||
-	    event->pmu->count)
+	if (unlikely(event->oncpu != cpu || event->pmu->count))
 		return -EINVAL;
 
 	/*
-- 
cgit v1.3-8-gc7d7


From 80b48c445797a634d869c7e5a53e182ba2688931 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Jun 2016 12:18:26 +0200
Subject: bpf: don't use raw processor id in generic helper

Use smp_processor_id() for the generic helper bpf_get_smp_processor_id()
instead of the raw variant. This allows for preemption checks when we
have DEBUG_PREEMPT, and otherwise uses the raw variant anyway. We only
need to keep the raw variant for socket filters, but we can reuse the
helper that is already there from cBPF side.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/helpers.c |  2 +-
 net/core/filter.c    | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..1ea3afba1a4f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 
 static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
-	return raw_smp_processor_id();
+	return smp_processor_id();
 }
 
 const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
diff --git a/net/core/filter.c b/net/core/filter.c
index cb9fc16cac46..46c88d9cec5c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -150,6 +150,12 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return raw_smp_processor_id();
 }
 
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+	.func		= __get_raw_cpu_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
 			      struct bpf_insn *insn_buf)
 {
@@ -2037,7 +2043,7 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
-		return &bpf_get_smp_processor_id_proto;
+		return &bpf_get_raw_smp_processor_id_proto;
 	case BPF_FUNC_tail_call:
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_ktime_get_ns:
@@ -2086,6 +2092,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_route_realm_proto;
 	case BPF_FUNC_perf_event_output:
 		return bpf_get_event_output_proto();
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.3-8-gc7d7


From eaaa7ec71bff4cb34d9025ed89068d4b3cac3df0 Mon Sep 17 00:00:00 2001
From: Gregor Boirie <gregor.boirie@parrot.com>
Date: Wed, 9 Mar 2016 19:05:48 +0100
Subject: timekeeping: export get_monotonic_coarse64 symbol

EXPORT_SYMBOL() get_monotonic_coarse64 for new IIO timestamping clock
selection usage. This provides user apps the ability to request a
particular IIO device to timestamp samples using a monotonic coarse clock
granularity.

Signed-off-by: Gregor Boirie <gregor.boirie@parrot.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 kernel/time/timekeeping.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 479d25cd3d4f..255e225393ac 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2186,6 +2186,7 @@ struct timespec64 get_monotonic_coarse64(void)
 
 	return now;
 }
+EXPORT_SYMBOL(get_monotonic_coarse64);
 
 /*
  * Must hold jiffies_lock
-- 
cgit v1.3-8-gc7d7


From 5f65e5ca286126a60f62c8421b77c2018a482b8a Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:24 -0500
Subject: cred: Reject inodes with invalid ids in set_create_file_as()

Using INVALID_[UG]ID for the LSM file creation context doesn't
make sense, so return an error if the inode passed to
set_create_file_as() has an invalid id.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/cred.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 0c0cd8a62285..5f264fb5737d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
  */
 int set_create_files_as(struct cred *new, struct inode *inode)
 {
+	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+		return -EINVAL;
 	new->fsuid = inode->i_uid;
 	new->fsgid = inode->i_gid;
 	return security_kernel_create_files_as(new, inode);
-- 
cgit v1.3-8-gc7d7


From 6168f8ed01dc46a277908938294f1132d723f58d Mon Sep 17 00:00:00 2001
From: Wei Jiangang <weijg.fnst@cn.fujitsu.com>
Date: Wed, 29 Jun 2016 12:51:50 +0800
Subject: timers/nohz: Fix several typos

Signed-off-by: Wei Jiangang <weijg.fnst@cn.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: fenghua.yu@intel.com
Link: http://lkml.kernel.org/r/1467175910-2966-2-git-send-email-weijg.fnst@cn.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 536ada80f6dd..6d83e9c4a302 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now)
 	if (delta.tv64 < tick_period.tv64)
 		return;
 
-	/* Reevalute with jiffies_lock held */
+	/* Reevaluate with jiffies_lock held */
 	write_seqlock(&jiffies_lock);
 
 	delta = ktime_sub(now, last_jiffies_update);
@@ -117,7 +117,7 @@ static void tick_sched_do_timer(ktime_t now)
 	/*
 	 * Check if the do_timer duty was dropped. We don't care about
 	 * concurrency: This happens only when the cpu in charge went
-	 * into a long sleep. If two cpus happen to assign themself to
+	 * into a long sleep. If two cpus happen to assign themselves to
 	 * this duty, then the jiffies update is still serialized by
 	 * jiffies_lock.
 	 */
@@ -571,7 +571,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
  * @last_update_time: variable to store update time in. Do not update
  * counters if NULL.
  *
- * Return the cummulative idle time (since boot) for a given
+ * Return the cumulative idle time (since boot) for a given
  * CPU, in microseconds.
  *
  * This time is measured via accounting rather than sampling,
@@ -612,7 +612,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
  * @last_update_time: variable to store update time in. Do not update
  * counters if NULL.
  *
- * Return the cummulative iowait time (since boot) for a given
+ * Return the cumulative iowait time (since boot) for a given
  * CPU, in microseconds.
  *
  * This time is measured via accounting rather than sampling,
@@ -733,7 +733,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	 * do_timer() never invoked. Keep track of the fact that it
 	 * was the one which had the do_timer() duty last. If this cpu
 	 * is the one which had the do_timer() duty last, we limit the
-	 * sleep time to the timekeeping max_deferement value.
+	 * sleep time to the timekeeping max_deferment value.
 	 * Otherwise we can sleep as long as we want.
 	 */
 	delta = timekeeping_max_deferment();
-- 
cgit v1.3-8-gc7d7


From 0de7611a1031f25b713fda7d36de44f17c2ed790 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 1 Jul 2016 12:42:35 +0200
Subject: timers/nohz: Capitalize 'CPU' consistently

While reviewing another patch I noticed that kernel/time/tick-sched.c
had a charmingly (confusingly, annoyingly) rich set of variants for
spelling 'CPU':

  cpu
  cpus
  CPU
  CPUs
  per CPU
  per-CPU
  per cpu

... sometimes these were mixed even within the same comment block!

Compress these variants down to a single consistent set of:

  CPU
  CPUs
  per-CPU

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6d83e9c4a302..db57d1ba73eb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
 #include <trace/events/timer.h>
 
 /*
- * Per cpu nohz control structure
+ * Per-CPU nohz control structure
  */
 static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
 
@@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
 #ifdef CONFIG_NO_HZ_COMMON
 	/*
 	 * Check if the do_timer duty was dropped. We don't care about
-	 * concurrency: This happens only when the cpu in charge went
-	 * into a long sleep. If two cpus happen to assign themselves to
+	 * concurrency: This happens only when the CPU in charge went
+	 * into a long sleep. If two CPUs happen to assign themselves to
 	 * this duty, then the jiffies update is still serialized by
 	 * jiffies_lock.
 	 */
@@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi
 /*
  * Re-evaluate the need for the tick as we switch the current task.
  * It might need the tick due to per task/process properties:
- * perf events, posix cpu timers, ...
+ * perf events, posix CPU timers, ...
  */
 void __tick_nohz_task_switch(void)
 {
@@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void)
  *
  * In case the sched_tick was stopped on this CPU, we have to check if jiffies
  * must be updated. Otherwise an interrupt handler could use a stale jiffy
- * value. We do this unconditionally on any cpu, as we don't know whether the
- * cpu, which has the update task assigned is in a long sleep.
+ * value. We do this unconditionally on any CPU, as we don't know whether the
+ * CPU, which has the update task assigned is in a long sleep.
  */
 static void tick_nohz_update_jiffies(ktime_t now)
 {
@@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
 }
 
 /*
- * Updates the per cpu time idle statistics counters
+ * Updates the per-CPU time idle statistics counters
  */
 static void
 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
@@ -566,7 +566,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 }
 
 /**
- * get_cpu_idle_time_us - get the total idle time of a cpu
+ * get_cpu_idle_time_us - get the total idle time of a CPU
  * @cpu: CPU number to query
  * @last_update_time: variable to store update time in. Do not update
  * counters if NULL.
@@ -607,7 +607,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
 /**
- * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * get_cpu_iowait_time_us - get the total iowait time of a CPU
  * @cpu: CPU number to query
  * @last_update_time: variable to store update time in. Do not update
  * counters if NULL.
@@ -726,12 +726,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	}
 
 	/*
-	 * If this cpu is the one which updates jiffies, then give up
-	 * the assignment and let it be taken by the cpu which runs
-	 * the tick timer next, which might be this cpu as well. If we
+	 * If this CPU is the one which updates jiffies, then give up
+	 * the assignment and let it be taken by the CPU which runs
+	 * the tick timer next, which might be this CPU as well. If we
 	 * don't drop this here the jiffies might be stale and
 	 * do_timer() never invoked. Keep track of the fact that it
-	 * was the one which had the do_timer() duty last. If this cpu
+	 * was the one which had the do_timer() duty last. If this CPU
 	 * is the one which had the do_timer() duty last, we limit the
 	 * sleep time to the timekeeping max_deferment value.
 	 * Otherwise we can sleep as long as we want.
@@ -841,9 +841,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 {
 	/*
-	 * If this cpu is offline and it is the one which updates
+	 * If this CPU is offline and it is the one which updates
 	 * jiffies, then give up the assignment and let it be taken by
-	 * the cpu which runs the tick timer next. If we don't drop
+	 * the CPU which runs the tick timer next. If we don't drop
 	 * this here the jiffies might be stale and do_timer() never
 	 * invoked.
 	 */
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
 	WARN_ON_ONCE(irqs_disabled());
 
 	/*
- 	 * Update the idle state in the scheduler domain hierarchy
- 	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
- 	 * State will be updated to busy during the first busy tick after
- 	 * exiting idle.
- 	 */
+	 * Update the idle state in the scheduler domain hierarchy
+	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
+	 * State will be updated to busy during the first busy tick after
+	 * exiting idle.
+	 */
 	set_cpu_sd_state_idle();
 
 	local_irq_disable();
@@ -1211,7 +1211,7 @@ void tick_setup_sched_timer(void)
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	ts->sched_timer.function = tick_sched_timer;
 
-	/* Get the next period (per cpu) */
+	/* Get the next period (per-CPU) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
 
 	/* Offset the tick to avert jiffies_lock contention. */
-- 
cgit v1.3-8-gc7d7


From 1aacde3d22c42281236155c1ef6d7a5aa32a826b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 30 Jun 2016 17:24:43 +0200
Subject: bpf: generally move prog destruction to RCU deferral

Jann Horn reported following analysis that could potentially result
in a very hard to trigger (if not impossible) UAF race, to quote his
event timeline:

 - Set up a process with threads T1, T2 and T3
 - Let T1 set up a socket filter F1 that invokes another filter F2
   through a BPF map [tail call]
 - Let T1 trigger the socket filter via a unix domain socket write,
   don't wait for completion
 - Let T2 call PERF_EVENT_IOC_SET_BPF with F2, don't wait for completion
 - Now T2 should be behind bpf_prog_get(), but before bpf_prog_put()
 - Let T3 close the file descriptor for F2, dropping the reference
   count of F2 to 2
 - At this point, T1 should have looked up F2 from the map, but not
   finished executing it
 - Let T3 remove F2 from the BPF map, dropping the reference count of
   F2 to 1
 - Now T2 should call bpf_prog_put() (wrong BPF program type), dropping
   the reference count of F2 to 0 and scheduling bpf_prog_free_deferred()
   via schedule_work()
 - At this point, the BPF program could be freed
 - BPF execution is still running in a freed BPF program

While at PERF_EVENT_IOC_SET_BPF time it's only guaranteed that the perf
event fd we're doing the syscall on doesn't disappear from underneath us
for whole syscall time, it may not be the case for the bpf fd used as
an argument only after we did the put. It needs to be a valid fd pointing
to a BPF program at the time of the call to make the bpf_prog_get() and
while T2 gets preempted, F2 must have dropped reference to 1 on the other
CPU. The fput() from the close() in T3 should also add additionally delay
to the reference drop via exit_task_work() when bpf_prog_release() gets
called as well as scheduling bpf_prog_free_deferred().

That said, it makes nevertheless sense to move the BPF prog destruction
generally after RCU grace period to guarantee that such scenario above,
but also others as recently fixed in ceb56070359b ("bpf, perf: delay release
of BPF prog after grace period") with regards to tail calls won't happen.
Integrating bpf_prog_free_deferred() directly into the RCU callback is
not allowed since the invocation might happen from either softirq or
process context, so we're not permitted to block. Reviewing all bpf_prog_put()
invocations from eBPF side (note, cBPF -> eBPF progs don't use this for
their destruction) with call_rcu() look good to me.

Since we don't know whether at the time of attaching the program, we're
already part of a tail call map, we need to use RCU variant. However, due
to this, there won't be severely more stress on the RCU callback queue:
situations with above bpf_prog_get() and bpf_prog_put() combo in practice
normally won't lead to releases, but even if they would, enough effort/
cycles have to be put into loading a BPF program into the kernel already.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  5 -----
 kernel/bpf/arraymap.c |  4 +---
 kernel/bpf/syscall.c  | 13 +++----------
 kernel/events/core.c  |  2 +-
 4 files changed, 5 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8411032ac90d..749549888b86 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -220,7 +220,6 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
-void bpf_prog_put_rcu(struct bpf_prog *prog);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
@@ -281,10 +280,6 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
-
-static inline void bpf_prog_put_rcu(struct bpf_prog *prog)
-{
-}
 #endif /* CONFIG_BPF_SYSCALL */
 
 /* verifier prototypes for helper functions called from eBPF programs */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 5af30732697b..4ec57a649b1f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -390,9 +390,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
 
 static void prog_fd_array_put_ptr(void *ptr)
 {
-	struct bpf_prog *prog = ptr;
-
-	bpf_prog_put_rcu(prog);
+	bpf_prog_put(ptr);
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c23a4e9311b3..f6806a1d7ed9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -623,7 +623,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
 	free_uid(user);
 }
 
-static void __prog_put_common(struct rcu_head *rcu)
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
 
@@ -632,17 +632,10 @@ static void __prog_put_common(struct rcu_head *rcu)
 	bpf_prog_free(aux->prog);
 }
 
-/* version of bpf_prog_put() that is called after a grace period */
-void bpf_prog_put_rcu(struct bpf_prog *prog)
-{
-	if (atomic_dec_and_test(&prog->aux->refcnt))
-		call_rcu(&prog->aux->rcu, __prog_put_common);
-}
-
 void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt))
-		__prog_put_common(&prog->aux->rcu);
+		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
@@ -650,7 +643,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_prog *prog = filp->private_data;
 
-	bpf_prog_put_rcu(prog);
+	bpf_prog_put(prog);
 	return 0;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 85cd41878a74..9c51ec3f0f44 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7529,7 +7529,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 	prog = event->tp_event->prog;
 	if (prog) {
 		event->tp_event->prog = NULL;
-		bpf_prog_put_rcu(prog);
+		bpf_prog_put(prog);
 	}
 }
 
-- 
cgit v1.3-8-gc7d7


From 113214be7f6c98dd6d0435e4765aea8dea91662c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 30 Jun 2016 17:24:44 +0200
Subject: bpf: refactor bpf_prog_get and type check into helper

Since bpf_prog_get() and program type check is used in a couple of places,
refactor this into a small helper function that we can make use of. Since
the non RO prog->aux part is not used in performance critical paths and a
program destruction via RCU is rather very unlikley when doing the put, we
shouldn't have an issue just doing the bpf_prog_get() + prog->type != type
check, but actually not taking the ref at all (due to being in fdget() /
fdput() section of the bpf fd) is even cleaner and makes the diff smaller
as well, so just go for that. Callsites are changed to make use of the new
helper where possible.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h    |  7 +++++++
 kernel/bpf/syscall.c   | 27 +++++++++++++++++++--------
 net/core/filter.c      | 13 +------------
 net/kcm/kcmsock.c      |  8 +-------
 net/packet/af_packet.c |  6 +-----
 net/sched/act_bpf.c    |  7 +------
 net/sched/cls_bpf.c    |  7 +------
 7 files changed, 31 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 749549888b86..b3336b4f5d04 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -218,6 +218,7 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl);
 void bpf_register_map_type(struct bpf_map_type_list *tl);
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
+struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 
@@ -277,6 +278,12 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
+						 enum bpf_prog_type type)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f6806a1d7ed9..22863d9872b1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -657,7 +657,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
 				O_RDWR | O_CLOEXEC);
 }
 
-static struct bpf_prog *__bpf_prog_get(struct fd f)
+static struct bpf_prog *____bpf_prog_get(struct fd f)
 {
 	if (!f.file)
 		return ERR_PTR(-EBADF);
@@ -678,24 +678,35 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
 	return prog;
 }
 
-/* called by sockets/tracing/seccomp before attaching program to an event
- * pairs with bpf_prog_put()
- */
-struct bpf_prog *bpf_prog_get(u32 ufd)
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
 
-	prog = __bpf_prog_get(f);
+	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
+	if (type && prog->type != *type) {
+		prog = ERR_PTR(-EINVAL);
+		goto out;
+	}
 
 	prog = bpf_prog_inc(prog);
+out:
 	fdput(f);
-
 	return prog;
 }
-EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return __bpf_prog_get(ufd, NULL);
+}
+
+struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+{
+	return __bpf_prog_get(ufd, &type);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
 #define	BPF_PROG_LOAD_LAST_FIELD kern_version
diff --git a/net/core/filter.c b/net/core/filter.c
index 76f9a4938be4..76fee35da244 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1301,21 +1301,10 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 
 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
 {
-	struct bpf_prog *prog;
-
 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
 		return ERR_PTR(-EPERM);
 
-	prog = bpf_prog_get(ufd);
-	if (IS_ERR(prog))
-		return prog;
-
-	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
-		bpf_prog_put(prog);
-		return ERR_PTR(-EINVAL);
-	}
-
-	return prog;
+	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
 }
 
 int sk_attach_bpf(u32 ufd, struct sock *sk)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 0b68ba730a06..cb39e05b166c 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1765,18 +1765,12 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
 	if (!csock)
 		return -ENOENT;
 
-	prog = bpf_prog_get(info->bpf_fd);
+	prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER);
 	if (IS_ERR(prog)) {
 		err = PTR_ERR(prog);
 		goto out;
 	}
 
-	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
-		bpf_prog_put(prog);
-		err = -EINVAL;
-		goto out;
-	}
-
 	err = kcm_attach(sock, csock, prog);
 	if (err) {
 		bpf_prog_put(prog);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d1f3b9e977e5..48b58957adf4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1588,13 +1588,9 @@ static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
 	if (copy_from_user(&fd, data, len))
 		return -EFAULT;
 
-	new = bpf_prog_get(fd);
+	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
-	if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
-		bpf_prog_put(new);
-		return -EINVAL;
-	}
 
 	__fanout_set_data_bpf(po->fanout, new);
 	return 0;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index f7b6cf49ea6f..ef74bffa6101 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -223,15 +223,10 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
 
 	bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]);
 
-	fp = bpf_prog_get(bpf_fd);
+	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
-	if (fp->type != BPF_PROG_TYPE_SCHED_ACT) {
-		bpf_prog_put(fp);
-		return -EINVAL;
-	}
-
 	if (tb[TCA_ACT_BPF_NAME]) {
 		name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]),
 			       nla_len(tb[TCA_ACT_BPF_NAME]),
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 7b342c779da7..c3002c2c68bb 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -272,15 +272,10 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 
 	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
 
-	fp = bpf_prog_get(bpf_fd);
+	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
-	if (fp->type != BPF_PROG_TYPE_SCHED_CLS) {
-		bpf_prog_put(fp);
-		return -EINVAL;
-	}
-
 	if (tb[TCA_BPF_NAME]) {
 		name = kmemdup(nla_data(tb[TCA_BPF_NAME]),
 			       nla_len(tb[TCA_BPF_NAME]),
-- 
cgit v1.3-8-gc7d7


From 1f3fe7ebf6136c341012db9f554d4caa566fcbaa Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 30 Jun 2016 10:28:42 -0700
Subject: cgroup: Add cgroup_get_from_fd

Add a helper function to get a cgroup2 from a fd.  It will be
stored in a bpf array (BPF_MAP_TYPE_CGROUP_ARRAY) which will
be introduced in the later patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c        | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a20320c666fd..984f73b719a9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -87,6 +87,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 						       struct cgroup_subsys *ss);
 
 struct cgroup *cgroup_get_from_path(const char *path);
+struct cgroup *cgroup_get_from_fd(int fd);
 
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff00aca6..50787cd61da2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -62,6 +62,7 @@
 #include <linux/proc_ns.h>
 #include <linux/nsproxy.h>
 #include <linux/proc_ns.h>
+#include <linux/file.h>
 #include <net/sock.h>
 
 /*
@@ -6209,6 +6210,40 @@ struct cgroup *cgroup_get_from_path(const char *path)
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
 
+/**
+ * cgroup_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup2_dir)
+ *
+ * Find the cgroup from a fd which should be obtained
+ * by opening a cgroup directory.  Returns a pointer to the
+ * cgroup on success. ERR_PTR is returned if the cgroup
+ * cannot be found.
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup *cgrp;
+	struct file *f;
+
+	f = fget_raw(fd);
+	if (!f)
+		return ERR_PTR(-EBADF);
+
+	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+	fput(f);
+	if (IS_ERR(css))
+		return ERR_CAST(css);
+
+	cgrp = css->cgroup;
+	if (!cgroup_on_dfl(cgrp)) {
+		cgroup_put(cgrp);
+		return ERR_PTR(-EBADF);
+	}
+
+	return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+
 /*
  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
  * definition in cgroup-defs.h.
-- 
cgit v1.3-8-gc7d7


From 4ed8ec521ed57c4e207ad464ca0388776de74d4b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 30 Jun 2016 10:28:43 -0700
Subject: cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

Add a BPF_MAP_TYPE_CGROUP_ARRAY and its bpf_map_ops's implementations.
To update an element, the caller is expected to obtain a cgroup2 backed
fd by open(cgroup2_dir) and then update the array with that fd.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/arraymap.c    | 43 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     |  3 ++-
 kernel/bpf/verifier.c    |  2 ++
 4 files changed, 48 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index be6ac1291680..26c04be32003 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -84,6 +84,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERCPU_HASH,
 	BPF_MAP_TYPE_PERCPU_ARRAY,
 	BPF_MAP_TYPE_STACK_TRACE,
+	BPF_MAP_TYPE_CGROUP_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 4ec57a649b1f..db1a743e3db2 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -537,3 +537,46 @@ static int __init register_perf_event_array_map(void)
 	return 0;
 }
 late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+				     struct file *map_file /* not used */,
+				     int fd)
+{
+	return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+	/* cgroup_put free cgrp after a rcu grace period */
+	cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+	bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+	.map_alloc = fd_array_map_alloc,
+	.map_free = cgroup_fd_array_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
+	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+	.ops = &cgroup_array_ops,
+	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+	bpf_register_map_type(&cgroup_array_type);
+	return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 22863d9872b1..96d938a22050 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -393,7 +393,8 @@ static int map_update_elem(union bpf_attr *attr)
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
-		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+		   map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
 		rcu_read_lock();
 		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
 						   attr->flags);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eec9f90ba030..69ba2251a22b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1035,6 +1035,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (func_id != BPF_FUNC_get_stackid)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_CGROUP_ARRAY:
+		goto error;
 	default:
 		break;
 	}
-- 
cgit v1.3-8-gc7d7


From 4a482f34afcc162d8456f449b137ec2a95be60d8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 30 Jun 2016 10:28:44 -0700
Subject: cgroup: bpf: Add bpf_skb_in_cgroup_proto

Adds a bpf helper, bpf_skb_in_cgroup, to decide if a skb->sk
belongs to a descendant of a cgroup2.  It is similar to the
feature added in netfilter:
commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")

The user is expected to populate a BPF_MAP_TYPE_CGROUP_ARRAY
which will be used by the bpf_skb_in_cgroup.

Modifications to the bpf verifier is to ensure BPF_MAP_TYPE_CGROUP_ARRAY
and bpf_skb_in_cgroup() are always used together.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 11 +++++++++++
 kernel/bpf/verifier.c    |  8 +++++++-
 net/core/filter.c        | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 26c04be32003..f44504d875e2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -337,6 +337,17 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_change_type,
 
+	/**
+	 * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb
+	 * @skb: pointer to skb
+	 * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+	 * @index: index of the cgroup in the bpf_map
+	 * Return:
+	 *   == 0 skb failed the cgroup2 descendant test
+	 *   == 1 skb succeeded the cgroup2 descendant test
+	 *    < 0 error
+	 */
+	BPF_FUNC_skb_in_cgroup,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 69ba2251a22b..e206c2181412 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1036,7 +1036,9 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		goto error;
+		if (func_id != BPF_FUNC_skb_in_cgroup)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -1056,6 +1058,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
+	case BPF_FUNC_skb_in_cgroup:
+		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+			goto error;
+		break;
 	default:
 		break;
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index 76fee35da244..54071cf70fb5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2239,6 +2239,40 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 	}
 }
 
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *)(long)r1;
+	struct bpf_map *map = (struct bpf_map *)(long)r2;
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct cgroup *cgrp;
+	struct sock *sk;
+	u32 i = (u32)r3;
+
+	sk = skb->sk;
+	if (!sk || !sk_fullsock(sk))
+		return -ENOENT;
+
+	if (unlikely(i >= array->map.max_entries))
+		return -E2BIG;
+
+	cgrp = READ_ONCE(array->ptrs[i]);
+	if (unlikely(!cgrp))
+		return -EAGAIN;
+
+	return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
+}
+
+static const struct bpf_func_proto bpf_skb_in_cgroup_proto = {
+	.func		= bpf_skb_in_cgroup,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+#endif
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
@@ -2307,6 +2341,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return bpf_get_event_output_proto();
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	case BPF_FUNC_skb_in_cgroup:
+		return &bpf_skb_in_cgroup_proto;
+#endif
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.3-8-gc7d7


From 7b776af66dc462caa7e839cc5c950a61db1f8551 Mon Sep 17 00:00:00 2001
From: Roger Lu <roger.lu@mediatek.com>
Date: Fri, 1 Jul 2016 11:05:02 +0800
Subject: PM / suspend: show workqueue state in suspend flow

If freezable workqueue aborts suspend flow, show
workqueue state for debug purpose.

Signed-off-by: Roger Lu <roger.lu@mediatek.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/process.c | 3 +++
 kernel/workqueue.c     | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index df058bed53ce..6eef250a5705 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only)
 		       elapsed_msecs / 1000, elapsed_msecs % 1000,
 		       todo - wq_busy, wq_busy);
 
+		if (wq_busy)
+			show_workqueue_state();
+
 		if (!wakeup) {
 			read_lock(&tasklist_lock);
 			for_each_process_thread(g, p) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e1c0e996b5ae..619e80ce4a59 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq)
 /**
  * show_workqueue_state - dump workqueue state
  *
- * Called from a sysrq handler and prints out all busy workqueues and
- * pools.
+ * Called from a sysrq handler or try_to_freeze_tasks() and prints out
+ * all busy workqueues and pools.
  */
 void show_workqueue_state(void)
 {
-- 
cgit v1.3-8-gc7d7


From 9c744481c003697de453e8fc039468143ba604aa Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 29 Jun 2016 03:00:51 +0200
Subject: PM / hibernate: Do not free preallocated safe pages during image
 restore

The core image restoration code preallocates some safe pages
(ie. pages that weren't used by the image kernel before hibernation)
for future use before allocating the bulk of memory for loading the
image data.  Those safe pages are then freed so they can be allocated
again (with the memory management subsystem's help).  That's done to
ensure that there will be enough safe pages for temporary data
structures needed during image restoration.

However, it is not really necessary to free those pages after they
have been allocated.  They can be added to the (global) list of
safe pages right away and then picked up from there when needed
without freeing.

That reduces the overhead related to using safe pages, especially
in the arch-specific code, so modify the code accordingly.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 66 ++++++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a970604308f..d9476ff877b8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -74,6 +74,22 @@ void __init hibernate_image_size_init(void)
  */
 struct pbe *restore_pblist;
 
+/* struct linked_page is used to build chains of pages */
+
+#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+	struct linked_page *next;
+	char data[LINKED_PAGE_DATA_SIZE];
+} __packed;
+
+/*
+ * List of "safe" pages (ie. pages that were not used by the image kernel
+ * before hibernation) that may be used as temporary storage for image kernel
+ * memory contents.
+ */
+static struct linked_page *safe_pages_list;
+
 /* Pointer to an auxiliary buffer (1 page) */
 static void *buffer;
 
@@ -113,9 +129,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
 	return res;
 }
 
+static void *__get_safe_page(gfp_t gfp_mask)
+{
+	if (safe_pages_list) {
+		void *ret = safe_pages_list;
+
+		safe_pages_list = safe_pages_list->next;
+		memset(ret, 0, PAGE_SIZE);
+		return ret;
+	}
+	return get_image_page(gfp_mask, PG_SAFE);
+}
+
 unsigned long get_safe_page(gfp_t gfp_mask)
 {
-	return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+	return (unsigned long)__get_safe_page(gfp_mask);
 }
 
 static struct page *alloc_image_page(gfp_t gfp_mask)
@@ -150,15 +178,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
 	__free_page(page);
 }
 
-/* struct linked_page is used to build chains of pages */
-
-#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *))
-
-struct linked_page {
-	struct linked_page *next;
-	char data[LINKED_PAGE_DATA_SIZE];
-} __packed;
-
 static inline void
 free_list_of_pages(struct linked_page *list, int clear_page_nosave)
 {
@@ -208,7 +227,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 	if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
 		struct linked_page *lp;
 
-		lp = get_image_page(ca->gfp_mask, ca->safe_needed);
+		lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) :
+					get_image_page(ca->gfp_mask, PG_ANY);
 		if (!lp)
 			return NULL;
 
@@ -2104,11 +2124,6 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 	return 0;
 }
 
-/* List of "safe" pages that may be used to store data loaded from the suspend
- * image
- */
-static struct linked_page *safe_pages_list;
-
 #ifdef CONFIG_HIGHMEM
 /* struct highmem_pbe is used for creating the list of highmem pages that
  * should be restored atomically during the resume from disk, because the page
@@ -2334,7 +2349,7 @@ static int
 prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
 	unsigned int nr_pages, nr_highmem;
-	struct linked_page *sp_list, *lp;
+	struct linked_page *lp;
 	int error;
 
 	/* If there is no highmem, the buffer will not be necessary */
@@ -2362,9 +2377,9 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	 * NOTE: This way we make sure there will be enough safe pages for the
 	 * chain_alloc() in get_buffer().  It is a bit wasteful, but
 	 * nr_copy_pages cannot be greater than 50% of the memory anyway.
+	 *
+	 * nr_copy_pages cannot be less than allocated_unsafe_pages too.
 	 */
-	sp_list = NULL;
-	/* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
 	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
 	nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
 	while (nr_pages > 0) {
@@ -2373,12 +2388,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 			error = -ENOMEM;
 			goto Free;
 		}
-		lp->next = sp_list;
-		sp_list = lp;
+		lp->next = safe_pages_list;
+		safe_pages_list = lp;
 		nr_pages--;
 	}
 	/* Preallocate memory for the image */
-	safe_pages_list = NULL;
 	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
 	while (nr_pages > 0) {
 		lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
@@ -2396,12 +2410,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 		swsusp_set_page_free(virt_to_page(lp));
 		nr_pages--;
 	}
-	/* Free the reserved safe pages so that chain_alloc() can use them */
-	while (sp_list) {
-		lp = sp_list->next;
-		free_image_page(sp_list, PG_UNSAFE_CLEAR);
-		sp_list = lp;
-	}
 	return 0;
 
  Free:
@@ -2491,6 +2499,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
 		if (error)
 			return error;
 
+		safe_pages_list = NULL;
+
 		error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
 		if (error)
 			return error;
-- 
cgit v1.3-8-gc7d7


From 6dbecfd345a617888da370b13d5b190c9ff3df53 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 29 Jun 2016 03:02:16 +0200
Subject: PM / hibernate: Simplify mark_unsafe_pages()

Rework mark_unsafe_pages() to use a simpler method of clearing
all bits in free_pages_map and to set the bits for the "unsafe"
pages (ie. pages that were used by the image kernel before
hibernation) with the help of duplicate_memory_bitmap().

For this purpose, move the pfn_valid() check from mark_unsafe_pages()
to unpack_orig_pfns() where the "unsafe" pages are discovered.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 64 +++++++++++++++++++------------------------------
 1 file changed, 25 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d9476ff877b8..39bbad5fac5a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2019,53 +2019,41 @@ int snapshot_read_next(struct snapshot_handle *handle)
 	return PAGE_SIZE;
 }
 
+static void duplicate_memory_bitmap(struct memory_bitmap *dst,
+				    struct memory_bitmap *src)
+{
+	unsigned long pfn;
+
+	memory_bm_position_reset(src);
+	pfn = memory_bm_next_pfn(src);
+	while (pfn != BM_END_OF_MAP) {
+		memory_bm_set_bit(dst, pfn);
+		pfn = memory_bm_next_pfn(src);
+	}
+}
+
 /**
  *	mark_unsafe_pages - mark the pages that cannot be used for storing
  *	the image during resume, because they conflict with the pages that
  *	had been used before suspend
  */
 
-static int mark_unsafe_pages(struct memory_bitmap *bm)
+static void mark_unsafe_pages(struct memory_bitmap *bm)
 {
-	struct zone *zone;
-	unsigned long pfn, max_zone_pfn;
+	unsigned long pfn;
 
-	/* Clear page flags */
-	for_each_populated_zone(zone) {
-		max_zone_pfn = zone_end_pfn(zone);
-		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-			if (pfn_valid(pfn))
-				swsusp_unset_page_free(pfn_to_page(pfn));
+	/* Clear the "free"/"unsafe" bit for all PFNs */
+	memory_bm_position_reset(free_pages_map);
+	pfn = memory_bm_next_pfn(free_pages_map);
+	while (pfn != BM_END_OF_MAP) {
+		memory_bm_clear_current(free_pages_map);
+		pfn = memory_bm_next_pfn(free_pages_map);
 	}
 
-	/* Mark pages that correspond to the "original" pfns as "unsafe" */
-	memory_bm_position_reset(bm);
-	do {
-		pfn = memory_bm_next_pfn(bm);
-		if (likely(pfn != BM_END_OF_MAP)) {
-			if (likely(pfn_valid(pfn)))
-				swsusp_set_page_free(pfn_to_page(pfn));
-			else
-				return -EFAULT;
-		}
-	} while (pfn != BM_END_OF_MAP);
+	/* Mark pages that correspond to the "original" PFNs as "unsafe" */
+	duplicate_memory_bitmap(free_pages_map, bm);
 
 	allocated_unsafe_pages = 0;
-
-	return 0;
-}
-
-static void
-duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
-{
-	unsigned long pfn;
-
-	memory_bm_position_reset(src);
-	pfn = memory_bm_next_pfn(src);
-	while (pfn != BM_END_OF_MAP) {
-		memory_bm_set_bit(dst, pfn);
-		pfn = memory_bm_next_pfn(src);
-	}
 }
 
 static int check_header(struct swsusp_info *info)
@@ -2115,7 +2103,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 		/* Extract and buffer page key for data page (s390 only). */
 		page_key_memorize(buf + j);
 
-		if (memory_bm_pfn_present(bm, buf[j]))
+		if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
 			memory_bm_set_bit(bm, buf[j]);
 		else
 			return -EFAULT;
@@ -2357,9 +2345,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	buffer = NULL;
 
 	nr_highmem = count_highmem_image_pages(bm);
-	error = mark_unsafe_pages(bm);
-	if (error)
-		goto Free;
+	mark_unsafe_pages(bm);
 
 	error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
 	if (error)
-- 
cgit v1.3-8-gc7d7


From 307c5971c972ef2bfd541d2850b36a692c6354c9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 29 Jun 2016 03:05:10 +0200
Subject: PM / hibernate: Recycle safe pages after image restoration

One of the memory bitmaps used by the hibernation image restoration
code is freed after the image has been loaded.

That is not quite efficient, though, because the memory pages used
for building that bitmap are known to be safe (ie. they were not
used by the image kernel before hibernation) and the arch-specific
code finalizing the image restoration may need them.  In that case
it needs to allocate those pages again via the memory management
subsystem, check if they are really safe again by consulting the
other bitmaps and so on.

To avoid that, recycle those pages by putting them into the global
list of known safe pages so that they can be given to the arch code
right away when necessary.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 39bbad5fac5a..94b6fe6c9ae3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -158,6 +158,14 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
 	return page;
 }
 
+static void recycle_safe_page(void *page_address)
+{
+	struct linked_page *lp = page_address;
+
+	lp->next = safe_pages_list;
+	safe_pages_list = lp;
+}
+
 /**
  *	free_image_page - free page represented by @addr, allocated with
  *	get_image_page (page flags set by it must be cleared)
@@ -852,6 +860,34 @@ struct nosave_region {
 
 static LIST_HEAD(nosave_regions);
 
+static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone)
+{
+	struct rtree_node *node;
+
+	list_for_each_entry(node, &zone->nodes, list)
+		recycle_safe_page(node->data);
+
+	list_for_each_entry(node, &zone->leaves, list)
+		recycle_safe_page(node->data);
+}
+
+static void memory_bm_recycle(struct memory_bitmap *bm)
+{
+	struct mem_zone_bm_rtree *zone;
+	struct linked_page *p_list;
+
+	list_for_each_entry(zone, &bm->zones, list)
+		recycle_zone_bm_rtree(zone);
+
+	p_list = bm->p_list;
+	while (p_list) {
+		struct linked_page *lp = p_list;
+
+		p_list = lp->next;
+		recycle_safe_page(lp);
+	}
+}
+
 /**
  *	register_nosave_region - register a range of page frames the contents
  *	of which should not be saved during the suspend (to be used in the early
@@ -2542,9 +2578,9 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
 	/* Restore page key for data page (s390 only). */
 	page_key_write(handle->buffer);
 	page_key_free();
-	/* Free only if we have loaded the image entirely */
+	/* Do that only if we have loaded the image entirely */
 	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
-		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+		memory_bm_recycle(&orig_bm);
 		free_highmem_data();
 	}
 }
-- 
cgit v1.3-8-gc7d7


From b6140914fd079e43ea75a53429b47128584f033a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 17:39:22 +0900
Subject: genirq/msi: Remove unused MSI_FLAG_IDENTITY_MAP

No user and we definitely don't want to grow one.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: linux-block@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Cc: axboe@fb.com
Cc: agordeev@redhat.com
Link: http://lkml.kernel.org/r/1467621574-8277-2-git-send-email-hch@lst.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/msi.h | 6 ++----
 kernel/irq/msi.c    | 8 ++------
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 8b425c66305a..c33abfa0f5a7 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -264,12 +264,10 @@ enum {
 	 * callbacks.
 	 */
 	MSI_FLAG_USE_DEF_CHIP_OPS	= (1 << 1),
-	/* Build identity map between hwirq and irq */
-	MSI_FLAG_IDENTITY_MAP		= (1 << 2),
 	/* Support multiple PCI MSI interrupts */
-	MSI_FLAG_MULTI_PCI_MSI		= (1 << 3),
+	MSI_FLAG_MULTI_PCI_MSI		= (1 << 2),
 	/* Support PCI MSIX interrupts */
-	MSI_FLAG_PCI_MSIX		= (1 << 4),
+	MSI_FLAG_PCI_MSIX		= (1 << 3),
 };
 
 int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 38e89ce7b071..eb5bf2b50b07 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 	struct msi_domain_ops *ops = info->ops;
 	msi_alloc_info_t arg;
 	struct msi_desc *desc;
-	int i, ret, virq = -1;
+	int i, ret, virq;
 
 	ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
 	if (ret)
@@ -332,12 +332,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 
 	for_each_msi_entry(desc, dev) {
 		ops->set_desc(&arg, desc);
-		if (info->flags & MSI_FLAG_IDENTITY_MAP)
-			virq = (int)ops->get_hwirq(info, &arg);
-		else
-			virq = -1;
 
-		virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+		virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
 					       dev_to_node(dev), &arg, false);
 		if (virq < 0) {
 			ret = -ENOSPC;
-- 
cgit v1.3-8-gc7d7


From 9c2555835bb3d34dfac52a0be943dcc4bedd650f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 17:39:23 +0900
Subject: genirq: Introduce IRQD_AFFINITY_MANAGED flag

Interupts marked with this flag are excluded from user space interrupt
affinity changes. Contrary to the IRQ_NO_BALANCING flag, the kernel internal
affinity mechanism is not blocked.

This flag will be used for multi-queue device interrupts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: linux-block@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Cc: axboe@fb.com
Cc: agordeev@redhat.com
Link: http://lkml.kernel.org/r/1467621574-8277-3-git-send-email-hch@lst.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h    |  7 +++++++
 kernel/irq/internals.h |  2 ++
 kernel/irq/manage.c    | 21 ++++++++++++++++++---
 kernel/irq/proc.c      |  2 +-
 4 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 4d758a7c604a..f6074813688d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -197,6 +197,7 @@ struct irq_data {
  * IRQD_IRQ_INPROGRESS		- In progress state of the interrupt
  * IRQD_WAKEUP_ARMED		- Wakeup mode armed
  * IRQD_FORWARDED_TO_VCPU	- The interrupt is forwarded to a VCPU
+ * IRQD_AFFINITY_MANAGED	- Affinity is auto-managed by the kernel
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -212,6 +213,7 @@ enum {
 	IRQD_IRQ_INPROGRESS		= (1 << 18),
 	IRQD_WAKEUP_ARMED		= (1 << 19),
 	IRQD_FORWARDED_TO_VCPU		= (1 << 20),
+	IRQD_AFFINITY_MANAGED		= (1 << 21),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -305,6 +307,11 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d)
 	__irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU;
 }
 
+static inline bool irqd_affinity_is_managed(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED;
+}
+
 #undef __irqd_to_state
 
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 09be2c903c6d..b15aa3b617a2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -105,6 +105,8 @@ static inline void unregister_handler_proc(unsigned int irq,
 					   struct irqaction *action) { }
 #endif
 
+extern bool irq_can_set_affinity_usr(unsigned int irq);
+
 extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef0bc02c3a70..30658e9827f0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq);
 #ifdef CONFIG_SMP
 cpumask_var_t irq_default_affinity;
 
-static int __irq_can_set_affinity(struct irq_desc *desc)
+static bool __irq_can_set_affinity(struct irq_desc *desc)
 {
 	if (!desc || !irqd_can_balance(&desc->irq_data) ||
 	    !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 /**
@@ -133,6 +133,21 @@ int irq_can_set_affinity(unsigned int irq)
 	return __irq_can_set_affinity(irq_to_desc(irq));
 }
 
+/**
+ * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space
+ * @irq:	Interrupt to check
+ *
+ * Like irq_can_set_affinity() above, but additionally checks for the
+ * AFFINITY_MANAGED flag.
+ */
+bool irq_can_set_affinity_usr(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	return __irq_can_set_affinity(desc) &&
+		!irqd_affinity_is_managed(&desc->irq_data);
+}
+
 /**
  *	irq_set_thread_affinity - Notify irq threads to adjust affinity
  *	@desc:		irq descriptor which has affitnity changed
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4e1b94726818..40bdcdc1f700 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
 	cpumask_var_t new_value;
 	int err;
 
-	if (!irq_can_set_affinity(irq) || no_irq_affinity)
+	if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
 		return -EIO;
 
 	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
-- 
cgit v1.3-8-gc7d7


From 06ee6d571f0e350253a8fc3492316b2be007fae2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 17:39:24 +0900
Subject: genirq: Add affinity hint to irq allocation

Add an extra argument to the irq(domain) allocation functions, so we can hand
down affinity hints to the allocator. Thats necessary to implement proper
support for multiqueue devices.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: linux-block@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Cc: axboe@fb.com
Cc: agordeev@redhat.com
Link: http://lkml.kernel.org/r/1467621574-8277-4-git-send-email-hch@lst.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/sparc/kernel/irq_64.c     |  2 +-
 arch/x86/kernel/apic/io_apic.c |  5 +++--
 include/linux/irq.h            |  4 ++--
 include/linux/irqdomain.h      |  9 ++++++---
 kernel/irq/ipi.c               |  2 +-
 kernel/irq/irqdesc.c           | 12 ++++++++----
 kernel/irq/irqdomain.c         | 22 ++++++++++++++--------
 kernel/irq/manage.c            |  7 ++++---
 kernel/irq/msi.c               |  3 ++-
 9 files changed, 41 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index e22416ce56ea..34a7930b76ef 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -242,7 +242,7 @@ unsigned int irq_alloc(unsigned int dev_handle, unsigned int dev_ino)
 {
 	int irq;
 
-	irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL);
+	irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL, NULL);
 	if (irq <= 0)
 		goto out;
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 446702ed99dc..7c4f90dd4c2a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -981,7 +981,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
 
 	return __irq_domain_alloc_irqs(domain, irq, 1,
 				       ioapic_alloc_attr_node(info),
-				       info, legacy);
+				       info, legacy, NULL);
 }
 
 /*
@@ -1014,7 +1014,8 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
 					  info->ioapic_pin))
 			return -ENOMEM;
 	} else {
-		irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true);
+		irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true,
+					      NULL);
 		if (irq >= 0) {
 			irq_data = irq_domain_get_irq_data(domain, irq);
 			data = irq_data->chip_data;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f6074813688d..39ce46ac5c18 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -708,11 +708,11 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-		struct module *owner);
+		      struct module *owner, const struct cpumask *affinity);
 
 /* use macros to avoid needing export.h for THIS_MODULE */
 #define irq_alloc_descs(irq, from, cnt, node)	\
-	__irq_alloc_descs(irq, from, cnt, node, THIS_MODULE)
+	__irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL)
 
 #define irq_alloc_desc(node)			\
 	irq_alloc_descs(-1, 0, 1, node)
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index f1f36e04d885..1aee0fbe900e 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -39,6 +39,7 @@ struct irq_domain;
 struct of_device_id;
 struct irq_chip;
 struct irq_data;
+struct cpumask;
 
 /* Number of irqs reserved for a legacy isa controller */
 #define NUM_ISA_INTERRUPTS	16
@@ -217,7 +218,8 @@ extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
 						   enum irq_domain_bus_token bus_token);
 extern void irq_set_default_host(struct irq_domain *host);
 extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
-				  irq_hw_number_t hwirq, int node);
+				  irq_hw_number_t hwirq, int node,
+				  const struct cpumask *affinity);
 
 static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
 {
@@ -389,7 +391,7 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par
 
 extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 				   unsigned int nr_irqs, int node, void *arg,
-				   bool realloc);
+				   bool realloc, const struct cpumask *affinity);
 extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs);
 extern void irq_domain_activate_irq(struct irq_data *irq_data);
 extern void irq_domain_deactivate_irq(struct irq_data *irq_data);
@@ -397,7 +399,8 @@ extern void irq_domain_deactivate_irq(struct irq_data *irq_data);
 static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
 			unsigned int nr_irqs, int node, void *arg)
 {
-	return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false);
+	return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false,
+				       NULL);
 }
 
 extern int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 89b49f6773f0..4fd23510d5f2 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -76,7 +76,7 @@ int irq_reserve_ipi(struct irq_domain *domain,
 		}
 	}
 
-	virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
+	virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);
 	if (virq <= 0) {
 		pr_warn("Can't reserve IPI, failed to alloc descs\n");
 		return -ENOMEM;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8731e1c5d1e7..b8df4fcdbb5f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -223,7 +223,7 @@ static void free_desc(unsigned int irq)
 }
 
 static int alloc_descs(unsigned int start, unsigned int cnt, int node,
-		       struct module *owner)
+		       const struct cpumask *affinity, struct module *owner)
 {
 	struct irq_desc *desc;
 	int i;
@@ -333,6 +333,7 @@ static void free_desc(unsigned int irq)
 }
 
 static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
+			      const struct cpumask *affinity,
 			      struct module *owner)
 {
 	u32 i;
@@ -453,12 +454,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
  * @cnt:	Number of consecutive irqs to allocate.
  * @node:	Preferred node on which the irq descriptor should be allocated
  * @owner:	Owning module (can be NULL)
+ * @affinity:	Optional pointer to an affinity mask which hints where the
+ *		irq descriptors should be allocated and which default
+ *		affinities to use
  *
  * Returns the first irq number or error code
  */
 int __ref
 __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-		  struct module *owner)
+		  struct module *owner, const struct cpumask *affinity)
 {
 	int start, ret;
 
@@ -494,7 +498,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
 
 	bitmap_set(allocated_irqs, start, cnt);
 	mutex_unlock(&sparse_irq_lock);
-	return alloc_descs(start, cnt, node, owner);
+	return alloc_descs(start, cnt, node, affinity, owner);
 
 err:
 	mutex_unlock(&sparse_irq_lock);
@@ -512,7 +516,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
  */
 unsigned int irq_alloc_hwirqs(int cnt, int node)
 {
-	int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
+	int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL);
 
 	if (irq < 0)
 		return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8798b6c9e945..79459b732dc9 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
 	}
 
 	/* Allocate a virtual interrupt number */
-	virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node));
+	virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
 	if (virq <= 0) {
 		pr_debug("-> virq allocation failed\n");
 		return 0;
@@ -835,19 +835,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
 EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
 
 int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
-			   int node)
+			   int node, const struct cpumask *affinity)
 {
 	unsigned int hint;
 
 	if (virq >= 0) {
-		virq = irq_alloc_descs(virq, virq, cnt, node);
+		virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
+					 affinity);
 	} else {
 		hint = hwirq % nr_irqs;
 		if (hint == 0)
 			hint++;
-		virq = irq_alloc_descs_from(hint, cnt, node);
-		if (virq <= 0 && hint > 1)
-			virq = irq_alloc_descs_from(1, cnt, node);
+		virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
+					 affinity);
+		if (virq <= 0 && hint > 1) {
+			virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE,
+						 affinity);
+		}
 	}
 
 	return virq;
@@ -1160,6 +1164,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
  * @node:	NUMA node id for memory allocation
  * @arg:	domain specific argument
  * @realloc:	IRQ descriptors have already been allocated if true
+ * @affinity:	Optional irq affinity mask for multiqueue devices
  *
  * Allocate IRQ numbers and initialized all data structures to support
  * hierarchy IRQ domains.
@@ -1175,7 +1180,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
  */
 int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 			    unsigned int nr_irqs, int node, void *arg,
-			    bool realloc)
+			    bool realloc, const struct cpumask *affinity)
 {
 	int i, ret, virq;
 
@@ -1193,7 +1198,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 	if (realloc && irq_base >= 0) {
 		virq = irq_base;
 	} else {
-		virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
+		virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node,
+					      affinity);
 		if (virq < 0) {
 			pr_debug("cannot allocate IRQ(base %d, count %d)\n",
 				 irq_base, nr_irqs);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 30658e9827f0..ad0aac6d1248 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -353,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
 		return 0;
 
 	/*
-	 * Preserve an userspace affinity setup, but make sure that
-	 * one of the targets is online.
+	 * Preserve the managed affinity setting and an userspace affinity
+	 * setup, but make sure that one of the targets is online.
 	 */
-	if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
+	if (irqd_affinity_is_managed(&desc->irq_data) ||
+	    irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
 		if (cpumask_intersects(desc->irq_common_data.affinity,
 				       cpu_online_mask))
 			set = desc->irq_common_data.affinity;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index eb5bf2b50b07..58dbbacc6fbb 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -334,7 +334,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		ops->set_desc(&arg, desc);
 
 		virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
-					       dev_to_node(dev), &arg, false);
+					       dev_to_node(dev), &arg, false,
+					       NULL);
 		if (virq < 0) {
 			ret = -ENOSPC;
 			if (ops->handle_error)
-- 
cgit v1.3-8-gc7d7


From 45ddcecbfa947f1dd8e8019bad9e90d6c9f2665c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 17:39:25 +0900
Subject: genirq: Use affinity hint in irqdesc allocation

Use the affinity hint in the irqdesc allocator. The hint is used to determine
the node for the allocation and to set the affinity of the interrupt.

If multiple interrupts are allocated (multi-MSI) then the allocator iterates
over the cpumask and for each set cpu it allocates on their node and sets the
initial affinity to that cpu.

If a single interrupt is allocated (MSI-X) then the allocator uses the first
cpu in the mask to compute the allocation node and uses the mask for the
initial affinity setting.

Interrupts set up this way are marked with the AFFINITY_MANAGED flag to
prevent userspace from messing with their affinity settings.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: linux-block@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Cc: axboe@fb.com
Cc: agordeev@redhat.com
Link: http://lkml.kernel.org/r/1467621574-8277-5-git-send-email-hch@lst.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdesc.c | 51 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index b8df4fcdbb5f..a623b44f2d4b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
 	return 0;
 }
 
-static void desc_smp_init(struct irq_desc *desc, int node)
+static void desc_smp_init(struct irq_desc *desc, int node,
+			  const struct cpumask *affinity)
 {
-	cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity);
+	if (!affinity)
+		affinity = irq_default_affinity;
+	cpumask_copy(desc->irq_common_data.affinity, affinity);
+
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_clear(desc->pending_mask);
 #endif
@@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node)
 #else
 static inline int
 alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
-static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+static inline void
+desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
 #endif
 
 static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
-		struct module *owner)
+			      const struct cpumask *affinity, struct module *owner)
 {
 	int cpu;
 
@@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 	desc->owner = owner;
 	for_each_possible_cpu(cpu)
 		*per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
-	desc_smp_init(desc, node);
+	desc_smp_init(desc, node, affinity);
 }
 
 int nr_irqs = NR_IRQS;
@@ -158,7 +163,9 @@ void irq_unlock_sparse(void)
 	mutex_unlock(&sparse_irq_lock);
 }
 
-static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
+static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
+				   const struct cpumask *affinity,
+				   struct module *owner)
 {
 	struct irq_desc *desc;
 	gfp_t gfp = GFP_KERNEL;
@@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_rcu_head(&desc->rcu);
 
-	desc_set_defaults(irq, desc, node, owner);
+	desc_set_defaults(irq, desc, node, affinity, owner);
+	irqd_set(&desc->irq_data, flags);
 
 	return desc;
 
@@ -225,11 +233,30 @@ static void free_desc(unsigned int irq)
 static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 		       const struct cpumask *affinity, struct module *owner)
 {
+	const struct cpumask *mask = NULL;
 	struct irq_desc *desc;
-	int i;
+	unsigned int flags;
+	int i, cpu = -1;
+
+	if (affinity && cpumask_empty(affinity))
+		return -EINVAL;
+
+	flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
 
 	for (i = 0; i < cnt; i++) {
-		desc = alloc_desc(start + i, node, owner);
+		if (affinity) {
+			cpu = cpumask_next(cpu, affinity);
+			if (cpu >= nr_cpu_ids)
+				cpu = cpumask_first(affinity);
+			node = cpu_to_node(cpu);
+
+			/*
+			 * For single allocations we use the caller provided
+			 * mask otherwise we use the mask of the target cpu
+			 */
+			mask = cnt == 1 ? affinity : cpumask_of(cpu);
+		}
+		desc = alloc_desc(start + i, node, flags, mask, owner);
 		if (!desc)
 			goto err;
 		mutex_lock(&sparse_irq_lock);
@@ -277,7 +304,7 @@ int __init early_irq_init(void)
 		nr_irqs = initcnt;
 
 	for (i = 0; i < initcnt; i++) {
-		desc = alloc_desc(i, node, NULL);
+		desc = alloc_desc(i, node, 0, NULL, NULL);
 		set_bit(i, allocated_irqs);
 		irq_insert_desc(i, desc);
 	}
@@ -311,7 +338,7 @@ int __init early_irq_init(void)
 		alloc_masks(&desc[i], GFP_KERNEL, node);
 		raw_spin_lock_init(&desc[i].lock);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-		desc_set_defaults(i, &desc[i], node, NULL);
+		desc_set_defaults(i, &desc[i], node, NULL, NULL);
 	}
 	return arch_early_irq_init();
 }
@@ -328,7 +355,7 @@ static void free_desc(unsigned int irq)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
-	desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL);
+	desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 
-- 
cgit v1.3-8-gc7d7


From 0972fa57f53525ffa6ced12d703750fc2791e3ce Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 17:39:26 +0900
Subject: genirq/msi: Make use of affinity aware allocations

Allow the MSI code to provide affinity hints per MSI descriptor.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: linux-block@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Cc: axboe@fb.com
Cc: agordeev@redhat.com
Link: http://lkml.kernel.org/r/1467621574-8277-6-git-send-email-hch@lst.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/msi.h | 2 ++
 kernel/irq/msi.c    | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index c33abfa0f5a7..4f0bfe5912b2 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -47,6 +47,7 @@ struct fsl_mc_msi_desc {
  * @nvec_used:	The number of vectors used
  * @dev:	Pointer to the device which uses this descriptor
  * @msg:	The last set MSI message cached for reuse
+ * @affinity:	Optional pointer to a cpu affinity mask for this descriptor
  *
  * @masked:	[PCI MSI/X] Mask bits
  * @is_msix:	[PCI MSI/X] True if MSI-X
@@ -67,6 +68,7 @@ struct msi_desc {
 	unsigned int			nvec_used;
 	struct device			*dev;
 	struct msi_msg			msg;
+	const struct cpumask		*affinity;
 
 	union {
 		/* PCI MSI/X specific data */
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 58dbbacc6fbb..0e2a736b14a7 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -335,7 +335,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 
 		virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
 					       dev_to_node(dev), &arg, false,
-					       NULL);
+					       desc->affinity);
 		if (virq < 0) {
 			ret = -ENOSPC;
 			if (ops->handle_error)
-- 
cgit v1.3-8-gc7d7


From 5e385a6ef31fbbf2acbda770aecc2bc2ff933d17 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 4 Jul 2016 17:39:27 +0900
Subject: genirq: Add a helper to spread an affinity mask for MSI/MSI-X vectors

This is lifted from the blk-mq code and adopted to use the affinity mask
concept just introduced in the irq handling code.  It tries to keep the
algorithm the same as the one current used by blk-mq, but improvements
like assining vectors on a per-node basis instead of just per sibling
are possible with this simple move and refactoring.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: linux-block@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Cc: axboe@fb.com
Cc: agordeev@redhat.com
Link: http://lkml.kernel.org/r/1467621574-8277-7-git-send-email-hch@lst.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |  8 +++++++
 kernel/irq/Makefile       |  1 +
 kernel/irq/affinity.c     | 61 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)
 create mode 100644 kernel/irq/affinity.c

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9fcabeb07787..b6683f0ffc9f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -278,6 +278,8 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
 extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
+struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs);
+
 #else /* CONFIG_SMP */
 
 static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
@@ -308,6 +310,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 {
 	return 0;
 }
+
+static inline struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
+{
+	*nr_vecs = 1;
+	return NULL;
+}
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2ee42e95a3ce..1d3ee3169202 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
 obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_SMP) += affinity.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 000000000000..f68959341c0f
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,61 @@
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+
+static int get_first_sibling(unsigned int cpu)
+{
+	unsigned int ret;
+
+	ret = cpumask_first(topology_sibling_cpumask(cpu));
+	if (ret < nr_cpu_ids)
+		return ret;
+	return cpu;
+}
+
+/*
+ * Take a map of online CPUs and the number of available interrupt vectors
+ * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
+ * so that they are distributed as good as possible around the CPUs.  If
+ * more vectors than CPUs are available we'll map one to each CPU,
+ * otherwise we map one to the first sibling of each socket.
+ *
+ * If there are more vectors than CPUs we will still only have one bit
+ * set per CPU, but interrupt code will keep on assigning the vectors from
+ * the start of the bitmap until we run out of vectors.
+ */
+struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
+{
+	struct cpumask *affinity_mask;
+	unsigned int max_vecs = *nr_vecs;
+
+	if (max_vecs == 1)
+		return NULL;
+
+	affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
+	if (!affinity_mask) {
+		*nr_vecs = 1;
+		return NULL;
+	}
+
+	if (max_vecs >= num_online_cpus()) {
+		cpumask_copy(affinity_mask, cpu_online_mask);
+		*nr_vecs = num_online_cpus();
+	} else {
+		unsigned int vecs = 0, cpu;
+
+		for_each_online_cpu(cpu) {
+			if (cpu == get_first_sibling(cpu)) {
+				cpumask_set_cpu(cpu, affinity_mask);
+				vecs++;
+			}
+
+			if (--max_vecs == 0)
+				break;
+		}
+		*nr_vecs = vecs;
+	}
+
+	return affinity_mask;
+}
-- 
cgit v1.3-8-gc7d7


From 4364e1a29be16b2783c0bcbc263f61236af64281 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 15:32:25 +0200
Subject: genirq/msi: Fix broken debug output

virq is not required to be the same for all msi descs. Use the base irq number
from the desc in the debug printk.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/msi.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 0e2a736b14a7..54999350162c 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -353,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		ops->msi_finish(&arg, 0);
 
 	for_each_msi_entry(desc, dev) {
+		virq = desc->irq;
 		if (desc->nvec_used == 1)
 			dev_dbg(dev, "irq %d for MSI\n", virq);
 		else
-- 
cgit v1.3-8-gc7d7


From 501c2375253c0795048f48368e0b3e8b2f6646dc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 5 Jul 2016 10:04:34 -0400
Subject: ftrace: Move toplevel init out of ftrace_init_tracefs()

Commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events
do") placed ftrace_init_tracefs into the instance creation, and encapsulated
the top level updating with an if conditional, as the top level only gets
updated at boot up. Unfortunately, this triggers section mismatch errors as
the init functions are called from a function that can be called later, and
the section mismatch logic is unaware of the if conditional that would
prevent it from happening at run time.

To make everyone happy, create a separate ftrace_init_tracefs_toplevel()
routine that only gets called by init functions, and this will be what calls
other init functions for the toplevel directory.

Link: http://lkml.kernel.org/r/20160704102139.19cbc0d9@gandalf.local.home

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Reported-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do")
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 16 ++++++++++------
 kernel/trace/trace.c  |  1 +
 kernel/trace/trace.h  |  3 +++
 3 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8b488f4dd8e8..84752c8e28b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5539,16 +5539,20 @@ static const struct file_operations ftrace_pid_fops = {
 
 void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
-	/* Only the top level directory has the dyn_tracefs and profile */
-	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
-		ftrace_init_dyn_tracefs(d_tracer);
-		ftrace_profile_tracefs(d_tracer);
-	}
-
 	trace_create_file("set_ftrace_pid", 0644, d_tracer,
 			    tr, &ftrace_pid_fops);
 }
 
+void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
+					 struct dentry *d_tracer)
+{
+	/* Only the top level directory has the dyn_tracefs and profile */
+	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
+
+	ftrace_init_dyn_tracefs(d_tracer);
+	ftrace_profile_tracefs(d_tracer);
+}
+
 /**
  * ftrace_kill - kill ftrace
  *
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3d9f31b576f3..5fd53a7847bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7369,6 +7369,7 @@ static __init int tracer_init_tracefs(void)
 		return 0;
 
 	init_tracer_tracefs(&global_trace, d_tracer);
+	ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
 
 	trace_create_file("tracing_thresh", 0644, d_tracer,
 			&global_trace, &tracing_thresh_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index eaee458755a4..c1de3f493cd3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -857,6 +857,8 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
 void ftrace_reset_array_ops(struct trace_array *tr);
 int using_ftrace_ops_list_func(void);
 void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+void ftrace_init_tracefs_toplevel(struct trace_array *tr,
+				  struct dentry *d_tracer);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -874,6 +876,7 @@ static inline __init void
 ftrace_init_global_array_ops(struct trace_array *tr) { }
 static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
 static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
-- 
cgit v1.3-8-gc7d7


From 5130213721d01b6632c255d4295a8102cbb58379 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@marvell.com>
Date: Tue, 5 Jul 2016 16:57:51 +0800
Subject: tick/broadcast-hrtimer: Set name of the ce_broadcast_hrtimer

This is to avoid the "null" name when we either

~ # cat /sys/devices/system/clockevents/broadcast/current_device
(null)

or

~ # cat /proc/timer_list
...
Tick Device: mode:     1
Broadcast device
Clock Event Device: (null)
...

Signed-off-by: Jisheng Zhang <jszhang@marvell.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1467709071-3667-1-git-send-email-jszhang@marvell.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast-hrtimer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 53d7184da0be..690b797f522e 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
 }
 
 static struct clock_event_device ce_broadcast_hrtimer = {
+	.name			= "bc_hrtimer",
 	.set_state_shutdown	= bc_shutdown,
 	.set_next_ktime		= bc_set_next,
 	.features		= CLOCK_EVT_FEAT_ONESHOT |
-- 
cgit v1.3-8-gc7d7


From 67f20b084574def586ecba68508acd5d054ccc88 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 4 Jul 2016 15:10:04 +0000
Subject: tracing: Using for_each_set_bit() to simplify trace_pid_write()

Using for_each_set_bit() to simplify the code.

Link: http://lkml.kernel.org/r/1467645004-11169-1-git-send-email-weiyj_lk@163.com

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5fd53a7847bc..dade4c9559cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -517,13 +517,9 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
 
 	if (filtered_pids) {
 		/* copy the current bits to the new max */
-		pid = find_first_bit(filtered_pids->pids,
-				     filtered_pids->pid_max);
-		while (pid < filtered_pids->pid_max) {
+		for_each_set_bit(pid, filtered_pids->pids,
+				 filtered_pids->pid_max) {
 			set_bit(pid, pid_list->pids);
-			pid = find_next_bit(filtered_pids->pids,
-					    filtered_pids->pid_max,
-					    pid + 1);
 			nr_pids++;
 		}
 	}
-- 
cgit v1.3-8-gc7d7


From 7ad8fb61c4abf589596f0a4da34d987471481569 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Sun, 3 Jul 2016 08:51:34 -0500
Subject: tracing: Have HIST_TRIGGERS select TRACING

The kbuild test robot reported a compile error if HIST_TRIGGERS was
enabled but nothing else that selected TRACING was configured in.

HIST_TRIGGERS should directly select it and not rely on anything else
to do it.

Link: http://lkml.kernel.org/r/57791866.8080505@linux.intel.com

Reported-by: kbuild test robot <fennguang.wu@intel.com>
Fixes: 7ef224d1d0e3a ("tracing: Add 'hist' event trigger command")
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fafeaf803bd0..f4b86e8ca1e7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -542,6 +542,7 @@ config HIST_TRIGGERS
 	bool "Histogram triggers"
 	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select TRACING_MAP
+	select TRACING
 	default n
 	help
 	  Hist triggers allow one or more arbitrary trace event fields
-- 
cgit v1.3-8-gc7d7


From a4a551b8f1d4c4ebffd0f49dfef44df3128546f8 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 29 Jun 2016 19:56:48 +0900
Subject: ftrace: Reduce size of function graph entries

Currently ftrace_graph_ent{,_entry} and ftrace_graph_ret{,_entry} struct
can have padding bytes at the end due to alignment in 64-bit data type.
As these data are recorded so frequently, those paddings waste
non-negligible space.  As the ring buffer maintains alignment properly
for each architecture, just to remove the extra padding using 'packed'
attribute.

  ftrace_graph_ent_entry:  24 -> 20
  ftrace_graph_ret_entry:  48 -> 44

Also I moved the 'overrun' field in struct ftrace_graph_ret to minimize
the padding in the middle.

Tested on x86_64 only.

Link: http://lkml.kernel.org/r/1467197808-13578-1-git-send-email-namhyung@kernel.org

Cc: Ingo Molnar <mingo@kernel.org>
Cc: linux-arch@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h       | 12 ++++++++----
 kernel/trace/trace.h         | 11 +++++++++++
 kernel/trace/trace_entries.h |  4 ++--
 3 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 66a36a815f0a..7d565afe35d2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -754,23 +754,27 @@ static inline void ftrace_init(void) { }
 
 /*
  * Structure that defines an entry function trace.
+ * It's already packed but the attribute "packed" is needed
+ * to remove extra padding at the end.
  */
 struct ftrace_graph_ent {
 	unsigned long func; /* Current function */
 	int depth;
-};
+} __packed;
 
 /*
  * Structure that defines a return function trace.
+ * It's already packed but the attribute "packed" is needed
+ * to remove extra padding at the end.
  */
 struct ftrace_graph_ret {
 	unsigned long func; /* Current function */
-	unsigned long long calltime;
-	unsigned long long rettime;
 	/* Number of functions that overran the depth limit for current task */
 	unsigned long overrun;
+	unsigned long long calltime;
+	unsigned long long rettime;
 	int depth;
-};
+} __packed;
 
 /* Type of the callback handlers for tracing function graph*/
 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c1de3f493cd3..f783df416726 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -80,6 +80,12 @@ enum trace_type {
 	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
 		     filter)
 
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print,	\
+			    filter)					\
+	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+		     filter) __packed
+
 #include "trace_entries.h"
 
 /*
@@ -1625,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter)	\
 	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
 		     filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
+	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+		     filter)
+
 #include "trace_entries.h"
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a4810a..5c30efcda5e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
 );
 
 /* Function call entry */
-FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
 
 	TRACE_GRAPH_ENT,
 
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
 );
 
 /* Function return entry */
-FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
 
 	TRACE_GRAPH_RET,
 
-- 
cgit v1.3-8-gc7d7


From ecb23dc6f2eff0ce64dd60351a81f376f13b12cc Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 20 May 2016 09:26:48 +0200
Subject: xen: add steal_clock support on x86

The pv_time_ops structure contains a function pointer for the
"steal_clock" functionality used only by KVM and Xen on ARM. Xen on x86
uses its own mechanism to account for the "stolen" time a thread wasn't
able to run due to hypervisor scheduling.

Add support in Xen arch independent time handling for this feature by
moving it out of the arm arch into drivers/xen and remove the x86 Xen
hack.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/arm/xen/enlighten.c    | 18 ++----------------
 arch/x86/xen/time.c         | 44 ++------------------------------------------
 drivers/xen/time.c          | 20 ++++++++++++++++++++
 include/linux/kernel_stat.h |  1 -
 include/xen/xen-ops.h       |  1 +
 kernel/sched/cputime.c      | 10 ----------
 6 files changed, 25 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index d4f36eba3499..47acb3613f40 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -12,7 +12,6 @@
 #include <xen/page.h>
 #include <xen/interface/sched.h>
 #include <xen/xen-ops.h>
-#include <asm/paravirt.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/xen-ops.h>
@@ -86,19 +85,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
 
-static unsigned long long xen_stolen_accounting(int cpu)
-{
-	struct vcpu_runstate_info state;
-
-	BUG_ON(cpu != smp_processor_id());
-
-	xen_get_runstate_snapshot(&state);
-
-	WARN_ON(state.state != RUNSTATE_running);
-
-	return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline];
-}
-
 static void xen_read_wallclock(struct timespec64 *ts)
 {
 	u32 version;
@@ -432,8 +418,8 @@ static int __init xen_guest_init(void)
 
 	register_cpu_notifier(&xen_cpu_notifier);
 
-	pv_time_ops.steal_clock = xen_stolen_accounting;
-	static_key_slow_inc(&paravirt_steal_enabled);
+	xen_time_setup_guest();
+
 	if (xen_initial_domain())
 		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 6deba5bc7e34..c31006f0f37f 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -11,8 +11,6 @@
 #include <linux/interrupt.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
-#include <linux/kernel_stat.h>
-#include <linux/math64.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/pvclock_gtod.h>
@@ -31,44 +29,6 @@
 
 /* Xen may fire a timer up to this many ns early */
 #define TIMER_SLOP	100000
-#define NS_PER_TICK	(1000000000LL / HZ)
-
-/* snapshots of runstate info */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
-
-/* unused ns of stolen time */
-static DEFINE_PER_CPU(u64, xen_residual_stolen);
-
-static void do_stolen_accounting(void)
-{
-	struct vcpu_runstate_info state;
-	struct vcpu_runstate_info *snap;
-	s64 runnable, offline, stolen;
-	cputime_t ticks;
-
-	xen_get_runstate_snapshot(&state);
-
-	WARN_ON(state.state != RUNSTATE_running);
-
-	snap = this_cpu_ptr(&xen_runstate_snapshot);
-
-	/* work out how much time the VCPU has not been runn*ing*  */
-	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
-	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
-
-	*snap = state;
-
-	/* Add the appropriate number of ticks of stolen time,
-	   including any left-overs from last time. */
-	stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
-
-	if (stolen < 0)
-		stolen = 0;
-
-	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
-	__this_cpu_write(xen_residual_stolen, stolen);
-	account_steal_ticks(ticks);
-}
 
 /* Get the TSC speed from Xen */
 static unsigned long xen_tsc_khz(void)
@@ -335,8 +295,6 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 		ret = IRQ_HANDLED;
 	}
 
-	do_stolen_accounting();
-
 	return ret;
 }
 
@@ -431,6 +389,8 @@ static void __init xen_time_init(void)
 	xen_setup_timer(cpu);
 	xen_setup_cpu_clockevents();
 
+	xen_time_setup_guest();
+
 	if (xen_initial_domain())
 		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
 }
diff --git a/drivers/xen/time.c b/drivers/xen/time.c
index 71078425c9ea..2257b6663766 100644
--- a/drivers/xen/time.c
+++ b/drivers/xen/time.c
@@ -6,6 +6,7 @@
 #include <linux/math64.h>
 #include <linux/gfp.h>
 
+#include <asm/paravirt.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
@@ -75,6 +76,15 @@ bool xen_vcpu_stolen(int vcpu)
 	return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
 }
 
+static u64 xen_steal_clock(int cpu)
+{
+	struct vcpu_runstate_info state;
+
+	BUG_ON(cpu != smp_processor_id());
+	xen_get_runstate_snapshot(&state);
+	return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline];
+}
+
 void xen_setup_runstate_info(int cpu)
 {
 	struct vcpu_register_runstate_memory_area area;
@@ -86,3 +96,13 @@ void xen_setup_runstate_info(int cpu)
 		BUG();
 }
 
+void __init xen_time_setup_guest(void)
+{
+	pv_time_ops.steal_clock = xen_steal_clock;
+
+	static_key_slow_inc(&paravirt_steal_enabled);
+	/*
+	 * We can't set paravirt_steal_rq_enabled as this would require the
+	 * capability to read another cpu's runstate info.
+	 */
+}
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 25a822f6f000..44fda64ad434 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -92,7 +92,6 @@ static inline void account_process_tick(struct task_struct *tsk, int user)
 extern void account_process_tick(struct task_struct *, int user);
 #endif
 
-extern void account_steal_ticks(unsigned long ticks);
 extern void account_idle_ticks(unsigned long ticks);
 
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 3491582bf50a..355275bad2cf 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -21,6 +21,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
 
 bool xen_vcpu_stolen(int vcpu);
 void xen_setup_runstate_info(int cpu);
+void xen_time_setup_guest(void);
 void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
 
 int xen_setup_shutdown_event(void);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..8c4c6dcc052c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -489,16 +489,6 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		account_idle_time(cputime_one_jiffy);
 }
 
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
-	account_steal_time(jiffies_to_cputime(ticks));
-}
-
 /*
  * Account multiple ticks of idle time.
  * @ticks: number of stolen ticks
-- 
cgit v1.3-8-gc7d7


From 885885f6b88d22f81e67ee6a61561e480b27d27a Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Fri, 17 Jun 2016 17:19:40 +0000
Subject: locking/static_keys: Fix non static symbol Sparse warning

Fix the following sparse warnings:

  kernel/jump_label.c:473:23: warning:
   symbol 'jump_label_module_nb' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1466183980-8903-1-git-send-email-weiyj_lk@163.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/jump_label.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 05254eeb4b4e..ac4ab953b49c 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -422,7 +422,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
 	return notifier_from_errno(ret);
 }
 
-struct notifier_block jump_label_module_nb = {
+static struct notifier_block jump_label_module_nb = {
 	.notifier_call = jump_label_module_notify,
 	.priority = 1, /* higher than tracepoints */
 };
-- 
cgit v1.3-8-gc7d7


From e675447bda51c1ea72d1ac9132ce3bed974f1da3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:15 +0000
Subject: timers: Make 'pinned' a timer property

We want to move the timer migration logic from a 'push' to a 'pull' model.

Under the current 'push' model pinned timers are handled via
a runtime API variant: mod_timer_pinned().

The 'pull' model requires us to store the pinned attribute of a timer
in the timer_list structure itself, as a new TIMER_PINNED bit in
timer->flags.

This flag must be set at initialization time and the timer APIs
recognize the flag.

This patch:

 - Implements the new flag and associated new-style initialization
   methods

 - makes mod_timer() recognize new-style pinned timers,

 - and adds some migration helper facility to allow
   step by step conversion of old-style to new-style
   pinned timers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094341.049338558@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/timer.h | 25 ++++++++++++++++++++++---
 kernel/time/timer.c   | 10 +++++-----
 2 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 20ac746f3eb3..046d6cf26498 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -62,7 +62,8 @@ struct timer_list {
 #define TIMER_MIGRATING		0x00080000
 #define TIMER_BASEMASK		(TIMER_CPUMASK | TIMER_MIGRATING)
 #define TIMER_DEFERRABLE	0x00100000
-#define TIMER_IRQSAFE		0x00200000
+#define TIMER_PINNED		0x00200000
+#define TIMER_IRQSAFE		0x00400000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
@@ -78,9 +79,15 @@ struct timer_list {
 #define TIMER_INITIALIZER(_function, _expires, _data)		\
 	__TIMER_INITIALIZER((_function), (_expires), (_data), 0)
 
+#define TIMER_PINNED_INITIALIZER(_function, _expires, _data)	\
+	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED)
+
 #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data)	\
 	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE)
 
+#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data)	\
+	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED)
+
 #define DEFINE_TIMER(_name, _function, _expires, _data)		\
 	struct timer_list _name =				\
 		TIMER_INITIALIZER(_function, _expires, _data)
@@ -124,8 +131,12 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 
 #define init_timer(timer)						\
 	__init_timer((timer), 0)
+#define init_timer_pinned(timer)					\
+	__init_timer((timer), TIMER_PINNED)
 #define init_timer_deferrable(timer)					\
 	__init_timer((timer), TIMER_DEFERRABLE)
+#define init_timer_pinned_deferrable(timer)				\
+	__init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED)
 #define init_timer_on_stack(timer)					\
 	__init_timer_on_stack((timer), 0)
 
@@ -145,12 +156,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 
 #define setup_timer(timer, fn, data)					\
 	__setup_timer((timer), (fn), (data), 0)
+#define setup_pinned_timer(timer, fn, data)				\
+	__setup_timer((timer), (fn), (data), TIMER_PINNED)
 #define setup_deferrable_timer(timer, fn, data)				\
 	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE)
+#define setup_pinned_deferrable_timer(timer, fn, data)			\
+	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED)
 #define setup_timer_on_stack(timer, fn, data)				\
 	__setup_timer_on_stack((timer), (fn), (data), 0)
+#define setup_pinned_timer_on_stack(timer, fn, data)			\
+	__setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED)
 #define setup_deferrable_timer_on_stack(timer, fn, data)		\
 	__setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE)
+#define setup_pinned_deferrable_timer_on_stack(timer, fn, data)		\
+	__setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED)
 
 /**
  * timer_pending - is a timer pending?
@@ -175,8 +194,8 @@ extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires);
 
 extern void set_timer_slack(struct timer_list *time, int slack_hz);
 
-#define TIMER_NOT_PINNED	0
-#define TIMER_PINNED		1
+#define MOD_TIMER_NOT_PINNED	0
+#define MOD_TIMER_PINNED	1
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3a95f9728778..693f6d14058e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -782,7 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, pinned);
+	new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED);
 
 	if (base != new_base) {
 		/*
@@ -825,7 +825,7 @@ out_unlock:
  */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
@@ -900,7 +900,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 	if (timer_pending(timer) && timer->expires == expires)
 		return 1;
 
-	return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer);
 
@@ -928,7 +928,7 @@ int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
 	if (timer->expires == expires && timer_pending(timer))
 		return 1;
 
-	return __mod_timer(timer, expires, false, TIMER_PINNED);
+	return __mod_timer(timer, expires, false, MOD_TIMER_PINNED);
 }
 EXPORT_SYMBOL(mod_timer_pinned);
 
@@ -1512,7 +1512,7 @@ signed long __sched schedule_timeout(signed long timeout)
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
+	__mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 
-- 
cgit v1.3-8-gc7d7


From 177ec0a0a531695210b277d734b2f92ee5796303 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:24 +0000
Subject: timers: Remove the deprecated mod_timer_pinned() API

We switched all users to initialize the timers as pinned and call
mod_timer(). Remove the now unused timer API function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094341.706205231@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/timer.h |  3 ---
 kernel/time/timer.c   | 39 +++++----------------------------------
 2 files changed, 5 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 046d6cf26498..a8f6c70eb414 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -190,12 +190,9 @@ extern void add_timer_on(struct timer_list *timer, int cpu);
 extern int del_timer(struct timer_list * timer);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
-extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires);
 
 extern void set_timer_slack(struct timer_list *time, int slack_hz);
 
-#define MOD_TIMER_NOT_PINNED	0
-#define MOD_TIMER_PINNED	1
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 693f6d14058e..ba49c1cf80f5 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -764,8 +764,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 }
 
 static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires,
-	    bool pending_only, int pinned)
+__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
@@ -782,7 +781,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED);
+	new_base = get_target_base(base, timer->flags & TIMER_PINNED);
 
 	if (base != new_base) {
 		/*
@@ -825,7 +824,7 @@ out_unlock:
  */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, true);
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
@@ -900,38 +899,10 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 	if (timer_pending(timer) && timer->expires == expires)
 		return 1;
 
-	return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, false);
 }
 EXPORT_SYMBOL(mod_timer);
 
-/**
- * mod_timer_pinned - modify a timer's timeout
- * @timer: the timer to be modified
- * @expires: new timeout in jiffies
- *
- * mod_timer_pinned() is a way to update the expire field of an
- * active timer (if the timer is inactive it will be activated)
- * and to ensure that the timer is scheduled on the current CPU.
- *
- * Note that this does not prevent the timer from being migrated
- * when the current CPU goes offline.  If this is a problem for
- * you, use CPU-hotplug notifiers to handle it correctly, for
- * example, cancelling the timer when the corresponding CPU goes
- * offline.
- *
- * mod_timer_pinned(timer, expires) is equivalent to:
- *
- *     del_timer(timer); timer->expires = expires; add_timer(timer);
- */
-int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
-{
-	if (timer->expires == expires && timer_pending(timer))
-		return 1;
-
-	return __mod_timer(timer, expires, false, MOD_TIMER_PINNED);
-}
-EXPORT_SYMBOL(mod_timer_pinned);
-
 /**
  * add_timer - start a timer
  * @timer: the timer to be added
@@ -1512,7 +1483,7 @@ signed long __sched schedule_timeout(signed long timeout)
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED);
+	__mod_timer(&timer, expire, false);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 
-- 
cgit v1.3-8-gc7d7


From 2b1ecc3d1a6b10f8fbac7f83d80db30b5a2c2791 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:25 +0000
Subject: signals: Use hrtimer for sigtimedwait()

We've converted most timeout related syscalls to hrtimers, but
sigtimedwait() did not get this treatment.

Convert it so we get a reasonable accuracy and remove the
user space exposure to the timer wheel properties.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Cyril Hrubis <chrubis@suse.cz>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094341.787164909@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/signal.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 96e9bc40667f..af21afc00d08 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
  *  @ts: upper bound on process time suspension
  */
 int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
-			const struct timespec *ts)
+		    const struct timespec *ts)
 {
+	ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };
 	struct task_struct *tsk = current;
-	long timeout = MAX_SCHEDULE_TIMEOUT;
 	sigset_t mask = *which;
-	int sig;
+	int sig, ret = 0;
 
 	if (ts) {
 		if (!timespec_valid(ts))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(ts);
-		/*
-		 * We can be close to the next tick, add another one
-		 * to ensure we will wait at least the time asked for.
-		 */
-		if (ts->tv_sec || ts->tv_nsec)
-			timeout++;
+		timeout = timespec_to_ktime(*ts);
+		to = &timeout;
 	}
 
 	/*
@@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	sig = dequeue_signal(tsk, &mask, info);
-	if (!sig && timeout) {
+	if (!sig && timeout.tv64) {
 		/*
 		 * None ready, temporarily unblock those we're interested
 		 * while we are sleeping in so that we'll be awakened when
@@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
 		recalc_sigpending();
 		spin_unlock_irq(&tsk->sighand->siglock);
 
-		timeout = freezable_schedule_timeout_interruptible(timeout);
-
+		__set_current_state(TASK_INTERRUPTIBLE);
+		ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
+							 HRTIMER_MODE_REL);
 		spin_lock_irq(&tsk->sighand->siglock);
 		__set_task_blocked(tsk, &tsk->real_blocked);
 		sigemptyset(&tsk->real_blocked);
@@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
 
 	if (sig)
 		return sig;
-	return timeout ? -EINTR : -EAGAIN;
+	return ret ? -EINTR : -EAGAIN;
 }
 
 /**
-- 
cgit v1.3-8-gc7d7


From 494af3ed7848de08640d98ee5aff57a45c137c3c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:28 +0000
Subject: timers: Give a few structs and members proper names

Some of the names in the internal implementation of the timer code
are not longer correct and others are simply too long to type.

Clean it up before we switch the wheel implementation over to
the new scheme.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094341.948752516@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/timer.c | 118 ++++++++++++++++++++++++++--------------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ba49c1cf80f5..f259a3ef4577 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -77,10 +77,10 @@ struct tvec_root {
 	struct hlist_head vec[TVR_SIZE];
 };
 
-struct tvec_base {
+struct timer_base {
 	spinlock_t lock;
 	struct timer_list *running_timer;
-	unsigned long timer_jiffies;
+	unsigned long clk;
 	unsigned long next_timer;
 	unsigned long active_timers;
 	unsigned long all_timers;
@@ -95,7 +95,7 @@ struct tvec_base {
 } ____cacheline_aligned;
 
 
-static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 unsigned int sysctl_timer_migration = 1;
@@ -106,15 +106,15 @@ void timers_update_migration(bool update_nohz)
 	unsigned int cpu;
 
 	/* Avoid the loop, if nothing to update */
-	if (this_cpu_read(tvec_bases.migration_enabled) == on)
+	if (this_cpu_read(timer_bases.migration_enabled) == on)
 		return;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(tvec_bases.migration_enabled, cpu) = on;
+		per_cpu(timer_bases.migration_enabled, cpu) = on;
 		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
 		if (!update_nohz)
 			continue;
-		per_cpu(tvec_bases.nohz_active, cpu) = true;
+		per_cpu(timer_bases.nohz_active, cpu) = true;
 		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
 	}
 }
@@ -134,18 +134,18 @@ int timer_migration_handler(struct ctl_table *table, int write,
 	return ret;
 }
 
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
+static inline struct timer_base *get_target_base(struct timer_base *base,
 						int pinned)
 {
 	if (pinned || !base->migration_enabled)
-		return this_cpu_ptr(&tvec_bases);
-	return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
+		return this_cpu_ptr(&timer_bases);
+	return per_cpu_ptr(&timer_bases, get_nohz_timer_target());
 }
 #else
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
+static inline struct timer_base *get_target_base(struct timer_base *base,
 						int pinned)
 {
-	return this_cpu_ptr(&tvec_bases);
+	return this_cpu_ptr(&timer_bases);
 }
 #endif
 
@@ -371,10 +371,10 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
 static void
-__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+__internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
-	unsigned long idx = expires - base->timer_jiffies;
+	unsigned long idx = expires - base->clk;
 	struct hlist_head *vec;
 
 	if (idx < TVR_SIZE) {
@@ -394,7 +394,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 		 * Can happen if you add a timer with expires == jiffies,
 		 * or you set a timer to go off in the past
 		 */
-		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+		vec = base->tv1.vec + (base->clk & TVR_MASK);
 	} else {
 		int i;
 		/* If the timeout is larger than MAX_TVAL (on 64-bit
@@ -403,7 +403,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 		 */
 		if (idx > MAX_TVAL) {
 			idx = MAX_TVAL;
-			expires = idx + base->timer_jiffies;
+			expires = idx + base->clk;
 		}
 		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 		vec = base->tv5.vec + i;
@@ -412,11 +412,11 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	hlist_add_head(&timer->entry, vec);
 }
 
-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	/* Advance base->jiffies, if the base is empty */
 	if (!base->all_timers++)
-		base->timer_jiffies = jiffies;
+		base->clk = jiffies;
 
 	__internal_add_timer(base, timer);
 	/*
@@ -707,7 +707,7 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
 }
 
 static inline void
-detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
+detach_expired_timer(struct timer_list *timer, struct timer_base *base)
 {
 	detach_timer(timer, true);
 	if (!(timer->flags & TIMER_DEFERRABLE))
@@ -715,7 +715,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
 	base->all_timers--;
 }
 
-static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 			     bool clear_pending)
 {
 	if (!timer_pending(timer))
@@ -725,16 +725,16 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
 	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		base->active_timers--;
 		if (timer->expires == base->next_timer)
-			base->next_timer = base->timer_jiffies;
+			base->next_timer = base->clk;
 	}
 	/* If this was the last timer, advance base->jiffies */
 	if (!--base->all_timers)
-		base->timer_jiffies = jiffies;
+		base->clk = jiffies;
 	return 1;
 }
 
 /*
- * We are using hashed locking: holding per_cpu(tvec_bases).lock
+ * We are using hashed locking: holding per_cpu(timer_bases).lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
  *
@@ -744,16 +744,16 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
  * When the timer's base is locked and removed from the list, the
  * TIMER_MIGRATING flag is set, FIXME
  */
-static struct tvec_base *lock_timer_base(struct timer_list *timer,
+static struct timer_base *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
 	__acquires(timer->base->lock)
 {
 	for (;;) {
 		u32 tf = timer->flags;
-		struct tvec_base *base;
+		struct timer_base *base;
 
 		if (!(tf & TIMER_MIGRATING)) {
-			base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
+			base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK);
 			spin_lock_irqsave(&base->lock, *flags);
 			if (timer->flags == tf)
 				return base;
@@ -766,7 +766,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
-	struct tvec_base *base, *new_base;
+	struct timer_base *base, *new_base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -933,8 +933,8 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu);
-	struct tvec_base *base;
+	struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu);
+	struct timer_base *base;
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
@@ -975,7 +975,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);
  */
 int del_timer(struct timer_list *timer)
 {
-	struct tvec_base *base;
+	struct timer_base *base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(del_timer);
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
-	struct tvec_base *base;
+	struct timer_base *base;
 	unsigned long flags;
 	int ret = -1;
 
@@ -1085,7 +1085,7 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static int cascade(struct tvec_base *base, struct tvec *tv, int index)
+static int cascade(struct timer_base *base, struct tvec *tv, int index)
 {
 	/* cascade all the timers from tv up one level */
 	struct timer_list *timer;
@@ -1149,7 +1149,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 	}
 }
 
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
 
 /**
  * __run_timers - run all expired timers (if any) on this CPU.
@@ -1158,23 +1158,23 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-static inline void __run_timers(struct tvec_base *base)
+static inline void __run_timers(struct timer_base *base)
 {
 	struct timer_list *timer;
 
 	spin_lock_irq(&base->lock);
 
-	while (time_after_eq(jiffies, base->timer_jiffies)) {
+	while (time_after_eq(jiffies, base->clk)) {
 		struct hlist_head work_list;
 		struct hlist_head *head = &work_list;
 		int index;
 
 		if (!base->all_timers) {
-			base->timer_jiffies = jiffies;
+			base->clk = jiffies;
 			break;
 		}
 
-		index = base->timer_jiffies & TVR_MASK;
+		index = base->clk & TVR_MASK;
 
 		/*
 		 * Cascade timers:
@@ -1184,7 +1184,7 @@ static inline void __run_timers(struct tvec_base *base)
 				(!cascade(base, &base->tv3, INDEX(1))) &&
 					!cascade(base, &base->tv4, INDEX(2)))
 			cascade(base, &base->tv5, INDEX(3));
-		++base->timer_jiffies;
+		++base->clk;
 		hlist_move_list(base->tv1.vec + index, head);
 		while (!hlist_empty(head)) {
 			void (*fn)(unsigned long);
@@ -1222,16 +1222,16 @@ static inline void __run_timers(struct tvec_base *base)
  * is used on S/390 to stop all activity when a CPU is idle.
  * This function needs to be called with interrupts disabled.
  */
-static unsigned long __next_timer_interrupt(struct tvec_base *base)
+static unsigned long __next_timer_interrupt(struct timer_base *base)
 {
-	unsigned long timer_jiffies = base->timer_jiffies;
-	unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
+	unsigned long clk = base->clk;
+	unsigned long expires = clk + NEXT_TIMER_MAX_DELTA;
 	int index, slot, array, found = 0;
 	struct timer_list *nte;
 	struct tvec *varray[4];
 
 	/* Look for timer events in tv1. */
-	index = slot = timer_jiffies & TVR_MASK;
+	index = slot = clk & TVR_MASK;
 	do {
 		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
 			if (nte->flags & TIMER_DEFERRABLE)
@@ -1250,8 +1250,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
 cascade:
 	/* Calculate the next cascade event */
 	if (index)
-		timer_jiffies += TVR_SIZE - index;
-	timer_jiffies >>= TVR_BITS;
+		clk += TVR_SIZE - index;
+	clk >>= TVR_BITS;
 
 	/* Check tv2-tv5. */
 	varray[0] = &base->tv2;
@@ -1262,7 +1262,7 @@ cascade:
 	for (array = 0; array < 4; array++) {
 		struct tvec *varp = varray[array];
 
-		index = slot = timer_jiffies & TVN_MASK;
+		index = slot = clk & TVN_MASK;
 		do {
 			hlist_for_each_entry(nte, varp->vec + slot, entry) {
 				if (nte->flags & TIMER_DEFERRABLE)
@@ -1286,8 +1286,8 @@ cascade:
 		} while (slot != index);
 
 		if (index)
-			timer_jiffies += TVN_SIZE - index;
-		timer_jiffies >>= TVN_BITS;
+			clk += TVN_SIZE - index;
+		clk >>= TVN_BITS;
 	}
 	return expires;
 }
@@ -1335,7 +1335,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
 
@@ -1348,7 +1348,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	spin_lock(&base->lock);
 	if (base->active_timers) {
-		if (time_before_eq(base->next_timer, base->timer_jiffies))
+		if (time_before_eq(base->next_timer, base->clk))
 			base->next_timer = __next_timer_interrupt(base);
 		nextevt = base->next_timer;
 		if (time_before_eq(nextevt, basej))
@@ -1387,9 +1387,9 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases);
 
-	if (time_after_eq(jiffies, base->timer_jiffies))
+	if (time_after_eq(jiffies, base->clk))
 		__run_timers(base);
 }
 
@@ -1534,7 +1534,7 @@ signed long __sched schedule_timeout_idle(signed long timeout)
 EXPORT_SYMBOL(schedule_timeout_idle);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
+static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
 {
 	struct timer_list *timer;
 	int cpu = new_base->cpu;
@@ -1550,13 +1550,13 @@ static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *he
 
 static void migrate_timers(int cpu)
 {
-	struct tvec_base *old_base;
-	struct tvec_base *new_base;
+	struct timer_base *old_base;
+	struct timer_base *new_base;
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu_ptr(&tvec_bases, cpu);
-	new_base = get_cpu_ptr(&tvec_bases);
+	old_base = per_cpu_ptr(&timer_bases, cpu);
+	new_base = get_cpu_ptr(&timer_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
@@ -1580,7 +1580,7 @@ static void migrate_timers(int cpu)
 
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
-	put_cpu_ptr(&tvec_bases);
+	put_cpu_ptr(&timer_bases);
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1608,13 +1608,13 @@ static inline void timer_register_cpu_notifier(void) { }
 
 static void __init init_timer_cpu(int cpu)
 {
-	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
+	struct timer_base *base = per_cpu_ptr(&timer_bases, cpu);
 
 	base->cpu = cpu;
 	spin_lock_init(&base->lock);
 
-	base->timer_jiffies = jiffies;
-	base->next_timer = base->timer_jiffies;
+	base->clk = jiffies;
+	base->next_timer = base->clk;
 }
 
 static void __init init_timer_cpus(void)
-- 
cgit v1.3-8-gc7d7


From 500462a9de657f86edaa102f8ab6bff7f7e43fc2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:30 +0000
Subject: timers: Switch to a non-cascading wheel

The current timer wheel has some drawbacks:

1) Cascading:

   Cascading can be an unbound operation and is completely pointless in most
   cases because the vast majority of the timer wheel timers are canceled or
   rearmed before expiration. (They are used as timeout safeguards, not as
   real timers to measure time.)

2) No fast lookup of the next expiring timer:

   In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
   must fast forward the base time to the current value of jiffies. As we
   have no way to find the next expiring timer fast, the code loops linearly
   and increments the base time one by one and checks for expired timers
   in each step. This causes unbound overhead spikes exactly in the moment
   when we should wake up as fast as possible.

After a thorough analysis of real world data gathered on laptops,
workstations, webservers and other machines (thanks Chris!) I came to the
conclusion that the current 'classic' timer wheel implementation can be
modified to address the above issues.

The vast majority of timer wheel timers is canceled or rearmed before
expiry. Most of them are timeouts for networking and other I/O tasks. The
nature of timeouts is to catch the exception from normal operation (TCP ack
timed out, disk does not respond, etc.). For these kinds of timeouts the
accuracy of the timeout is not really a concern. Timeouts are very often
approximate worst-case values and in case the timeout fires, we already
waited for a long time and performance is down the drain already.

The few timers which actually expire can be split into two categories:

 1) Short expiry times which expect halfways accurate expiry

 2) Long term expiry times are inaccurate today already due to the
    batching which is done for NOHZ automatically and also via the
    set_timer_slack() API.

So for long term expiry timers we can avoid the cascading property and just
leave them in the less granular outer wheels until expiry or
cancelation. Timers which are armed with a timeout larger than the wheel
capacity are no longer cascaded. We expire them with the longest possible
timeout (6+ days). We have not observed such timeouts in our data collection,
but at least we handle them, applying the rule of the least surprise.

To avoid extending the wheel levels for HZ=1000 so we can accomodate the
longest observed timeouts (5 days in the network conntrack code) we reduce the
first level granularity on HZ=1000 to 4ms, which effectively is the same as
the HZ=250 behaviour. From our data analysis there is nothing which relies on
that 1ms granularity and as a side effect we get better batching and timer
locality for the networking code as well.

Contrary to the classic wheel the granularity of the next wheel is not the
capacity of the first wheel. The granularities of the wheels are in the
currently chosen setting 8 times the granularity of the previous wheel.

So for HZ=250 we end up with the following granularity levels:

 Level Offset   Granularity                  Range
     0      0          4 ms                 0 ms -        252 ms
     1     64         32 ms               256 ms -       2044 ms (256ms - ~2s)
     2    128        256 ms              2048 ms -      16380 ms (~2s   - ~16s)
     3    192       2048 ms (~2s)       16384 ms -     131068 ms (~16s  - ~2m)
     4    256      16384 ms (~16s)     131072 ms -    1048572 ms (~2m   - ~17m)
     5    320     131072 ms (~2m)     1048576 ms -    8388604 ms (~17m  - ~2h)
     6    384    1048576 ms (~17m)    8388608 ms -   67108863 ms (~2h   - ~18h)
     7    448    8388608 ms (~2h)    67108864 ms -  536870911 ms (~18h  - ~6d)

That's a worst case inaccuracy of 12.5% for the timers which are queued at the
beginning of a level.

So the new wheel concept addresses the old issues:

1) Cascading is avoided completely

2) By keeping the timers in the bucket until expiry/cancelation we can track
   the buckets which have timers enqueued in a bucket bitmap and therefore can
   look up the next expiring timer very fast and O(1).

A further benefit of the concept is that the slack calculation which is done
on every timer start is no longer necessary because the granularity levels
provide natural batching already.

Our extensive testing with various loads did not show any performance
degradation vs. the current wheel implementation.

This patch does not address the 'fast lookup' issue as we wanted to make sure
that there is no regression introduced by the wheel redesign. The
optimizations are in follow up patches.

This patch contains fixes from Anna-Maria Gleixner and Richard Cochran.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.108621834@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/timer.h |   2 +
 kernel/time/timer.c   | 829 ++++++++++++++++++++++++++++----------------------
 2 files changed, 469 insertions(+), 362 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 989f33d16ebf..5869ab9848fe 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -64,6 +64,8 @@ struct timer_list {
 #define TIMER_DEFERRABLE	0x00080000
 #define TIMER_PINNED		0x00100000
 #define TIMER_IRQSAFE		0x00200000
+#define TIMER_ARRAYSHIFT	22
+#define TIMER_ARRAYMASK		0xFFC00000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f259a3ef4577..86e95b72665d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -59,43 +59,151 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
 EXPORT_SYMBOL(jiffies_64);
 
 /*
- * per-CPU timer vector definitions:
+ * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefor each
+ * level has a different granularity.
+ *
+ * The level granularity is:		LVL_CLK_DIV ^ lvl
+ * The level clock frequency is:	HZ / (LVL_CLK_DIV ^ level)
+ *
+ * The array level of a newly armed timer depends on the relative expiry
+ * time. The farther the expiry time is away the higher the array level and
+ * therefor the granularity becomes.
+ *
+ * Contrary to the original timer wheel implementation, which aims for 'exact'
+ * expiry of the timers, this implementation removes the need for recascading
+ * the timers into the lower array levels. The previous 'classic' timer wheel
+ * implementation of the kernel already violated the 'exact' expiry by adding
+ * slack to the expiry time to provide batched expiration. The granularity
+ * levels provide implicit batching.
+ *
+ * This is an optimization of the original timer wheel implementation for the
+ * majority of the timer wheel use cases: timeouts. The vast majority of
+ * timeout timers (networking, disk I/O ...) are canceled before expiry. If
+ * the timeout expires it indicates that normal operation is disturbed, so it
+ * does not matter much whether the timeout comes with a slight delay.
+ *
+ * The only exception to this are networking timers with a small expiry
+ * time. They rely on the granularity. Those fit into the first wheel level,
+ * which has HZ granularity.
+ *
+ * We don't have cascading anymore. timers with a expiry time above the
+ * capacity of the last wheel level are force expired at the maximum timeout
+ * value of the last wheel level. From data sampling we know that the maximum
+ * value observed is 5 days (network connection tracking), so this should not
+ * be an issue.
+ *
+ * The currently chosen array constants values are a good compromise between
+ * array size and granularity.
+ *
+ * This results in the following granularity and range levels:
+ *
+ * HZ 1000 steps
+ * Level Offset  Granularity            Range
+ *  0      0         1 ms                0 ms -         63 ms
+ *  1     64         8 ms               64 ms -        511 ms
+ *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
+ *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
+ *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
+ *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
+ *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
+ *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
+ *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
+ *
+ * HZ  300
+ * Level Offset  Granularity            Range
+ *  0	   0         3 ms                0 ms -        210 ms
+ *  1	  64        26 ms              213 ms -       1703 ms (213ms - ~1s)
+ *  2	 128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
+ *  3	 192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
+ *  4	 256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
+ *  5	 320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
+ *  6	 384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
+ *  7	 448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
+ *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
+ *
+ * HZ  250
+ * Level Offset  Granularity            Range
+ *  0	   0         4 ms                0 ms -        255 ms
+ *  1	  64        32 ms              256 ms -       2047 ms (256ms - ~2s)
+ *  2	 128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
+ *  3	 192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
+ *  4	 256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
+ *  5	 320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
+ *  6	 384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
+ *  7	 448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
+ *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
+ *
+ * HZ  100
+ * Level Offset  Granularity            Range
+ *  0	   0         10 ms               0 ms -        630 ms
+ *  1	  64         80 ms             640 ms -       5110 ms (640ms - ~5s)
+ *  2	 128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
+ *  3	 192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
+ *  4	 256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
+ *  5	 320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
+ *  6	 384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
+ *  7	 448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
  */
-#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
-#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
-#define TVN_SIZE (1 << TVN_BITS)
-#define TVR_SIZE (1 << TVR_BITS)
-#define TVN_MASK (TVN_SIZE - 1)
-#define TVR_MASK (TVR_SIZE - 1)
-#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
-
-struct tvec {
-	struct hlist_head vec[TVN_SIZE];
-};
 
-struct tvec_root {
-	struct hlist_head vec[TVR_SIZE];
-};
+/* Clock divisor for the next level */
+#define LVL_CLK_SHIFT	3
+#define LVL_CLK_DIV	(1UL << LVL_CLK_SHIFT)
+#define LVL_CLK_MASK	(LVL_CLK_DIV - 1)
+#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
+#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))
+
+/*
+ * The time start value for each level to select the bucket at enqueue
+ * time.
+ */
+#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
+
+/* Size of each clock level */
+#define LVL_BITS	6
+#define LVL_SIZE	(1UL << LVL_BITS)
+#define LVL_MASK	(LVL_SIZE - 1)
+#define LVL_OFFS(n)	((n) * LVL_SIZE)
+
+/* Level depth */
+#if HZ > 100
+# define LVL_DEPTH	9
+# else
+# define LVL_DEPTH	8
+#endif
+
+/* The cutoff (max. capacity of the wheel) */
+#define WHEEL_TIMEOUT_CUTOFF	(LVL_START(LVL_DEPTH))
+#define WHEEL_TIMEOUT_MAX	(WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
+
+/*
+ * The resulting wheel size. If NOHZ is configured we allocate two
+ * wheels so we have a separate storage for the deferrable timers.
+ */
+#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)
+
+#ifdef CONFIG_NO_HZ_COMMON
+# define NR_BASES	2
+# define BASE_STD	0
+# define BASE_DEF	1
+#else
+# define NR_BASES	1
+# define BASE_STD	0
+# define BASE_DEF	0
+#endif
 
 struct timer_base {
-	spinlock_t lock;
-	struct timer_list *running_timer;
-	unsigned long clk;
-	unsigned long next_timer;
-	unsigned long active_timers;
-	unsigned long all_timers;
-	int cpu;
-	bool migration_enabled;
-	bool nohz_active;
-	struct tvec_root tv1;
-	struct tvec tv2;
-	struct tvec tv3;
-	struct tvec tv4;
-	struct tvec tv5;
+	spinlock_t		lock;
+	struct timer_list	*running_timer;
+	unsigned long		clk;
+	unsigned int		cpu;
+	bool			migration_enabled;
+	bool			nohz_active;
+	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
+	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
 
-
-static DEFINE_PER_CPU(struct timer_base, timer_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 unsigned int sysctl_timer_migration = 1;
@@ -106,15 +214,17 @@ void timers_update_migration(bool update_nohz)
 	unsigned int cpu;
 
 	/* Avoid the loop, if nothing to update */
-	if (this_cpu_read(timer_bases.migration_enabled) == on)
+	if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
 		return;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(timer_bases.migration_enabled, cpu) = on;
+		per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
+		per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
 		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
 		if (!update_nohz)
 			continue;
-		per_cpu(timer_bases.nohz_active, cpu) = true;
+		per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
+		per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
 		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
 	}
 }
@@ -133,20 +243,6 @@ int timer_migration_handler(struct ctl_table *table, int write,
 	mutex_unlock(&mutex);
 	return ret;
 }
-
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						int pinned)
-{
-	if (pinned || !base->migration_enabled)
-		return this_cpu_ptr(&timer_bases);
-	return per_cpu_ptr(&timer_bases, get_nohz_timer_target());
-}
-#else
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						int pinned)
-{
-	return this_cpu_ptr(&timer_bases);
-}
 #endif
 
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -370,78 +466,91 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
+static inline unsigned int timer_get_idx(struct timer_list *timer)
+{
+	return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
+}
+
+static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
+{
+	timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
+			idx << TIMER_ARRAYSHIFT;
+}
+
+/*
+ * Helper function to calculate the array index for a given expiry
+ * time.
+ */
+static inline unsigned calc_index(unsigned expires, unsigned lvl)
+{
+	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+	return LVL_OFFS(lvl) + (expires & LVL_MASK);
+}
+
 static void
 __internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
-	unsigned long idx = expires - base->clk;
+	unsigned long delta = expires - base->clk;
 	struct hlist_head *vec;
-
-	if (idx < TVR_SIZE) {
-		int i = expires & TVR_MASK;
-		vec = base->tv1.vec + i;
-	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
-		int i = (expires >> TVR_BITS) & TVN_MASK;
-		vec = base->tv2.vec + i;
-	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
-		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
-		vec = base->tv3.vec + i;
-	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
-		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
-		vec = base->tv4.vec + i;
-	} else if ((signed long) idx < 0) {
-		/*
-		 * Can happen if you add a timer with expires == jiffies,
-		 * or you set a timer to go off in the past
-		 */
-		vec = base->tv1.vec + (base->clk & TVR_MASK);
+	unsigned int idx;
+
+	if (delta < LVL_START(1)) {
+		idx = calc_index(expires, 0);
+	} else if (delta < LVL_START(2)) {
+		idx = calc_index(expires, 1);
+	} else if (delta < LVL_START(3)) {
+		idx = calc_index(expires, 2);
+	} else if (delta < LVL_START(4)) {
+		idx = calc_index(expires, 3);
+	} else if (delta < LVL_START(5)) {
+		idx = calc_index(expires, 4);
+	} else if (delta < LVL_START(6)) {
+		idx = calc_index(expires, 5);
+	} else if (delta < LVL_START(7)) {
+		idx = calc_index(expires, 6);
+	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
+		idx = calc_index(expires, 7);
+	} else if ((long) delta < 0) {
+		idx = base->clk & LVL_MASK;
 	} else {
-		int i;
-		/* If the timeout is larger than MAX_TVAL (on 64-bit
-		 * architectures or with CONFIG_BASE_SMALL=1) then we
-		 * use the maximum timeout.
+		/*
+		 * Force expire obscene large timeouts to expire at the
+		 * capacity limit of the wheel.
 		 */
-		if (idx > MAX_TVAL) {
-			idx = MAX_TVAL;
-			expires = idx + base->clk;
-		}
-		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
-		vec = base->tv5.vec + i;
-	}
+		if (expires >= WHEEL_TIMEOUT_CUTOFF)
+			expires = WHEEL_TIMEOUT_MAX;
 
+		idx = calc_index(expires, LVL_DEPTH - 1);
+	}
+	/*
+	 * Enqueue the timer into the array bucket, mark it pending in
+	 * the bitmap and store the index in the timer flags.
+	 */
+	vec = base->vectors + idx;
 	hlist_add_head(&timer->entry, vec);
+	__set_bit(idx, base->pending_map);
+	timer_set_idx(timer, idx);
 }
 
 static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
-	/* Advance base->jiffies, if the base is empty */
-	if (!base->all_timers++)
-		base->clk = jiffies;
-
 	__internal_add_timer(base, timer);
-	/*
-	 * Update base->active_timers and base->next_timer
-	 */
-	if (!(timer->flags & TIMER_DEFERRABLE)) {
-		if (!base->active_timers++ ||
-		    time_before(timer->expires, base->next_timer))
-			base->next_timer = timer->expires;
-	}
 
 	/*
 	 * Check whether the other CPU is in dynticks mode and needs
-	 * to be triggered to reevaluate the timer wheel.
-	 * We are protected against the other CPU fiddling
-	 * with the timer by holding the timer base lock. This also
-	 * makes sure that a CPU on the way to stop its tick can not
-	 * evaluate the timer wheel.
+	 * to be triggered to reevaluate the timer wheel.  We are
+	 * protected against the other CPU fiddling with the timer by
+	 * holding the timer base lock. This also makes sure that a
+	 * CPU on the way to stop its tick can not evaluate the timer
+	 * wheel.
 	 *
 	 * Spare the IPI for deferrable timers on idle targets though.
 	 * The next busy ticks will take care of it. Except full dynticks
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (base->nohz_active) {
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
 		if (!(timer->flags & TIMER_DEFERRABLE) ||
 		    tick_nohz_full_cpu(base->cpu))
 			wake_up_nohz_cpu(base->cpu);
@@ -706,54 +815,87 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
 	entry->next = LIST_POISON2;
 }
 
-static inline void
-detach_expired_timer(struct timer_list *timer, struct timer_base *base)
-{
-	detach_timer(timer, true);
-	if (!(timer->flags & TIMER_DEFERRABLE))
-		base->active_timers--;
-	base->all_timers--;
-}
-
 static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 			     bool clear_pending)
 {
+	unsigned idx = timer_get_idx(timer);
+
 	if (!timer_pending(timer))
 		return 0;
 
+	if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+		__clear_bit(idx, base->pending_map);
+
 	detach_timer(timer, clear_pending);
-	if (!(timer->flags & TIMER_DEFERRABLE)) {
-		base->active_timers--;
-		if (timer->expires == base->next_timer)
-			base->next_timer = base->clk;
-	}
-	/* If this was the last timer, advance base->jiffies */
-	if (!--base->all_timers)
-		base->clk = jiffies;
 	return 1;
 }
 
+static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
+{
+	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+
+	/*
+	 * If the timer is deferrable and nohz is active then we need to use
+	 * the deferrable base.
+	 */
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+	    (tflags & TIMER_DEFERRABLE))
+		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
+	return base;
+}
+
+static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
+{
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+	/*
+	 * If the timer is deferrable and nohz is active then we need to use
+	 * the deferrable base.
+	 */
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+	    (tflags & TIMER_DEFERRABLE))
+		base = this_cpu_ptr(&timer_bases[BASE_DEF]);
+	return base;
+}
+
+static inline struct timer_base *get_timer_base(u32 tflags)
+{
+	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
+}
+
+static inline struct timer_base *get_target_base(struct timer_base *base,
+						 unsigned tflags)
+{
+#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
+		return get_timer_this_cpu_base(tflags);
+	return get_timer_cpu_base(tflags, get_nohz_timer_target());
+#else
+	return get_timer_this_cpu_base(tflags);
+#endif
+}
+
 /*
- * We are using hashed locking: holding per_cpu(timer_bases).lock
- * means that all timers which are tied to this base via timer->base are
- * locked, and the base itself is locked too.
+ * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
+ * that all timers which are tied to this base are locked, and the base itself
+ * is locked too.
  *
  * So __run_timers/migrate_timers can safely modify all timers which could
- * be found on ->tvX lists.
+ * be found in the base->vectors array.
  *
- * When the timer's base is locked and removed from the list, the
- * TIMER_MIGRATING flag is set, FIXME
+ * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
+ * to wait until the migration is done.
  */
 static struct timer_base *lock_timer_base(struct timer_list *timer,
-					unsigned long *flags)
+					  unsigned long *flags)
 	__acquires(timer->base->lock)
 {
 	for (;;) {
-		u32 tf = timer->flags;
 		struct timer_base *base;
+		u32 tf = timer->flags;
 
 		if (!(tf & TIMER_MIGRATING)) {
-			base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK);
+			base = get_timer_base(tf);
 			spin_lock_irqsave(&base->lock, *flags);
 			if (timer->flags == tf)
 				return base;
@@ -770,6 +912,27 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	unsigned long flags;
 	int ret = 0;
 
+	/*
+	 * TODO: Calculate the array bucket of the timer right here w/o
+	 * holding the base lock. This allows to check not only
+	 * timer->expires == expires below, but also whether the timer
+	 * ends up in the same bucket. If we really need to requeue
+	 * the timer then we check whether base->clk have
+	 * advanced between here and locking the timer base. If
+	 * jiffies advanced we have to recalc the array bucket with the
+	 * lock held.
+	 */
+
+	/*
+	 * This is a common optimization triggered by the
+	 * networking code - if the timer is re-modified
+	 * to be the same thing then just return:
+	 */
+	if (timer_pending(timer)) {
+		if (timer->expires == expires)
+			return 1;
+	}
+
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
 
@@ -781,15 +944,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, timer->flags & TIMER_PINNED);
+	new_base = get_target_base(base, timer->flags);
 
 	if (base != new_base) {
 		/*
-		 * We are trying to schedule the timer on the local CPU.
+		 * We are trying to schedule the timer on the new base.
 		 * However we can't change timer's base while it is running,
 		 * otherwise del_timer_sync() can't detect that the timer's
-		 * handler yet has not finished. This also guarantees that
-		 * the timer is serialized wrt itself.
+		 * handler yet has not finished. This also guarantees that the
+		 * timer is serialized wrt itself.
 		 */
 		if (likely(base->running_timer != timer)) {
 			/* See the comment in lock_timer_base() */
@@ -828,45 +991,6 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
-/*
- * Decide where to put the timer while taking the slack into account
- *
- * Algorithm:
- *   1) calculate the maximum (absolute) time
- *   2) calculate the highest bit where the expires and new max are different
- *   3) use this bit to make a mask
- *   4) use the bitmask to round down the maximum time, so that all last
- *      bits are zeros
- */
-static inline
-unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
-{
-	unsigned long expires_limit, mask;
-	int bit;
-
-	if (timer->slack >= 0) {
-		expires_limit = expires + timer->slack;
-	} else {
-		long delta = expires - jiffies;
-
-		if (delta < 256)
-			return expires;
-
-		expires_limit = expires + delta / 256;
-	}
-	mask = expires ^ expires_limit;
-	if (mask == 0)
-		return expires;
-
-	bit = __fls(mask);
-
-	mask = (1UL << bit) - 1;
-
-	expires_limit = expires_limit & ~(mask);
-
-	return expires_limit;
-}
-
 /**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
@@ -889,16 +1013,6 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
  */
 int mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	expires = apply_slack(timer, expires);
-
-	/*
-	 * This is a common optimization triggered by the
-	 * networking code - if the timer is re-modified
-	 * to be the same thing then just return:
-	 */
-	if (timer_pending(timer) && timer->expires == expires)
-		return 1;
-
 	return __mod_timer(timer, expires, false);
 }
 EXPORT_SYMBOL(mod_timer);
@@ -933,13 +1047,14 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu);
-	struct timer_base *base;
+	struct timer_base *new_base, *base;
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(timer_pending(timer) || !timer->function);
 
+	new_base = get_timer_cpu_base(timer->flags, cpu);
+
 	/*
 	 * If @timer was on a different CPU, it should be migrated with the
 	 * old base locked to prevent other operations proceeding with the
@@ -1085,27 +1200,6 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static int cascade(struct timer_base *base, struct tvec *tv, int index)
-{
-	/* cascade all the timers from tv up one level */
-	struct timer_list *timer;
-	struct hlist_node *tmp;
-	struct hlist_head tv_list;
-
-	hlist_move_list(tv->vec + index, &tv_list);
-
-	/*
-	 * We are removing _all_ timers from the list, so we
-	 * don't have to detach them individually.
-	 */
-	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
-		/* No accounting, while moving them */
-		__internal_add_timer(base, timer);
-	}
-
-	return index;
-}
-
 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 			  unsigned long data)
 {
@@ -1149,68 +1243,80 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 	}
 }
 
-#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+static void expire_timers(struct timer_base *base, struct hlist_head *head)
+{
+	while (!hlist_empty(head)) {
+		struct timer_list *timer;
+		void (*fn)(unsigned long);
+		unsigned long data;
+
+		timer = hlist_entry(head->first, struct timer_list, entry);
+		timer_stats_account_timer(timer);
+
+		base->running_timer = timer;
+		detach_timer(timer, true);
+
+		fn = timer->function;
+		data = timer->data;
+
+		if (timer->flags & TIMER_IRQSAFE) {
+			spin_unlock(&base->lock);
+			call_timer_fn(timer, fn, data);
+			spin_lock(&base->lock);
+		} else {
+			spin_unlock_irq(&base->lock);
+			call_timer_fn(timer, fn, data);
+			spin_lock_irq(&base->lock);
+		}
+	}
+}
+
+static int collect_expired_timers(struct timer_base *base,
+				  struct hlist_head *heads)
+{
+	unsigned long clk = base->clk;
+	struct hlist_head *vec;
+	int i, levels = 0;
+	unsigned int idx;
+
+	for (i = 0; i < LVL_DEPTH; i++) {
+		idx = (clk & LVL_MASK) + i * LVL_SIZE;
+
+		if (__test_and_clear_bit(idx, base->pending_map)) {
+			vec = base->vectors + idx;
+			hlist_move_list(vec, heads++);
+			levels++;
+		}
+		/* Is it time to look at the next level? */
+		if (clk & LVL_CLK_MASK)
+			break;
+		/* Shift clock for the next level granularity */
+		clk >>= LVL_CLK_SHIFT;
+	}
+	return levels;
+}
 
 /**
  * __run_timers - run all expired timers (if any) on this CPU.
  * @base: the timer vector to be processed.
- *
- * This function cascades all vectors and executes all expired timer
- * vectors.
  */
 static inline void __run_timers(struct timer_base *base)
 {
-	struct timer_list *timer;
+	struct hlist_head heads[LVL_DEPTH];
+	int levels;
+
+	if (!time_after_eq(jiffies, base->clk))
+		return;
 
 	spin_lock_irq(&base->lock);
 
 	while (time_after_eq(jiffies, base->clk)) {
-		struct hlist_head work_list;
-		struct hlist_head *head = &work_list;
-		int index;
 
-		if (!base->all_timers) {
-			base->clk = jiffies;
-			break;
-		}
-
-		index = base->clk & TVR_MASK;
+		levels = collect_expired_timers(base, heads);
+		base->clk++;
 
-		/*
-		 * Cascade timers:
-		 */
-		if (!index &&
-			(!cascade(base, &base->tv2, INDEX(0))) &&
-				(!cascade(base, &base->tv3, INDEX(1))) &&
-					!cascade(base, &base->tv4, INDEX(2)))
-			cascade(base, &base->tv5, INDEX(3));
-		++base->clk;
-		hlist_move_list(base->tv1.vec + index, head);
-		while (!hlist_empty(head)) {
-			void (*fn)(unsigned long);
-			unsigned long data;
-			bool irqsafe;
-
-			timer = hlist_entry(head->first, struct timer_list, entry);
-			fn = timer->function;
-			data = timer->data;
-			irqsafe = timer->flags & TIMER_IRQSAFE;
-
-			timer_stats_account_timer(timer);
-
-			base->running_timer = timer;
-			detach_expired_timer(timer, base);
-
-			if (irqsafe) {
-				spin_unlock(&base->lock);
-				call_timer_fn(timer, fn, data);
-				spin_lock(&base->lock);
-			} else {
-				spin_unlock_irq(&base->lock);
-				call_timer_fn(timer, fn, data);
-				spin_lock_irq(&base->lock);
-			}
-		}
+		while (levels--)
+			expire_timers(base, heads + levels);
 	}
 	base->running_timer = NULL;
 	spin_unlock_irq(&base->lock);
@@ -1218,78 +1324,87 @@ static inline void __run_timers(struct timer_base *base)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * Find out when the next timer event is due to happen. This
- * is used on S/390 to stop all activity when a CPU is idle.
- * This function needs to be called with interrupts disabled.
+ * Find the next pending bucket of a level. Search from @offset + @clk upwards
+ * and if nothing there, search from start of the level (@offset) up to
+ * @offset + clk.
+ */
+static int next_pending_bucket(struct timer_base *base, unsigned offset,
+			       unsigned clk)
+{
+	unsigned pos, start = offset + clk;
+	unsigned end = offset + LVL_SIZE;
+
+	pos = find_next_bit(base->pending_map, end, start);
+	if (pos < end)
+		return pos - start;
+
+	pos = find_next_bit(base->pending_map, start, offset);
+	return pos < start ? pos + LVL_SIZE - start : -1;
+}
+
+/*
+ * Search the first expiring timer in the various clock levels.
  */
 static unsigned long __next_timer_interrupt(struct timer_base *base)
 {
-	unsigned long clk = base->clk;
-	unsigned long expires = clk + NEXT_TIMER_MAX_DELTA;
-	int index, slot, array, found = 0;
-	struct timer_list *nte;
-	struct tvec *varray[4];
-
-	/* Look for timer events in tv1. */
-	index = slot = clk & TVR_MASK;
-	do {
-		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
-			if (nte->flags & TIMER_DEFERRABLE)
-				continue;
-
-			found = 1;
-			expires = nte->expires;
-			/* Look at the cascade bucket(s)? */
-			if (!index || slot < index)
-				goto cascade;
-			return expires;
+	unsigned long clk, next, adj;
+	unsigned lvl, offset = 0;
+
+	spin_lock(&base->lock);
+	next = base->clk + NEXT_TIMER_MAX_DELTA;
+	clk = base->clk;
+	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
+		int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+
+		if (pos >= 0) {
+			unsigned long tmp = clk + (unsigned long) pos;
+
+			tmp <<= LVL_SHIFT(lvl);
+			if (time_before(tmp, next))
+				next = tmp;
 		}
-		slot = (slot + 1) & TVR_MASK;
-	} while (slot != index);
-
-cascade:
-	/* Calculate the next cascade event */
-	if (index)
-		clk += TVR_SIZE - index;
-	clk >>= TVR_BITS;
-
-	/* Check tv2-tv5. */
-	varray[0] = &base->tv2;
-	varray[1] = &base->tv3;
-	varray[2] = &base->tv4;
-	varray[3] = &base->tv5;
-
-	for (array = 0; array < 4; array++) {
-		struct tvec *varp = varray[array];
-
-		index = slot = clk & TVN_MASK;
-		do {
-			hlist_for_each_entry(nte, varp->vec + slot, entry) {
-				if (nte->flags & TIMER_DEFERRABLE)
-					continue;
-
-				found = 1;
-				if (time_before(nte->expires, expires))
-					expires = nte->expires;
-			}
-			/*
-			 * Do we still search for the first timer or are
-			 * we looking up the cascade buckets ?
-			 */
-			if (found) {
-				/* Look at the cascade bucket(s)? */
-				if (!index || slot < index)
-					break;
-				return expires;
-			}
-			slot = (slot + 1) & TVN_MASK;
-		} while (slot != index);
-
-		if (index)
-			clk += TVN_SIZE - index;
-		clk >>= TVN_BITS;
+		/*
+		 * Clock for the next level. If the current level clock lower
+		 * bits are zero, we look at the next level as is. If not we
+		 * need to advance it by one because that's going to be the
+		 * next expiring bucket in that level. base->clk is the next
+		 * expiring jiffie. So in case of:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    0    0
+		 *
+		 * we have to look at all levels @index 0. With
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    0    2
+		 *
+		 * LVL0 has the next expiring bucket @index 2. The upper
+		 * levels have the next expiring bucket @index 1.
+		 *
+		 * In case that the propagation wraps the next level the same
+		 * rules apply:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    F    2
+		 *
+		 * So after looking at LVL0 we get:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1
+		 *  0    0    0    1    0
+		 *
+		 * So no propagation from LVL1 to LVL2 because that happened
+		 * with the add already, but then we need to propagate further
+		 * from LVL2 to LVL3.
+		 *
+		 * So the simple check whether the lower bits of the current
+		 * level are 0 or not is sufficient for all cases.
+		 */
+		adj = clk & LVL_CLK_MASK ? 1 : 0;
+		clk >>= LVL_CLK_SHIFT;
+		clk += adj;
 	}
-	return expires;
+	spin_unlock(&base->lock);
+	return next;
 }
 
 /*
@@ -1335,7 +1450,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
 
@@ -1346,17 +1461,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	if (cpu_is_offline(smp_processor_id()))
 		return expires;
 
-	spin_lock(&base->lock);
-	if (base->active_timers) {
-		if (time_before_eq(base->next_timer, base->clk))
-			base->next_timer = __next_timer_interrupt(base);
-		nextevt = base->next_timer;
-		if (time_before_eq(nextevt, basej))
-			expires = basem;
-		else
-			expires = basem + (nextevt - basej) * TICK_NSEC;
-	}
-	spin_unlock(&base->lock);
+	nextevt = __next_timer_interrupt(base);
+	if (time_before_eq(nextevt, basej))
+		expires = basem;
+	else
+		expires = basem + (nextevt - basej) * TICK_NSEC;
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
@@ -1387,10 +1496,11 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
-	if (time_after_eq(jiffies, base->clk))
-		__run_timers(base);
+	__run_timers(base);
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
 }
 
 /*
@@ -1541,7 +1651,6 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
 
 	while (!hlist_empty(head)) {
 		timer = hlist_entry(head->first, struct timer_list, entry);
-		/* We ignore the accounting on the dying cpu */
 		detach_timer(timer, false);
 		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		internal_add_timer(new_base, timer);
@@ -1552,35 +1661,29 @@ static void migrate_timers(int cpu)
 {
 	struct timer_base *old_base;
 	struct timer_base *new_base;
-	int i;
+	int b, i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu_ptr(&timer_bases, cpu);
-	new_base = get_cpu_ptr(&timer_bases);
-	/*
-	 * The caller is globally serialized and nobody else
-	 * takes two locks at once, deadlock is not possible.
-	 */
-	spin_lock_irq(&new_base->lock);
-	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
-
-	BUG_ON(old_base->running_timer);
-
-	for (i = 0; i < TVR_SIZE; i++)
-		migrate_timer_list(new_base, old_base->tv1.vec + i);
-	for (i = 0; i < TVN_SIZE; i++) {
-		migrate_timer_list(new_base, old_base->tv2.vec + i);
-		migrate_timer_list(new_base, old_base->tv3.vec + i);
-		migrate_timer_list(new_base, old_base->tv4.vec + i);
-		migrate_timer_list(new_base, old_base->tv5.vec + i);
-	}
 
-	old_base->active_timers = 0;
-	old_base->all_timers = 0;
+	for (b = 0; b < NR_BASES; b++) {
+		old_base = per_cpu_ptr(&timer_bases[b], cpu);
+		new_base = get_cpu_ptr(&timer_bases[b]);
+		/*
+		 * The caller is globally serialized and nobody else
+		 * takes two locks at once, deadlock is not possible.
+		 */
+		spin_lock_irq(&new_base->lock);
+		spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+		BUG_ON(old_base->running_timer);
+
+		for (i = 0; i < WHEEL_SIZE; i++)
+			migrate_timer_list(new_base, old_base->vectors + i);
 
-	spin_unlock(&old_base->lock);
-	spin_unlock_irq(&new_base->lock);
-	put_cpu_ptr(&timer_bases);
+		spin_unlock(&old_base->lock);
+		spin_unlock_irq(&new_base->lock);
+		put_cpu_ptr(&timer_bases);
+	}
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1608,13 +1711,15 @@ static inline void timer_register_cpu_notifier(void) { }
 
 static void __init init_timer_cpu(int cpu)
 {
-	struct timer_base *base = per_cpu_ptr(&timer_bases, cpu);
-
-	base->cpu = cpu;
-	spin_lock_init(&base->lock);
+	struct timer_base *base;
+	int i;
 
-	base->clk = jiffies;
-	base->next_timer = base->clk;
+	for (i = 0; i < NR_BASES; i++) {
+		base = per_cpu_ptr(&timer_bases[i], cpu);
+		base->cpu = cpu;
+		spin_lock_init(&base->lock);
+		base->clk = jiffies;
+	}
 }
 
 static void __init init_timer_cpus(void)
-- 
cgit v1.3-8-gc7d7


From 53bf837b78d155b8e1110b3c25b4d0d6391b8ff3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:31 +0000
Subject: timers: Remove set_timer_slack() leftovers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We now have implicit batching in the timer wheel. The slack API is no longer
used, so remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Andrew F. Davis <afd@ti.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jaehoon Chung <jh80.chung@samsung.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathias Nyman <mathias.nyman@intel.com>
Cc: Pali Rohár <pali.rohar@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mmc@vger.kernel.org
Cc: linux-pm@vger.kernel.org
Cc: linux-usb@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.189813118@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 block/genhd.c                   |  5 -----
 drivers/mmc/host/jz4740_mmc.c   |  2 --
 drivers/power/bq27xxx_battery.c |  5 +----
 drivers/usb/host/ohci-hcd.c     |  1 -
 drivers/usb/host/xhci.c         |  2 --
 include/linux/timer.h           |  4 ----
 kernel/time/timer.c             | 19 -------------------
 lib/random32.c                  |  1 -
 8 files changed, 1 insertion(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/block/genhd.c b/block/genhd.c
index 9f42526b4d62..f06d7f3b075b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1523,12 +1523,7 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
 	if (--ev->block)
 		goto out_unlock;
 
-	/*
-	 * Not exactly a latency critical operation, set poll timer
-	 * slack to 25% and kick event check.
-	 */
 	intv = disk_events_poll_jiffies(disk);
-	set_timer_slack(&ev->dwork.timer, intv / 4);
 	if (check_now)
 		queue_delayed_work(system_freezable_power_efficient_wq,
 				&ev->dwork, 0);
diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c
index 03ddf0ecf402..684087db170b 100644
--- a/drivers/mmc/host/jz4740_mmc.c
+++ b/drivers/mmc/host/jz4740_mmc.c
@@ -1068,8 +1068,6 @@ static int jz4740_mmc_probe(struct platform_device* pdev)
 	jz4740_mmc_clock_disable(host);
 	setup_timer(&host->timeout_timer, jz4740_mmc_timeout,
 			(unsigned long)host);
-	/* It is not important when it times out, it just needs to timeout. */
-	set_timer_slack(&host->timeout_timer, HZ);
 
 	host->use_dma = true;
 	if (host->use_dma && jz4740_mmc_acquire_dma_channels(host) != 0)
diff --git a/drivers/power/bq27xxx_battery.c b/drivers/power/bq27xxx_battery.c
index 45f6ebf88df6..e90b3f307e0f 100644
--- a/drivers/power/bq27xxx_battery.c
+++ b/drivers/power/bq27xxx_battery.c
@@ -735,11 +735,8 @@ static void bq27xxx_battery_poll(struct work_struct *work)
 
 	bq27xxx_battery_update(di);
 
-	if (poll_interval > 0) {
-		/* The timer does not have to be accurate. */
-		set_timer_slack(&di->work.timer, poll_interval * HZ / 4);
+	if (poll_interval > 0)
 		schedule_delayed_work(&di->work, poll_interval * HZ);
-	}
 }
 
 /*
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index 0449235d4f22..1700908b84ef 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -500,7 +500,6 @@ static int ohci_init (struct ohci_hcd *ohci)
 
 	setup_timer(&ohci->io_watchdog, io_watchdog_func,
 			(unsigned long) ohci);
-	set_timer_slack(&ohci->io_watchdog, msecs_to_jiffies(20));
 
 	ohci->hcca = dma_alloc_coherent (hcd->self.controller,
 			sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL);
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index f2f9518c53ab..a986fe763d87 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -490,8 +490,6 @@ static void compliance_mode_recovery_timer_init(struct xhci_hcd *xhci)
 	xhci->comp_mode_recovery_timer.expires = jiffies +
 			msecs_to_jiffies(COMP_MODE_RCVRY_MSECS);
 
-	set_timer_slack(&xhci->comp_mode_recovery_timer,
-			msecs_to_jiffies(COMP_MODE_RCVRY_MSECS));
 	add_timer(&xhci->comp_mode_recovery_timer);
 	xhci_dbg_trace(xhci, trace_xhci_dbg_quirks,
 			"Compliance mode recovery timer initialized");
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 5869ab9848fe..4419506b564e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -19,7 +19,6 @@ struct timer_list {
 	void			(*function)(unsigned long);
 	unsigned long		data;
 	u32			flags;
-	int			slack;
 
 #ifdef CONFIG_TIMER_STATS
 	int			start_pid;
@@ -73,7 +72,6 @@ struct timer_list {
 		.expires = (_expires),				\
 		.data = (_data),				\
 		.flags = (_flags),				\
-		.slack = -1,					\
 		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
 			__FILE__ ":" __stringify(__LINE__))	\
 	}
@@ -193,8 +191,6 @@ extern int del_timer(struct timer_list * timer);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
 
-extern void set_timer_slack(struct timer_list *time, int slack_hz);
-
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 86e95b72665d..a83e23d0bc25 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -447,24 +447,6 @@ unsigned long round_jiffies_up_relative(unsigned long j)
 }
 EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
 
-/**
- * set_timer_slack - set the allowed slack for a timer
- * @timer: the timer to be modified
- * @slack_hz: the amount of time (in jiffies) allowed for rounding
- *
- * Set the amount of time, in jiffies, that a certain timer has
- * in terms of slack. By setting this value, the timer subsystem
- * will schedule the actual timer somewhere between
- * the time mod_timer() asks for, and that time plus the slack.
- *
- * By setting the slack to -1, a percentage of the delay is used
- * instead.
- */
-void set_timer_slack(struct timer_list *timer, int slack_hz)
-{
-	timer->slack = slack_hz;
-}
-EXPORT_SYMBOL_GPL(set_timer_slack);
 
 static inline unsigned int timer_get_idx(struct timer_list *timer)
 {
@@ -775,7 +757,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
 {
 	timer->entry.pprev = NULL;
 	timer->flags = flags | raw_smp_processor_id();
-	timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
 	timer->start_site = NULL;
 	timer->start_pid = -1;
diff --git a/lib/random32.c b/lib/random32.c
index 510d1ce7d4d2..69ed593aab07 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -233,7 +233,6 @@ static void __prandom_timer(unsigned long dontcare)
 
 static void __init __prandom_start_seed_timer(void)
 {
-	set_timer_slack(&seed_timer, HZ);
 	seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC);
 	add_timer(&seed_timer);
 }
-- 
cgit v1.3-8-gc7d7


From 73420fea80c6c376d91a69defe64013baa0d7e95 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:33 +0000
Subject: timers: Move __run_timers() function

Move __run_timers() below __next_timer_interrupt() and next_pending_bucket()
in preparation for __run_timers() NOHZ optimization.

No functional change.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.271872665@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/timer.c | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a83e23d0bc25..c16c48de01c5 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1277,32 +1277,6 @@ static int collect_expired_timers(struct timer_base *base,
 	return levels;
 }
 
-/**
- * __run_timers - run all expired timers (if any) on this CPU.
- * @base: the timer vector to be processed.
- */
-static inline void __run_timers(struct timer_base *base)
-{
-	struct hlist_head heads[LVL_DEPTH];
-	int levels;
-
-	if (!time_after_eq(jiffies, base->clk))
-		return;
-
-	spin_lock_irq(&base->lock);
-
-	while (time_after_eq(jiffies, base->clk)) {
-
-		levels = collect_expired_timers(base, heads);
-		base->clk++;
-
-		while (levels--)
-			expire_timers(base, heads + levels);
-	}
-	base->running_timer = NULL;
-	spin_unlock_irq(&base->lock);
-}
-
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * Find the next pending bucket of a level. Search from @offset + @clk upwards
@@ -1472,6 +1446,32 @@ void update_process_times(int user_tick)
 	run_posix_cpu_timers(p);
 }
 
+/**
+ * __run_timers - run all expired timers (if any) on this CPU.
+ * @base: the timer vector to be processed.
+ */
+static inline void __run_timers(struct timer_base *base)
+{
+	struct hlist_head heads[LVL_DEPTH];
+	int levels;
+
+	if (!time_after_eq(jiffies, base->clk))
+		return;
+
+	spin_lock_irq(&base->lock);
+
+	while (time_after_eq(jiffies, base->clk)) {
+
+		levels = collect_expired_timers(base, heads);
+		base->clk++;
+
+		while (levels--)
+			expire_timers(base, heads + levels);
+	}
+	base->running_timer = NULL;
+	spin_unlock_irq(&base->lock);
+}
+
 /*
  * This function runs timers and the timer-tq in bottom half context.
  */
-- 
cgit v1.3-8-gc7d7


From 236968383cf5cd48835ff0d8a265e299e220d140 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:34 +0000
Subject: timers: Optimize collect_expired_timers() for NOHZ

After a NOHZ idle sleep the timer wheel must be forwarded to current jiffies.
There might be expired timers so the current code loops and checks the expired
buckets for timers. This can take quite some time for long NOHZ idle periods.

The pending bitmask in the timer base allows us to do a quick search for the
next expiring timer and therefore a fast forward of the base time which
prevents pointless long lasting loops.

For a 3 seconds idle sleep this reduces the catchup time from ~1ms to 5us.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.351296290@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/timer.c | 49 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index c16c48de01c5..658051c97a3c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1252,8 +1252,8 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 	}
 }
 
-static int collect_expired_timers(struct timer_base *base,
-				  struct hlist_head *heads)
+static int __collect_expired_timers(struct timer_base *base,
+				    struct hlist_head *heads)
 {
 	unsigned long clk = base->clk;
 	struct hlist_head *vec;
@@ -1279,9 +1279,9 @@ static int collect_expired_timers(struct timer_base *base,
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * Find the next pending bucket of a level. Search from @offset + @clk upwards
- * and if nothing there, search from start of the level (@offset) up to
- * @offset + clk.
+ * Find the next pending bucket of a level. Search from level start (@offset)
+ * + @clk upwards and if nothing there, search from start of the level
+ * (@offset) up to @offset + clk.
  */
 static int next_pending_bucket(struct timer_base *base, unsigned offset,
 			       unsigned clk)
@@ -1298,14 +1298,14 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
 }
 
 /*
- * Search the first expiring timer in the various clock levels.
+ * Search the first expiring timer in the various clock levels. Caller must
+ * hold base->lock.
  */
 static unsigned long __next_timer_interrupt(struct timer_base *base)
 {
 	unsigned long clk, next, adj;
 	unsigned lvl, offset = 0;
 
-	spin_lock(&base->lock);
 	next = base->clk + NEXT_TIMER_MAX_DELTA;
 	clk = base->clk;
 	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
@@ -1358,7 +1358,6 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
 		clk >>= LVL_CLK_SHIFT;
 		clk += adj;
 	}
-	spin_unlock(&base->lock);
 	return next;
 }
 
@@ -1416,7 +1415,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	if (cpu_is_offline(smp_processor_id()))
 		return expires;
 
+	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
+	spin_unlock(&base->lock);
+
 	if (time_before_eq(nextevt, basej))
 		expires = basem;
 	else
@@ -1424,6 +1426,37 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
+
+static int collect_expired_timers(struct timer_base *base,
+				  struct hlist_head *heads)
+{
+	/*
+	 * NOHZ optimization. After a long idle sleep we need to forward the
+	 * base to current jiffies. Avoid a loop by searching the bitfield for
+	 * the next expiring timer.
+	 */
+	if ((long)(jiffies - base->clk) > 2) {
+		unsigned long next = __next_timer_interrupt(base);
+
+		/*
+		 * If the next timer is ahead of time forward to current
+		 * jiffies, otherwise forward to the next expiry time.
+		 */
+		if (time_after(next, jiffies)) {
+			/* The call site will increment clock! */
+			base->clk = jiffies - 1;
+			return 0;
+		}
+		base->clk = next;
+	}
+	return __collect_expired_timers(base, heads);
+}
+#else
+static inline int collect_expired_timers(struct timer_base *base,
+					 struct hlist_head *heads)
+{
+	return __collect_expired_timers(base, heads);
+}
 #endif
 
 /*
-- 
cgit v1.3-8-gc7d7


From ff00673292bd42a3688b33de47252a6a3c3f424c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:35 +0000
Subject: timers/nohz: Remove pointless tick_nohz_kick_tick() function

This was a failed attempt to optimize the timer expiry in idle, which was
disabled and never revisited. Remove the cruft.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.431073782@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 33 +--------------------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 536ada80f6dd..69abc7bfe80f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1092,35 +1092,6 @@ static void tick_nohz_switch_to_nohz(void)
 	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
 
-/*
- * When NOHZ is enabled and the tick is stopped, we need to kick the
- * tick timer from irq_enter() so that the jiffies update is kept
- * alive during long running softirqs. That's ugly as hell, but
- * correctness is key even if we need to fix the offending softirq in
- * the first place.
- *
- * Note, this is different to tick_nohz_restart. We just kick the
- * timer and do not touch the other magic bits which need to be done
- * when idle is left.
- */
-static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
-{
-#if 0
-	/* Switch back to 2.6.27 behaviour */
-	ktime_t delta;
-
-	/*
-	 * Do not touch the tick device, when the next expiry is either
-	 * already reached or less/equal than the tick period.
-	 */
-	delta =	ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
-	if (delta.tv64 <= tick_period.tv64)
-		return;
-
-	tick_nohz_restart(ts, now);
-#endif
-}
-
 static inline void tick_nohz_irq_enter(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1131,10 +1102,8 @@ static inline void tick_nohz_irq_enter(void)
 	now = ktime_get();
 	if (ts->idle_active)
 		tick_nohz_stop_idle(ts, now);
-	if (ts->tick_stopped) {
+	if (ts->tick_stopped)
 		tick_nohz_update_jiffies(now);
-		tick_nohz_kick_tick(ts, now);
-	}
 }
 
 #else
-- 
cgit v1.3-8-gc7d7


From a683f390b93f4d1292f849fc48d28e322046120f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:36 +0000
Subject: timers: Forward the wheel clock whenever possible

The wheel clock is stale when a CPU goes into a long idle sleep. This has the
side effect that timers which are queued end up in the outer wheel levels.
That results in coarser granularity.

To solve this, we keep track of the idle state and forward the wheel clock
whenever possible.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.512039360@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-internal.h |   1 +
 kernel/time/tick-sched.c    |  12 +++++
 kernel/time/timer.c         | 128 ++++++++++++++++++++++++++++++++++++--------
 3 files changed, 120 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 966a5a6fdd0a..f738251000fe 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { }
 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
 
 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
+void timer_clear_idle(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69abc7bfe80f..5d81f9aa30d2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	delta = next_tick - basemono;
 	if (delta <= (u64)TICK_NSEC) {
 		tick.tv64 = 0;
+
+		/*
+		 * Tell the timer code that the base is not idle, i.e. undo
+		 * the effect of get_next_timer_interrupt():
+		 */
+		timer_clear_idle();
 		/*
 		 * We've not stopped the tick yet, and there's a timer in the
 		 * next period, so no point in stopping it either, bail.
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 	tick_do_update_jiffies64(now);
 	cpu_load_update_nohz_stop();
 
+	/*
+	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
+	 * the clock forward checks in the enqueue path:
+	 */
+	timer_clear_idle();
+
 	calc_load_exit_idle();
 	touch_softlockup_watchdog_sched();
 	/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 658051c97a3c..9339d71ee998 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -196,9 +196,11 @@ struct timer_base {
 	spinlock_t		lock;
 	struct timer_list	*running_timer;
 	unsigned long		clk;
+	unsigned long		next_expiry;
 	unsigned int		cpu;
 	bool			migration_enabled;
 	bool			nohz_active;
+	bool			is_idle;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -519,24 +521,37 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer
 {
 	__internal_add_timer(base, timer);
 
+	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+		return;
+
 	/*
-	 * Check whether the other CPU is in dynticks mode and needs
-	 * to be triggered to reevaluate the timer wheel.  We are
-	 * protected against the other CPU fiddling with the timer by
-	 * holding the timer base lock. This also makes sure that a
-	 * CPU on the way to stop its tick can not evaluate the timer
-	 * wheel.
-	 *
-	 * Spare the IPI for deferrable timers on idle targets though.
-	 * The next busy ticks will take care of it. Except full dynticks
-	 * require special care against races with idle_cpu(), lets deal
-	 * with that later.
+	 * TODO: This wants some optimizing similar to the code below, but we
+	 * will do that when we switch from push to pull for deferrable timers.
 	 */
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
-		if (!(timer->flags & TIMER_DEFERRABLE) ||
-		    tick_nohz_full_cpu(base->cpu))
+	if (timer->flags & TIMER_DEFERRABLE) {
+		if (tick_nohz_full_cpu(base->cpu))
 			wake_up_nohz_cpu(base->cpu);
+		return;
 	}
+
+	/*
+	 * We might have to IPI the remote CPU if the base is idle and the
+	 * timer is not deferrable. If the other CPU is on the way to idle
+	 * then it can't set base->is_idle as we hold the base lock:
+	 */
+	if (!base->is_idle)
+		return;
+
+	/* Check whether this is the new first expiring timer: */
+	if (time_after_eq(timer->expires, base->next_expiry))
+		return;
+
+	/*
+	 * Set the next expiry time and kick the CPU so it can reevaluate the
+	 * wheel:
+	 */
+	base->next_expiry = timer->expires;
+	wake_up_nohz_cpu(base->cpu);
 }
 
 #ifdef CONFIG_TIMER_STATS
@@ -844,10 +859,11 @@ static inline struct timer_base *get_timer_base(u32 tflags)
 	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
 }
 
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						 unsigned tflags)
+#ifdef CONFIG_NO_HZ_COMMON
+static inline struct timer_base *
+__get_target_base(struct timer_base *base, unsigned tflags)
 {
-#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
 		return get_timer_this_cpu_base(tflags);
 	return get_timer_cpu_base(tflags, get_nohz_timer_target());
@@ -856,6 +872,43 @@ static inline struct timer_base *get_target_base(struct timer_base *base,
 #endif
 }
 
+static inline void forward_timer_base(struct timer_base *base)
+{
+	/*
+	 * We only forward the base when it's idle and we have a delta between
+	 * base clock and jiffies.
+	 */
+	if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+		return;
+
+	/*
+	 * If the next expiry value is > jiffies, then we fast forward to
+	 * jiffies otherwise we forward to the next expiry value.
+	 */
+	if (time_after(base->next_expiry, jiffies))
+		base->clk = jiffies;
+	else
+		base->clk = base->next_expiry;
+}
+#else
+static inline struct timer_base *
+__get_target_base(struct timer_base *base, unsigned tflags)
+{
+	return get_timer_this_cpu_base(tflags);
+}
+
+static inline void forward_timer_base(struct timer_base *base) { }
+#endif
+
+static inline struct timer_base *
+get_target_base(struct timer_base *base, unsigned tflags)
+{
+	struct timer_base *target = __get_target_base(base, tflags);
+
+	forward_timer_base(target);
+	return target;
+}
+
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
  * that all timers which are tied to this base are locked, and the base itself
@@ -1417,16 +1470,49 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
-	spin_unlock(&base->lock);
+	base->next_expiry = nextevt;
+	/*
+	 * We have a fresh next event. Check whether we can forward the base:
+	 */
+	if (time_after(nextevt, jiffies))
+		base->clk = jiffies;
+	else if (time_after(nextevt, base->clk))
+		base->clk = nextevt;
 
-	if (time_before_eq(nextevt, basej))
+	if (time_before_eq(nextevt, basej)) {
 		expires = basem;
-	else
+		base->is_idle = false;
+	} else {
 		expires = basem + (nextevt - basej) * TICK_NSEC;
+		/*
+		 * If we expect to sleep more than a tick, mark the base idle:
+		 */
+		if ((expires - basem) > TICK_NSEC)
+			base->is_idle = true;
+	}
+	spin_unlock(&base->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
 
+/**
+ * timer_clear_idle - Clear the idle state of the timer base
+ *
+ * Called with interrupts disabled
+ */
+void timer_clear_idle(void)
+{
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+	/*
+	 * We do this unlocked. The worst outcome is a remote enqueue sending
+	 * a pointless IPI, but taking the lock would just make the window for
+	 * sending the IPI a few instructions smaller for the cost of taking
+	 * the lock in the exit from idle path.
+	 */
+	base->is_idle = false;
+}
+
 static int collect_expired_timers(struct timer_base *base,
 				  struct hlist_head *heads)
 {
@@ -1440,7 +1526,7 @@ static int collect_expired_timers(struct timer_base *base,
 
 		/*
 		 * If the next timer is ahead of time forward to current
-		 * jiffies, otherwise forward to the next expiry time.
+		 * jiffies, otherwise forward to the next expiry time:
 		 */
 		if (time_after(next, jiffies)) {
 			/* The call site will increment clock! */
-- 
cgit v1.3-8-gc7d7


From 4e85876a9d2a977b4a07389da8c07edf76d10825 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:37 +0000
Subject: timers: Only wake softirq if necessary

With the wheel forwading in place and with the HZ=1000 4ms folding we can
avoid running the softirq at all.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.607650550@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/timer.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 9339d71ee998..8d830f1f6a6a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1608,7 +1608,18 @@ static void run_timer_softirq(struct softirq_action *h)
  */
 void run_local_timers(void)
 {
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
 	hrtimer_run_queues();
+	/* Raise the softirq only if required. */
+	if (time_before(jiffies, base->clk)) {
+		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+			return;
+		/* CPU is awake, so check the deferrable base. */
+		base++;
+		if (time_before(jiffies, base->clk))
+			return;
+	}
 	raise_softirq(TIMER_SOFTIRQ);
 }
 
-- 
cgit v1.3-8-gc7d7


From ffdf047728f8f93df896b58049c7513856027141 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:39 +0000
Subject: timers: Split out index calculation

For further optimizations we need to seperate index calculation
from queueing. No functional change.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.691159619@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/timer.c | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8d830f1f6a6a..8d7c23e55c85 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -471,12 +471,9 @@ static inline unsigned calc_index(unsigned expires, unsigned lvl)
 	return LVL_OFFS(lvl) + (expires & LVL_MASK);
 }
 
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static int calc_wheel_index(unsigned long expires, unsigned long clk)
 {
-	unsigned long expires = timer->expires;
-	unsigned long delta = expires - base->clk;
-	struct hlist_head *vec;
+	unsigned long delta = expires - clk;
 	unsigned int idx;
 
 	if (delta < LVL_START(1)) {
@@ -496,7 +493,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
 	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
 		idx = calc_index(expires, 7);
 	} else if ((long) delta < 0) {
-		idx = base->clk & LVL_MASK;
+		idx = clk & LVL_MASK;
 	} else {
 		/*
 		 * Force expire obscene large timeouts to expire at the
@@ -507,20 +504,33 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
 
 		idx = calc_index(expires, LVL_DEPTH - 1);
 	}
-	/*
-	 * Enqueue the timer into the array bucket, mark it pending in
-	 * the bitmap and store the index in the timer flags.
-	 */
-	vec = base->vectors + idx;
-	hlist_add_head(&timer->entry, vec);
+	return idx;
+}
+
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap and store the index in the timer flags.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+			  unsigned int idx)
+{
+	hlist_add_head(&timer->entry, base->vectors + idx);
 	__set_bit(idx, base->pending_map);
 	timer_set_idx(timer, idx);
 }
 
-static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static void
+__internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
-	__internal_add_timer(base, timer);
+	unsigned int idx;
+
+	idx = calc_wheel_index(timer->expires, base->clk);
+	enqueue_timer(base, timer, idx);
+}
 
+static void
+trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
+{
 	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
 		return;
 
@@ -551,7 +561,14 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer
 	 * wheel:
 	 */
 	base->next_expiry = timer->expires;
-	wake_up_nohz_cpu(base->cpu);
+		wake_up_nohz_cpu(base->cpu);
+}
+
+static void
+internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+	__internal_add_timer(base, timer);
+	trigger_dyntick_cpu(base, timer);
 }
 
 #ifdef CONFIG_TIMER_STATS
-- 
cgit v1.3-8-gc7d7


From f00c0afdfa625165a609513bc74164d56752ec3e Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Mon, 4 Jul 2016 09:50:40 +0000
Subject: timers: Implement optimization for same expiry time in mod_timer()

The existing optimization for same expiry time in mod_timer() checks whether
the timer expiry time is the same as the new requested expiry time. In the old
timer wheel implementation this does not take the slack batching into account,
neither does the new implementation evaluate whether the new expiry time will
requeue the timer to the same bucket.

To optimize that, we can calculate the resulting bucket and check if the new
expiry time is different from the current expiry time. This calculation
happens outside the base lock held region. If the resulting bucket is the same
we can avoid taking the base lock and requeueing the timer.

If the timer needs to be requeued then we have to check under the base lock
whether the base time has changed between the lockless calculation and taking
the lock. If it has changed we need to recalculate under the lock.

This optimization takes effect for timers which are enqueued into the less
granular wheel levels (1 and above). With a simple test case the functionality
has been verified:

            Before        After
 Match:       5.5%        86.6%
 Requeue:    94.5%        13.4%
 Recalc:                  <0.01%

In the non optimized case the timer is requeued in 94.5% of the cases. With
the index optimization in place the requeue rate drops to 13.4%. The case
where the lockless index calculation has to be redone is less than 0.01%.

With a real world test case (networking) we observed the following changes:

            Before        After
 Match:      97.8%        99.7%
 Requeue:     2.2%         0.3%
 Recalc:                  <0.001%

That means two percent fewer lock/requeue/unlock operations done in one of
the hot path use cases of timers.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: George Spelvin <linux@sciencehorizons.net>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160704094342.778527749@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/timer.c | 51 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8d7c23e55c85..8f29abeb8c4d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -960,28 +960,36 @@ static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
 	struct timer_base *base, *new_base;
-	unsigned long flags;
+	unsigned int idx = UINT_MAX;
+	unsigned long clk = 0, flags;
 	int ret = 0;
 
 	/*
-	 * TODO: Calculate the array bucket of the timer right here w/o
-	 * holding the base lock. This allows to check not only
-	 * timer->expires == expires below, but also whether the timer
-	 * ends up in the same bucket. If we really need to requeue
-	 * the timer then we check whether base->clk have
-	 * advanced between here and locking the timer base. If
-	 * jiffies advanced we have to recalc the array bucket with the
-	 * lock held.
-	 */
-
-	/*
-	 * This is a common optimization triggered by the
-	 * networking code - if the timer is re-modified
-	 * to be the same thing then just return:
+	 * This is a common optimization triggered by the networking code - if
+	 * the timer is re-modified to have the same timeout or ends up in the
+	 * same array bucket then just return:
 	 */
 	if (timer_pending(timer)) {
 		if (timer->expires == expires)
 			return 1;
+		/*
+		 * Take the current timer_jiffies of base, but without holding
+		 * the lock!
+		 */
+		base = get_timer_base(timer->flags);
+		clk = base->clk;
+
+		idx = calc_wheel_index(expires, clk);
+
+		/*
+		 * Retrieve and compare the array index of the pending
+		 * timer. If it matches set the expiry to the new value so a
+		 * subsequent call will exit in the expires check above.
+		 */
+		if (idx == timer_get_idx(timer)) {
+			timer->expires = expires;
+			return 1;
+		}
 	}
 
 	timer_stats_timer_set_start_info(timer);
@@ -1018,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	}
 
 	timer->expires = expires;
-	internal_add_timer(base, timer);
+	/*
+	 * If 'idx' was calculated above and the base time did not advance
+	 * between calculating 'idx' and taking the lock, only enqueue_timer()
+	 * and trigger_dyntick_cpu() is required. Otherwise we need to
+	 * (re)calculate the wheel index via internal_add_timer().
+	 */
+	if (idx != UINT_MAX && clk == base->clk) {
+		enqueue_timer(base, timer, idx);
+		trigger_dyntick_cpu(base, timer);
+	} else {
+		internal_add_timer(base, timer);
+	}
 
 out_unlock:
 	spin_unlock_irqrestore(&base->lock, flags);
-- 
cgit v1.3-8-gc7d7


From 606274c5abd8e245add01bc7145a8cbb92b69ba8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Jul 2016 22:38:36 -0700
Subject: bpf: introduce bpf_get_current_task() helper

over time there were multiple requests to access different data
structures and fields of task_struct current, so finally add
the helper to access 'current' as-is. Tracing bpf programs will do
the rest of walking the pointers via bpf_probe_read().
Note that current can be null and bpf program has to deal it with,
but even dumb passing null into bpf_probe_read() is still safe.

Suggested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  7 +++++++
 kernel/trace/bpf_trace.c | 13 +++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c14ca1cd6297..262a7e883b19 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -357,6 +357,13 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_get_hash_recalc,
 
+	/**
+	 * u64 bpf_get_current_task(void)
+	 * Returns current task_struct
+	 * Return: current
+	 */
+	BPF_FUNC_get_current_task,
+
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 19c5b4a5c3eb..094c716154ed 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -312,6 +312,17 @@ const struct bpf_func_proto *bpf_get_event_output_proto(void)
 	return &bpf_event_output_proto;
 }
 
+static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return (long) current;
+}
+
+static const struct bpf_func_proto bpf_get_current_task_proto = {
+	.func		= bpf_get_current_task,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+};
+
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -329,6 +340,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_get_current_pid_tgid:
 		return &bpf_get_current_pid_tgid_proto;
+	case BPF_FUNC_get_current_task:
+		return &bpf_get_current_task_proto;
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
 	case BPF_FUNC_get_current_comm:
-- 
cgit v1.3-8-gc7d7


From 9acacc2ac525ef1397af63b15cef7bb77a823c06 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Jun 2016 17:37:18 +0800
Subject: sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums

These two types have similar function, no need to separate them.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/436748885270d64363c7dc67167507d486c2057a.1466415271.git.zhaolei@cn.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpuacct.c | 47 ++++++++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0938..74241eb5f3ff 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
 	CPUACCT_STAT_NSTATS,
 };
 
-enum cpuacct_usage_index {
-	CPUACCT_USAGE_USER,	/* ... user mode */
-	CPUACCT_USAGE_SYSTEM,	/* ... kernel mode */
-
-	CPUACCT_USAGE_NRUSAGE,
+static const char * const cpuacct_stat_desc[] = {
+	[CPUACCT_STAT_USER] = "user",
+	[CPUACCT_STAT_SYSTEM] = "system",
 };
 
 struct cpuacct_usage {
-	u64	usages[CPUACCT_USAGE_NRUSAGE];
+	u64	usages[CPUACCT_STAT_NSTATS];
 };
 
 /* track cpu usage of a group of tasks and its child groups */
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
 }
 
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
-				 enum cpuacct_usage_index index)
+				 enum cpuacct_stat_index index)
 {
 	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 	u64 data;
 
 	/*
-	 * We allow index == CPUACCT_USAGE_NRUSAGE here to read
+	 * We allow index == CPUACCT_STAT_NSTATS here to read
 	 * the sum of suages.
 	 */
-	BUG_ON(index > CPUACCT_USAGE_NRUSAGE);
+	BUG_ON(index > CPUACCT_STAT_NSTATS);
 
 #ifndef CONFIG_64BIT
 	/*
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
 #endif
 
-	if (index == CPUACCT_USAGE_NRUSAGE) {
+	if (index == CPUACCT_STAT_NSTATS) {
 		int i = 0;
 
 		data = 0;
-		for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+		for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
 			data += cpuusage->usages[i];
 	} else {
 		data = cpuusage->usages[index];
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
 #endif
 
-	for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
 		cpuusage->usages[i] = val;
 
 #ifndef CONFIG_64BIT
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 __cpuusage_read(struct cgroup_subsys_state *css,
-			   enum cpuacct_usage_index index)
+			   enum cpuacct_stat_index index)
 {
 	struct cpuacct *ca = css_ca(css);
 	u64 totalcpuusage = 0;
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
 static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
 			      struct cftype *cft)
 {
-	return __cpuusage_read(css, CPUACCT_USAGE_USER);
+	return __cpuusage_read(css, CPUACCT_STAT_USER);
 }
 
 static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
 			     struct cftype *cft)
 {
-	return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM);
+	return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
 }
 
 static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
-	return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE);
+	return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
 }
 
 static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
 }
 
 static int __cpuacct_percpu_seq_show(struct seq_file *m,
-				     enum cpuacct_usage_index index)
+				     enum cpuacct_stat_index index)
 {
 	struct cpuacct *ca = css_ca(seq_css(m));
 	u64 percpu;
@@ -229,24 +227,19 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
 
 static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
 {
-	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER);
+	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
 }
 
 static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
 {
-	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM);
+	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
 }
 
 static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
 {
-	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
+	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
 }
 
-static const char * const cpuacct_stat_desc[] = {
-	[CPUACCT_STAT_USER] = "user",
-	[CPUACCT_STAT_SYSTEM] = "system",
-};
-
 static int cpuacct_stats_show(struct seq_file *sf, void *v)
 {
 	struct cpuacct *ca = css_ca(seq_css(sf));
@@ -316,11 +309,11 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
-	int index = CPUACCT_USAGE_SYSTEM;
+	int index = CPUACCT_STAT_SYSTEM;
 	struct pt_regs *regs = task_pt_regs(tsk);
 
 	if (regs && user_mode(regs))
-		index = CPUACCT_USAGE_USER;
+		index = CPUACCT_STAT_USER;
 
 	rcu_read_lock();
 
-- 
cgit v1.3-8-gc7d7


From 8e546bfafb3121ed25c73a0c02311ec58459344a Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Jun 2016 17:37:19 +0800
Subject: sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show()

In cpuacct_stats_show() we currently we have copies of similar code,
for each cpustat(system/user) variant.

Use a loop instead to consolidate the code. This will also work better
if we extend the CPUACCT_STAT_NSTATS type.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/b0597d4224655e9f333f1a6224ed9654c7d7d36a.1466415271.git.zhaolei@cn.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpuacct.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 74241eb5f3ff..677cd1ab33b7 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -243,27 +243,26 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
 static int cpuacct_stats_show(struct seq_file *sf, void *v)
 {
 	struct cpuacct *ca = css_ca(seq_css(sf));
+	s64 val[CPUACCT_STAT_NSTATS];
 	int cpu;
-	s64 val = 0;
+	int stat;
 
+	memset(val, 0, sizeof(val));
 	for_each_possible_cpu(cpu) {
-		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_USER];
-		val += kcpustat->cpustat[CPUTIME_NICE];
-	}
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
 
-	val = 0;
-	for_each_possible_cpu(cpu) {
-		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_SYSTEM];
-		val += kcpustat->cpustat[CPUTIME_IRQ];
-		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
+		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
+		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
+		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
+		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
 	}
 
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+	for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
+		seq_printf(sf, "%s %lld\n",
+			   cpuacct_stat_desc[stat],
+			   cputime64_to_clock_t(val[stat]));
+	}
 
 	return 0;
 }
-- 
cgit v1.3-8-gc7d7


From 277a13e4f0d661678a7084bf97ed96a99c7dac21 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Jun 2016 17:37:20 +0800
Subject: sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats
 together

In current code, we can get cpuacct data from several files,
but each file has various limitations.

For example:

 - We can get CPU usage in user and kernel mode via cpuacct.stat,
   but we can't get detailed data about each CPU.

 - We can get each CPU's kernel mode usage in cpuacct.usage_percpu_sys,
   but we can't get user mode usage data at the same time.

This patch introduces cpuacct.usage_all, to show all detailed CPU
accounting data together:

 # cat cpuacct.usage_all
 cpu user system
 0 3809760299 5807968992
 1 3250329855 454612211
 ..

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/7744460969edd7caaf0e903592ee52353ed9bdd6.1466415271.git.zhaolei@cn.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpuacct.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 677cd1ab33b7..bc0b309c3f19 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -240,6 +240,42 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
 	return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
 }
 
+static int cpuacct_all_seq_show(struct seq_file *m, void *V)
+{
+	struct cpuacct *ca = css_ca(seq_css(m));
+	int index;
+	int cpu;
+
+	seq_puts(m, "cpu");
+	for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+		seq_printf(m, " %s", cpuacct_stat_desc[index]);
+	seq_puts(m, "\n");
+
+	for_each_possible_cpu(cpu) {
+		struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+		seq_printf(m, "%d", cpu);
+
+		for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
+#ifndef CONFIG_64BIT
+			/*
+			 * Take rq->lock to make 64-bit read safe on 32-bit
+			 * platforms.
+			 */
+			raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+			seq_printf(m, " %llu", cpuusage->usages[index]);
+
+#ifndef CONFIG_64BIT
+			raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+		}
+		seq_puts(m, "\n");
+	}
+	return 0;
+}
+
 static int cpuacct_stats_show(struct seq_file *sf, void *v)
 {
 	struct cpuacct *ca = css_ca(seq_css(sf));
@@ -293,6 +329,10 @@ static struct cftype files[] = {
 		.name = "usage_percpu_sys",
 		.seq_show = cpuacct_percpu_sys_seq_show,
 	},
+	{
+		.name = "usage_all",
+		.seq_show = cpuacct_all_seq_show,
+	},
 	{
 		.name = "stat",
 		.seq_show = cpuacct_stats_show,
-- 
cgit v1.3-8-gc7d7


From 2f88e41a22ccfa95291c4df573f8ed4c6a71f29b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 6 Jul 2016 02:40:56 +0200
Subject: PM / hibernate: Add missing braces in hibernate_setup()

Make hibernate_setup() follow the coding style more closely by adding
some missing braces to the if () statement in it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 126e24caa82e..b00f270d328e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1119,11 +1119,11 @@ static int __init resume_offset_setup(char *str)
 
 static int __init hibernate_setup(char *str)
 {
-	if (!strncmp(str, "noresume", 8))
+	if (!strncmp(str, "noresume", 8)) {
 		noresume = 1;
-	else if (!strncmp(str, "nocompress", 10))
+	} else if (!strncmp(str, "nocompress", 10)) {
 		nocompress = 1;
-	else if (!strncmp(str, "no", 2)) {
+	} else if (!strncmp(str, "no", 2)) {
 		noresume = 1;
 		nohibernate = 1;
 	}
-- 
cgit v1.3-8-gc7d7


From efd5a85242e996275ebf3df71013beabd723bda3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 6 Jul 2016 23:42:46 +0200
Subject: PM / hibernate: Clean up function headers in snapshot.c

The formatting of some function headers in kernel/power/snapshot.c
is not consistent with the general kernel coding style and with the
formatting of some other function headers in the same file.

Make all of them follow the same formatting convention.

No functional changes.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 93 ++++++++++++++++++++++---------------------------
 1 file changed, 42 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 94b6fe6c9ae3..1fe0ddb6fd0d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -186,8 +186,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
 	__free_page(page);
 }
 
-static inline void
-free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+static inline void free_list_of_pages(struct linked_page *list,
+				      int clear_page_nosave)
 {
 	while (list) {
 		struct linked_page *lp = list->next;
@@ -219,8 +219,8 @@ struct chain_allocator {
 	int safe_needed;	/* if set, only "safe" pages are allocated */
 };
 
-static void
-chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask,
+		       int safe_needed)
 {
 	ca->chain = NULL;
 	ca->used_space = LINKED_PAGE_DATA_SIZE;
@@ -452,10 +452,11 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
  *	This function also allocated and builds the radix tree for the
  *	zone.
  */
-static struct mem_zone_bm_rtree *
-create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
-		     struct chain_allocator *ca,
-		     unsigned long start, unsigned long end)
+static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
+						      int safe_needed,
+						      struct chain_allocator *ca,
+						      unsigned long start,
+						      unsigned long end)
 {
 	struct mem_zone_bm_rtree *zone;
 	unsigned int i, nr_blocks;
@@ -595,8 +596,8 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 /**
   *	memory_bm_create - allocate memory for a memory bitmap
   */
-static int
-memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
+			    int safe_needed)
 {
 	struct chain_allocator ca;
 	struct list_head mem_extents;
@@ -894,9 +895,8 @@ static void memory_bm_recycle(struct memory_bitmap *bm)
  *	initialization code)
  */
 
-void __init
-__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
-			 int use_kmalloc)
+void __init __register_nosave_region(unsigned long start_pfn,
+				     unsigned long end_pfn, int use_kmalloc)
 {
 	struct nosave_region *region;
 
@@ -1277,8 +1277,7 @@ static void safe_copy_page(void *dst, struct page *s_page)
 
 
 #ifdef CONFIG_HIGHMEM
-static inline struct page *
-page_is_saveable(struct zone *zone, unsigned long pfn)
+static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
 {
 	return is_highmem(zone) ?
 		saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
@@ -1321,8 +1320,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 }
 #endif /* CONFIG_HIGHMEM */
 
-static void
-copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+static void copy_data_pages(struct memory_bitmap *copy_bm,
+			    struct memory_bitmap *orig_bm)
 {
 	struct zone *zone;
 	unsigned long pfn;
@@ -1485,8 +1484,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
 }
 
 static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
-						unsigned long highmem,
-						unsigned long total)
+						  unsigned long highmem,
+						  unsigned long total)
 {
 	unsigned long alloc = __fraction(nr_pages, highmem, total);
 
@@ -1499,8 +1498,8 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
 }
 
 static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
-						unsigned long highmem,
-						unsigned long total)
+							 unsigned long highmem,
+							 unsigned long total)
 {
 	return 0;
 }
@@ -1780,8 +1779,7 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
 	return nr_highmem;
 }
 #else
-static unsigned int
-count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -1823,8 +1821,8 @@ static inline int get_highmem_buffer(int safe_needed)
  *	highmem pages is lesser than that, allocate them all.
  */
 
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+					       unsigned int nr_highmem)
 {
 	unsigned int to_alloc = count_free_highmem_pages();
 
@@ -1843,8 +1841,8 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
 #else
 static inline int get_highmem_buffer(int safe_needed) { return 0; }
 
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+					       unsigned int n) { return 0; }
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -1859,9 +1857,9 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
  *	copy_data_pages() works.
  */
 
-static int
-swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
-		unsigned int nr_pages, unsigned int nr_highmem)
+static int swsusp_alloc(struct memory_bitmap *orig_bm,
+			struct memory_bitmap *copy_bm,
+			unsigned int nr_pages, unsigned int nr_highmem)
 {
 	if (nr_highmem > 0) {
 		if (get_highmem_buffer(PG_ANY))
@@ -1978,8 +1976,7 @@ static int init_header(struct swsusp_info *info)
  *	are stored in the array @buf[] (1 page at a time)
  */
 
-static inline void
-pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
@@ -2110,8 +2107,7 @@ static int check_header(struct swsusp_info *info)
  *	load header - check the image header and copy data from it
  */
 
-static int
-load_header(struct swsusp_info *info)
+static int load_header(struct swsusp_info *info)
 {
 	int error;
 
@@ -2204,8 +2200,8 @@ static unsigned int safe_highmem_pages;
 
 static struct memory_bitmap *safe_highmem_bm;
 
-static int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+static int prepare_highmem_image(struct memory_bitmap *bm,
+				 unsigned int *nr_highmem_p)
 {
 	unsigned int to_alloc;
 
@@ -2259,8 +2255,8 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
 
 static struct page *last_highmem_page;
 
-static void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static void *get_highmem_page_buffer(struct page *page,
+				     struct chain_allocator *ca)
 {
 	struct highmem_pbe *pbe;
 	void *kaddr;
@@ -2333,17 +2329,13 @@ static inline void free_highmem_data(void)
 		free_image_page(buffer, PG_UNSAFE_CLEAR);
 }
 #else
-static unsigned int
-count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
 
-static inline int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
-{
-	return 0;
-}
+static inline int prepare_highmem_image(struct memory_bitmap *bm,
+					unsigned int *nr_highmem_p) { return 0; }
 
-static inline void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static inline void *get_highmem_page_buffer(struct page *page,
+					    struct chain_allocator *ca)
 {
 	return ERR_PTR(-EINVAL);
 }
@@ -2369,8 +2361,7 @@ static inline void free_highmem_data(void) {}
 
 #define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
 
-static int
-prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
 	unsigned int nr_pages, nr_highmem;
 	struct linked_page *lp;
@@ -2593,8 +2584,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle)
 
 #ifdef CONFIG_HIGHMEM
 /* Assumes that @buf is ready and points to a "safe" page */
-static inline void
-swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+static inline void swap_two_pages_data(struct page *p1, struct page *p2,
+				       void *buf)
 {
 	void *kaddr1, *kaddr2;
 
-- 
cgit v1.3-8-gc7d7


From ef96f639ea663474c4e1c57bd64e118ffbb92be4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 6 Jul 2016 23:43:46 +0200
Subject: PM / hibernate: Clean up comments in snapshot.c

Many comments in kernel/power/snapshot.c do not follow the general
comment formatting rules.  They look odd, some of them are outdated
too, some are hard to parse and generally difficult to understand.

Clean them up to make them easier to comprehend.

No functional changes.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 636 +++++++++++++++++++++++++-----------------------
 1 file changed, 330 insertions(+), 306 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1fe0ddb6fd0d..bd927d9efeb7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -67,7 +67,8 @@ void __init hibernate_image_size_init(void)
 	image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
 }
 
-/* List of PBEs needed for restoring the pages that were allocated before
+/*
+ * List of PBEs needed for restoring the pages that were allocated before
  * the suspend and included in the suspend image, but have also been
  * allocated by the "resume" kernel, so their contents cannot be written
  * directly to their "original" page frames.
@@ -93,16 +94,6 @@ static struct linked_page *safe_pages_list;
 /* Pointer to an auxiliary buffer (1 page) */
 static void *buffer;
 
-/**
- *	@safe_needed - on resume, for storing the PBE list and the image,
- *	we can only use memory pages that do not conflict with the pages
- *	used before suspend.  The unsafe pages have PageNosaveFree set
- *	and we count them using unsafe_pages.
- *
- *	Each allocated image page is marked as PageNosave and PageNosaveFree
- *	so that swsusp_free() can release it.
- */
-
 #define PG_ANY		0
 #define PG_SAFE		1
 #define PG_UNSAFE_CLEAR	1
@@ -110,6 +101,19 @@ static void *buffer;
 
 static unsigned int allocated_unsafe_pages;
 
+/**
+ * get_image_page - Allocate a page for a hibernation image.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages that were not used before hibernation (restore only)
+ *
+ * During image restoration, for storing the PBE list and the image data, we can
+ * only use memory pages that do not conflict with the pages used before
+ * hibernation.  The "unsafe" pages have PageNosaveFree set and we count them
+ * using allocated_unsafe_pages.
+ *
+ * Each allocated image page is marked as PageNosave and PageNosaveFree so that
+ * swsusp_free() can release it.
+ */
 static void *get_image_page(gfp_t gfp_mask, int safe_needed)
 {
 	void *res;
@@ -167,10 +171,13 @@ static void recycle_safe_page(void *page_address)
 }
 
 /**
- *	free_image_page - free page represented by @addr, allocated with
- *	get_image_page (page flags set by it must be cleared)
+ * free_image_page - Free a page allocated for hibernation image.
+ * @addr: Address of the page to free.
+ * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page.
+ *
+ * The page to free should have been allocated by get_image_page() (page flags
+ * set by it are affected).
  */
-
 static inline void free_image_page(void *addr, int clear_nosave_free)
 {
 	struct page *page;
@@ -197,24 +204,22 @@ static inline void free_list_of_pages(struct linked_page *list,
 	}
 }
 
-/**
-  *	struct chain_allocator is used for allocating small objects out of
-  *	a linked list of pages called 'the chain'.
-  *
-  *	The chain grows each time when there is no room for a new object in
-  *	the current page.  The allocated objects cannot be freed individually.
-  *	It is only possible to free them all at once, by freeing the entire
-  *	chain.
-  *
-  *	NOTE: The chain allocator may be inefficient if the allocated objects
-  *	are not much smaller than PAGE_SIZE.
-  */
-
+/*
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page.  The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
 struct chain_allocator {
 	struct linked_page *chain;	/* the chain */
 	unsigned int used_space;	/* total size of objects allocated out
-					 * of the current page
-					 */
+					   of the current page */
 	gfp_t gfp_mask;		/* mask for allocating pages */
 	int safe_needed;	/* if set, only "safe" pages are allocated */
 };
@@ -250,44 +255,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 }
 
 /**
- *	Data types related to memory bitmaps.
+ * Data types related to memory bitmaps.
  *
- *	Memory bitmap is a structure consiting of many linked lists of
- *	objects.  The main list's elements are of type struct zone_bitmap
- *	and each of them corresonds to one zone.  For each zone bitmap
- *	object there is a list of objects of type struct bm_block that
- *	represent each blocks of bitmap in which information is stored.
+ * Memory bitmap is a structure consiting of many linked lists of
+ * objects.  The main list's elements are of type struct zone_bitmap
+ * and each of them corresonds to one zone.  For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent each blocks of bitmap in which information is stored.
  *
- *	struct memory_bitmap contains a pointer to the main list of zone
- *	bitmap objects, a struct bm_position used for browsing the bitmap,
- *	and a pointer to the list of pages used for allocating all of the
- *	zone bitmap objects and bitmap block objects.
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
  *
- *	NOTE: It has to be possible to lay out the bitmap in memory
- *	using only allocations of order 0.  Additionally, the bitmap is
- *	designed to work with arbitrary number of zones (this is over the
- *	top for now, but let's avoid making unnecessary assumptions ;-).
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0.  Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
  *
- *	struct zone_bitmap contains a pointer to a list of bitmap block
- *	objects and a pointer to the bitmap block object that has been
- *	most recently used for setting bits.  Additionally, it contains the
- *	pfns that correspond to the start and end of the represented zone.
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits.  Additionally, it contains the
+ * PFNs that correspond to the start and end of the represented zone.
  *
- *	struct bm_block contains a pointer to the memory page in which
- *	information is stored (in the form of a block of bitmap)
- *	It also contains the pfns that correspond to the start and end of
- *	the represented memory area.
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bitmap)
+ * It also contains the pfns that correspond to the start and end of
+ * the represented memory area.
  *
- *	The memory bitmap is organized as a radix tree to guarantee fast random
- *	access to the bits. There is one radix tree for each zone (as returned
- *	from create_mem_extents).
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
  *
- *	One radix tree is represented by one struct mem_zone_bm_rtree. There are
- *	two linked lists for the nodes of the tree, one for the inner nodes and
- *	one for the leave nodes. The linked leave nodes are used for fast linear
- *	access of the memory bitmap.
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leave nodes. The linked leave nodes are used for fast linear
+ * access of the memory bitmap.
  *
- *	The struct rtree_node represents one node of the radix tree.
+ * The struct rtree_node represents one node of the radix tree.
  */
 
 #define BM_END_OF_MAP	(~0UL)
@@ -333,9 +338,8 @@ struct bm_position {
 struct memory_bitmap {
 	struct list_head zones;
 	struct linked_page *p_list;	/* list of pages used to store zone
-					 * bitmap objects and bitmap block
-					 * objects
-					 */
+					   bitmap objects and bitmap block
+					   objects */
 	struct bm_position cur;	/* most recently used bit position */
 };
 
@@ -349,12 +353,12 @@ struct memory_bitmap {
 #endif
 #define BM_RTREE_LEVEL_MASK	((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
 
-/*
- *	alloc_rtree_node - Allocate a new node and add it to the radix tree.
+/**
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
  *
- *	This function is used to allocate inner nodes as well as the
- *	leave nodes of the radix tree. It also adds the node to the
- *	corresponding linked list passed in by the *list parameter.
+ * This function is used to allocate inner nodes as well as the
+ * leave nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
  */
 static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
 					   struct chain_allocator *ca,
@@ -375,12 +379,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
 	return node;
 }
 
-/*
- *	add_rtree_block - Add a new leave node to the radix tree
+/**
+ * add_rtree_block - Add a new leave node to the radix tree.
  *
- *	The leave nodes need to be allocated in order to keep the leaves
- *	linked list in order. This is guaranteed by the zone->blocks
- *	counter.
+ * The leave nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
  */
 static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
 			   int safe_needed, struct chain_allocator *ca)
@@ -445,12 +449,12 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
 static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
 			       int clear_nosave_free);
 
-/*
- *	create_zone_bm_rtree - create a radix tree for one zone
+/**
+ * create_zone_bm_rtree - Create a radix tree for one zone.
  *
- *	Allocated the mem_zone_bm_rtree structure and initializes it.
- *	This function also allocated and builds the radix tree for the
- *	zone.
+ * Allocated the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocated and builds the radix tree for the
+ * zone.
  */
 static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
 						      int safe_needed,
@@ -483,12 +487,12 @@ static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
 	return zone;
 }
 
-/*
- *	free_zone_bm_rtree - Free the memory of the radix tree
+/**
+ * free_zone_bm_rtree - Free the memory of the radix tree.
  *
- *	Free all node pages of the radix tree. The mem_zone_bm_rtree
- *	structure itself is not freed here nor are the rtree_node
- *	structs.
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
  */
 static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
 			       int clear_nosave_free)
@@ -521,8 +525,8 @@ struct mem_extent {
 };
 
 /**
- *	free_mem_extents - free a list of memory extents
- *	@list - list of extents to empty
+ * free_mem_extents - Free a list of memory extents.
+ * @list: List of extents to free.
  */
 static void free_mem_extents(struct list_head *list)
 {
@@ -535,10 +539,11 @@ static void free_mem_extents(struct list_head *list)
 }
 
 /**
- *	create_mem_extents - create a list of memory extents representing
- *	                     contiguous ranges of PFNs
- *	@list - list to put the extents into
- *	@gfp_mask - mask to use for memory allocations
+ * create_mem_extents - Create a list of memory extents.
+ * @list: List to put the extents into.
+ * @gfp_mask: Mask to use for memory allocations.
+ *
+ * The extents represent contiguous ranges of PFNs.
  */
 static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 {
@@ -594,8 +599,8 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 }
 
 /**
-  *	memory_bm_create - allocate memory for a memory bitmap
-  */
+ * memory_bm_create - Allocate memory for a memory bitmap.
+ */
 static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
 			    int safe_needed)
 {
@@ -636,8 +641,9 @@ static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
 }
 
 /**
-  *	memory_bm_free - free memory occupied by the memory bitmap @bm
-  */
+ * memory_bm_free - Free memory occupied by the memory bitmap.
+ * @bm: Memory bitmap.
+ */
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
 {
 	struct mem_zone_bm_rtree *zone;
@@ -651,14 +657,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
 }
 
 /**
- *	memory_bm_find_bit - Find the bit for pfn in the memory
- *			     bitmap
+ * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.
  *
- *	Find the bit in the bitmap @bm that corresponds to given pfn.
- *	The cur.zone, cur.block and cur.node_pfn member of @bm are
- *	updated.
- *	It walks the radix tree to find the page which contains the bit for
- *	pfn and returns the bit position in **addr and *bit_nr.
+ * Find the bit in memory bitmap @bm that corresponds to the given PFN.
+ * The cur.zone, cur.block and cur.node_pfn members of @bm are updated.
+ *
+ * Walk the radix tree to find the page containing the bit that represents @pfn
+ * and return the position of the bit in @addr and @bit_nr.
  */
 static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 			      void **addr, unsigned int *bit_nr)
@@ -687,10 +692,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 
 zone_found:
 	/*
-	 * We have a zone. Now walk the radix tree to find the leave
-	 * node for our pfn.
+	 * We have found the zone. Now walk the radix tree to find the leaf node
+	 * for our PFN.
 	 */
-
 	node = bm->cur.node;
 	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
 		goto node_found;
@@ -783,14 +787,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
 }
 
 /*
- *	rtree_next_node - Jumps to the next leave node
+ * rtree_next_node - Jump to the next leaf node.
  *
- *	Sets the position to the beginning of the next node in the
- *	memory bitmap. This is either the next node in the current
- *	zone's radix tree or the first node in the radix tree of the
- *	next zone.
+ * Set the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
  *
- *	Returns true if there is a next node, false otherwise.
+ * Return true if there is a next node, false otherwise.
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
@@ -819,14 +823,15 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 }
 
 /**
- *	memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
+ * @bm: Memory bitmap.
  *
- *	Starting from the last returned position this function searches
- *	for the next set bit in the memory bitmap and returns its
- *	number. If no more bit is set BM_END_OF_MAP is returned.
+ * Starting from the last returned position this function searches for the next
+ * set bit in @bm and returns the PFN represented by it.  If no more bits are
+ * set, BM_END_OF_MAP is returned.
  *
- *	It is required to run memory_bm_position_reset() before the
- *	first call to this function.
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function for the given memory bitmap.
  */
 static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 {
@@ -848,11 +853,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 	return BM_END_OF_MAP;
 }
 
-/**
- *	This structure represents a range of page frames the contents of which
- *	should not be saved during the suspend.
+/*
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during hibernation.
  */
-
 struct nosave_region {
 	struct list_head list;
 	unsigned long start_pfn;
@@ -890,11 +894,11 @@ static void memory_bm_recycle(struct memory_bitmap *bm)
 }
 
 /**
- *	register_nosave_region - register a range of page frames the contents
- *	of which should not be saved during the suspend (to be used in the early
- *	initialization code)
+ * register_nosave_region - Register a region of unsaveable memory.
+ *
+ * Register a range of page frames the contents of which should not be saved
+ * during hibernation (to be used in the early initialization code).
  */
-
 void __init __register_nosave_region(unsigned long start_pfn,
 				     unsigned long end_pfn, int use_kmalloc)
 {
@@ -913,7 +917,7 @@ void __init __register_nosave_region(unsigned long start_pfn,
 		}
 	}
 	if (use_kmalloc) {
-		/* during init, this shouldn't fail */
+		/* During init, this shouldn't fail */
 		region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
 		BUG_ON(!region);
 	} else
@@ -979,10 +983,12 @@ static void swsusp_unset_page_forbidden(struct page *page)
 }
 
 /**
- *	mark_nosave_pages - set bits corresponding to the page frames the
- *	contents of which should not be saved in a given bitmap.
+ * mark_nosave_pages - Mark pages that should not be saved.
+ * @bm: Memory bitmap.
+ *
+ * Set the bits in @bm that correspond to the page frames the contents of which
+ * should not be saved.
  */
-
 static void mark_nosave_pages(struct memory_bitmap *bm)
 {
 	struct nosave_region *region;
@@ -1012,13 +1018,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 }
 
 /**
- *	create_basic_memory_bitmaps - create bitmaps needed for marking page
- *	frames that should not be saved and free page frames.  The pointers
- *	forbidden_pages_map and free_pages_map are only modified if everything
- *	goes well, because we don't want the bits to be used before both bitmaps
- *	are set up.
+ * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
+ *
+ * Create bitmaps needed for marking page frames that should not be saved and
+ * free page frames.  The forbidden_pages_map and free_pages_map pointers are
+ * only modified if everything goes well, because we don't want the bits to be
+ * touched before both bitmaps are set up.
  */
-
 int create_basic_memory_bitmaps(void)
 {
 	struct memory_bitmap *bm1, *bm2;
@@ -1063,12 +1069,12 @@ int create_basic_memory_bitmaps(void)
 }
 
 /**
- *	free_basic_memory_bitmaps - free memory bitmaps allocated by
- *	create_basic_memory_bitmaps().  The auxiliary pointers are necessary
- *	so that the bitmaps themselves are not referred to while they are being
- *	freed.
+ * free_basic_memory_bitmaps - Free memory bitmaps holding basic information.
+ *
+ * Free memory bitmaps allocated by create_basic_memory_bitmaps().  The
+ * auxiliary pointers are necessary so that the bitmaps themselves are not
+ * referred to while they are being freed.
  */
-
 void free_basic_memory_bitmaps(void)
 {
 	struct memory_bitmap *bm1, *bm2;
@@ -1089,11 +1095,13 @@ void free_basic_memory_bitmaps(void)
 }
 
 /**
- *	snapshot_additional_pages - estimate the number of additional pages
- *	be needed for setting up the suspend image data structures for given
- *	zone (usually the returned value is greater than the exact number)
+ * snapshot_additional_pages - Estimate the number of extra pages needed.
+ * @zone: Memory zone to carry out the computation for.
+ *
+ * Estimate the number of additional pages needed for setting up a hibernation
+ * image data structures for @zone (usually, the returned value is greater than
+ * the exact number).
  */
-
 unsigned int snapshot_additional_pages(struct zone *zone)
 {
 	unsigned int rtree, nodes;
@@ -1111,10 +1119,10 @@ unsigned int snapshot_additional_pages(struct zone *zone)
 
 #ifdef CONFIG_HIGHMEM
 /**
- *	count_free_highmem_pages - compute the total number of free highmem
- *	pages, system-wide.
+ * count_free_highmem_pages - Compute the total number of free highmem pages.
+ *
+ * The returned number is system-wide.
  */
-
 static unsigned int count_free_highmem_pages(void)
 {
 	struct zone *zone;
@@ -1128,11 +1136,12 @@ static unsigned int count_free_highmem_pages(void)
 }
 
 /**
- *	saveable_highmem_page - Determine whether a highmem page should be
- *	included in the suspend image.
+ * saveable_highmem_page - Check if a highmem page is saveable.
+ *
+ * Determine whether a highmem page should be included in a hibernation image.
  *
- *	We should save the page if it isn't Nosave or NosaveFree, or Reserved,
- *	and it isn't a part of a free chunk of pages.
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't part of a free chunk of pages.
  */
 static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
 {
@@ -1158,10 +1167,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
 }
 
 /**
- *	count_highmem_pages - compute the total number of saveable highmem
- *	pages.
+ * count_highmem_pages - Compute the total number of saveable highmem pages.
  */
-
 static unsigned int count_highmem_pages(void)
 {
 	struct zone *zone;
@@ -1189,12 +1196,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
 #endif /* CONFIG_HIGHMEM */
 
 /**
- *	saveable_page - Determine whether a non-highmem page should be included
- *	in the suspend image.
+ * saveable_page - Check if the given page is saveable.
+ *
+ * Determine whether a non-highmem page should be included in a hibernation
+ * image.
  *
- *	We should save the page if it isn't Nosave, and is not in the range
- *	of pages statically defined as 'unsaveable', and it isn't a part of
- *	a free chunk of pages.
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't part of
+ * a free chunk of pages.
  */
 static struct page *saveable_page(struct zone *zone, unsigned long pfn)
 {
@@ -1223,10 +1232,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
 }
 
 /**
- *	count_data_pages - compute the total number of saveable non-highmem
- *	pages.
+ * count_data_pages - Compute the total number of saveable non-highmem pages.
  */
-
 static unsigned int count_data_pages(void)
 {
 	struct zone *zone;
@@ -1246,7 +1253,8 @@ static unsigned int count_data_pages(void)
 	return n;
 }
 
-/* This is needed, because copy_page and memcpy are not usable for copying
+/*
+ * This is needed, because copy_page and memcpy are not usable for copying
  * task structs.
  */
 static inline void do_copy_page(long *dst, long *src)
@@ -1257,12 +1265,12 @@ static inline void do_copy_page(long *dst, long *src)
 		*dst++ = *src++;
 }
 
-
 /**
- *	safe_copy_page - check if the page we are going to copy is marked as
- *		present in the kernel page tables (this always is the case if
- *		CONFIG_DEBUG_PAGEALLOC is not set and in that case
- *		kernel_page_present() always returns 'true').
+ * safe_copy_page - Copy a page in a safe way.
+ *
+ * Check if the page we are going to copy is marked as present in the kernel
+ * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
+ * and in that case kernel_page_present() always returns 'true').
  */
 static void safe_copy_page(void *dst, struct page *s_page)
 {
@@ -1275,7 +1283,6 @@ static void safe_copy_page(void *dst, struct page *s_page)
 	}
 }
 
-
 #ifdef CONFIG_HIGHMEM
 static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
 {
@@ -1298,7 +1305,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 		kunmap_atomic(src);
 	} else {
 		if (PageHighMem(d_page)) {
-			/* Page pointed to by src may contain some kernel
+			/*
+			 * The page pointed to by src may contain some kernel
 			 * data modified by kmap_atomic()
 			 */
 			safe_copy_page(buffer, s_page);
@@ -1370,12 +1378,11 @@ static struct memory_bitmap orig_bm;
 static struct memory_bitmap copy_bm;
 
 /**
- *	swsusp_free - free pages allocated for the suspend.
+ * swsusp_free - Free pages allocated for hibernation image.
  *
- *	Suspend pages are alocated before the atomic copy is made, so we
- *	need to release them after the resume.
+ * Image pages are alocated before snapshot creation, so they need to be
+ * released after resume.
  */
-
 void swsusp_free(void)
 {
 	unsigned long fb_pfn, fr_pfn;
@@ -1424,7 +1431,7 @@ out:
 #define GFP_IMAGE	(GFP_KERNEL | __GFP_NOWARN)
 
 /**
- * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * preallocate_image_pages - Allocate a number of pages for hibernation image.
  * @nr_pages: Number of page frames to allocate.
  * @mask: GFP flags to use for the allocation.
  *
@@ -1474,7 +1481,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
 }
 
 /**
- *  __fraction - Compute (an approximation of) x * (multiplier / base)
+ *  __fraction - Compute (an approximation of) x * (multiplier / base).
  */
 static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
 {
@@ -1506,7 +1513,7 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
 #endif /* CONFIG_HIGHMEM */
 
 /**
- * free_unnecessary_pages - Release preallocated pages not needed for the image
+ * free_unnecessary_pages - Release preallocated pages not needed for the image.
  */
 static unsigned long free_unnecessary_pages(void)
 {
@@ -1560,7 +1567,7 @@ static unsigned long free_unnecessary_pages(void)
 }
 
 /**
- * minimum_image_size - Estimate the minimum acceptable size of an image
+ * minimum_image_size - Estimate the minimum acceptable size of an image.
  * @saveable: Number of saveable pages in the system.
  *
  * We want to avoid attempting to free too much memory too hard, so estimate the
@@ -1590,7 +1597,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
 }
 
 /**
- * hibernate_preallocate_memory - Preallocate memory for hibernation image
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image.
  *
  * To create a hibernation image it is necessary to make a copy of every page
  * frame in use.  We also need a number of page frames to be free during
@@ -1763,10 +1770,11 @@ int hibernate_preallocate_memory(void)
 
 #ifdef CONFIG_HIGHMEM
 /**
-  *	count_pages_for_highmem - compute the number of non-highmem pages
-  *	that will be necessary for creating copies of highmem pages.
-  */
-
+ * count_pages_for_highmem - Count non-highmem pages needed for copying highmem.
+ *
+ * Compute the number of non-highmem pages that will be necessary for creating
+ * copies of highmem pages.
+ */
 static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
 {
 	unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
@@ -1783,10 +1791,8 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0;
 #endif /* CONFIG_HIGHMEM */
 
 /**
- *	enough_free_mem - Make sure we have enough free memory for the
- *	snapshot image.
+ * enough_free_mem - Check if there is enough free memory for the image.
  */
-
 static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 {
 	struct zone *zone;
@@ -1805,10 +1811,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 
 #ifdef CONFIG_HIGHMEM
 /**
- *	get_highmem_buffer - if there are some highmem pages in the suspend
- *	image, we may need the buffer to copy them and/or load their data.
+ * get_highmem_buffer - Allocate a buffer for highmem pages.
+ *
+ * If there are some highmem pages in the hibernation image, we may need a
+ * buffer to copy them and/or load their data.
  */
-
 static inline int get_highmem_buffer(int safe_needed)
 {
 	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
@@ -1816,11 +1823,11 @@ static inline int get_highmem_buffer(int safe_needed)
 }
 
 /**
- *	alloc_highmem_image_pages - allocate some highmem pages for the image.
- *	Try to allocate as many pages as needed, but if the number of free
- *	highmem pages is lesser than that, allocate them all.
+ * alloc_highmem_image_pages - Allocate some highmem pages for the image.
+ *
+ * Try to allocate as many pages as needed, but if the number of free highmem
+ * pages is less than that, allocate them all.
  */
-
 static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
 					       unsigned int nr_highmem)
 {
@@ -1846,17 +1853,16 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
 #endif /* CONFIG_HIGHMEM */
 
 /**
- *	swsusp_alloc - allocate memory for the suspend image
+ * swsusp_alloc - Allocate memory for hibernation image.
  *
- *	We first try to allocate as many highmem pages as there are
- *	saveable highmem pages in the system.  If that fails, we allocate
- *	non-highmem pages for the copies of the remaining highmem ones.
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system.  If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
  *
- *	In this approach it is likely that the copies of highmem pages will
- *	also be located in the high memory, because of the way in which
- *	copy_data_pages() works.
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
  */
-
 static int swsusp_alloc(struct memory_bitmap *orig_bm,
 			struct memory_bitmap *copy_bm,
 			unsigned int nr_pages, unsigned int nr_highmem)
@@ -1909,7 +1915,8 @@ asmlinkage __visible int swsusp_save(void)
 		return -ENOMEM;
 	}
 
-	/* During allocating of suspend pagedir, new cold pages may appear.
+	/*
+	 * During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
 	drain_local_pages(NULL);
@@ -1972,10 +1979,13 @@ static int init_header(struct swsusp_info *info)
 }
 
 /**
- *	pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
- *	are stored in the array @buf[] (1 page at a time)
+ * pack_pfns - Prepare PFNs for saving.
+ * @bm: Memory bitmap.
+ * @buf: Memory buffer to store the PFNs in.
+ *
+ * PFNs corresponding to set bits in @bm are stored in the area of memory
+ * pointed to by @buf (1 page at a time).
  */
-
 static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
@@ -1990,22 +2000,21 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 }
 
 /**
- *	snapshot_read_next - used for reading the system memory snapshot.
+ * snapshot_read_next - Get the address to read the next image page from.
+ * @handle: Snapshot handle to be used for the reading.
  *
- *	On the first call to it @handle should point to a zeroed
- *	snapshot_handle structure.  The structure gets updated and a pointer
- *	to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure.  The structure gets populated then and a pointer to it should be
+ * passed to this function every next time.
  *
- *	On success the function returns a positive number.  Then, the caller
- *	is allowed to read up to the returned number of bytes from the memory
- *	location computed by the data_of() macro.
+ * On success, the function returns a positive number.  Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro.
  *
- *	The function returns 0 to indicate the end of data stream condition,
- *	and a negative number is returned on error.  In such cases the
- *	structure pointed to by @handle is not updated and should not be used
- *	any more.
+ * The function returns 0 to indicate the end of the data stream condition,
+ * and negative numbers are returned on errors.  If that happens, the structure
+ * pointed to by @handle is not updated and should not be used any more.
  */
-
 int snapshot_read_next(struct snapshot_handle *handle)
 {
 	if (handle->cur > nr_meta_pages + nr_copy_pages)
@@ -2034,7 +2043,8 @@ int snapshot_read_next(struct snapshot_handle *handle)
 
 		page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
 		if (PageHighMem(page)) {
-			/* Highmem pages are copied to the buffer,
+			/*
+			 * Highmem pages are copied to the buffer,
 			 * because we can't return with a kmapped
 			 * highmem page (we may not be called again).
 			 */
@@ -2066,11 +2076,11 @@ static void duplicate_memory_bitmap(struct memory_bitmap *dst,
 }
 
 /**
- *	mark_unsafe_pages - mark the pages that cannot be used for storing
- *	the image during resume, because they conflict with the pages that
- *	had been used before suspend
+ * mark_unsafe_pages - Mark pages that were used before hibernation.
+ *
+ * Mark the pages that cannot be used for storing the image during restoration,
+ * because they conflict with the pages that had been used before hibernation.
  */
-
 static void mark_unsafe_pages(struct memory_bitmap *bm)
 {
 	unsigned long pfn;
@@ -2104,9 +2114,8 @@ static int check_header(struct swsusp_info *info)
 }
 
 /**
- *	load header - check the image header and copy data from it
+ * load header - Check the image header and copy the data from it.
  */
-
 static int load_header(struct swsusp_info *info)
 {
 	int error;
@@ -2121,8 +2130,12 @@ static int load_header(struct swsusp_info *info)
 }
 
 /**
- *	unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
- *	the corresponding bit in the memory bitmap @bm
+ * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
+ * @bm: Memory bitmap.
+ * @buf: Area of memory containing the PFNs.
+ *
+ * For each element of the array pointed to by @buf (1 page at a time), set the
+ * corresponding bit in @bm.
  */
 static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
@@ -2145,7 +2158,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 }
 
 #ifdef CONFIG_HIGHMEM
-/* struct highmem_pbe is used for creating the list of highmem pages that
+/*
+ * struct highmem_pbe is used for creating the list of highmem pages that
  * should be restored atomically during the resume from disk, because the page
  * frames they have occupied before the suspend are in use.
  */
@@ -2155,7 +2169,8 @@ struct highmem_pbe {
 	struct highmem_pbe *next;
 };
 
-/* List of highmem PBEs needed for restoring the highmem pages that were
+/*
+ * List of highmem PBEs needed for restoring the highmem pages that were
  * allocated before the suspend and included in the suspend image, but have
  * also been allocated by the "resume" kernel, so their contents cannot be
  * written directly to their "original" page frames.
@@ -2163,11 +2178,11 @@ struct highmem_pbe {
 static struct highmem_pbe *highmem_pblist;
 
 /**
- *	count_highmem_image_pages - compute the number of highmem pages in the
- *	suspend image.  The bits in the memory bitmap @bm that correspond to the
- *	image pages are assumed to be set.
+ * count_highmem_image_pages - Compute the number of highmem pages in the image.
+ * @bm: Memory bitmap.
+ *
+ * The bits in @bm that correspond to image pages are assumed to be set.
  */
-
 static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
 {
 	unsigned long pfn;
@@ -2184,22 +2199,23 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
 	return cnt;
 }
 
-/**
- *	prepare_highmem_image - try to allocate as many highmem pages as
- *	there are highmem image pages (@nr_highmem_p points to the variable
- *	containing the number of highmem image pages).  The pages that are
- *	"safe" (ie. will not be overwritten when the suspend image is
- *	restored) have the corresponding bits set in @bm (it must be
- *	unitialized).
- *
- *	NOTE: This function should not be called if there are no highmem
- *	image pages.
- */
-
 static unsigned int safe_highmem_pages;
 
 static struct memory_bitmap *safe_highmem_bm;
 
+/**
+ * prepare_highmem_image - Allocate memory for loading highmem data from image.
+ * @bm: Pointer to an uninitialized memory bitmap structure.
+ * @nr_highmem_p: Pointer to the number of highmem image pages.
+ *
+ * Try to allocate as many highmem pages as there are highmem image pages
+ * (@nr_highmem_p points to the variable containing the number of highmem image
+ * pages).  The pages that are "safe" (ie. will not be overwritten when the
+ * hibernation image is restored entirely) have the corresponding bits set in
+ * @bm (it must be unitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem image pages.
+ */
 static int prepare_highmem_image(struct memory_bitmap *bm,
 				 unsigned int *nr_highmem_p)
 {
@@ -2236,25 +2252,26 @@ static int prepare_highmem_image(struct memory_bitmap *bm,
 	return 0;
 }
 
+static struct page *last_highmem_page;
+
 /**
- *	get_highmem_page_buffer - for given highmem image page find the buffer
- *	that suspend_write_next() should set for its caller to write to.
+ * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.
+ *
+ * For a given highmem image page get a buffer that suspend_write_next() should
+ * return to its caller to write to.
  *
- *	If the page is to be saved to its "original" page frame or a copy of
- *	the page is to be made in the highmem, @buffer is returned.  Otherwise,
- *	the copy of the page is to be made in normal memory, so the address of
- *	the copy is returned.
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned.  Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
  *
- *	If @buffer is returned, the caller of suspend_write_next() will write
- *	the page's contents to @buffer, so they will have to be copied to the
- *	right location on the next call to suspend_write_next() and it is done
- *	with the help of copy_last_highmem_page().  For this purpose, if
- *	@buffer is returned, @last_highmem page is set to the page to which
- *	the data will have to be copied from @buffer.
+ * If @buffer is returned, the caller of suspend_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to suspend_write_next() and it is done
+ * with the help of copy_last_highmem_page().  For this purpose, if
+ * @buffer is returned, @last_highmem_page is set to the page to which
+ * the data will have to be copied from @buffer.
  */
-
-static struct page *last_highmem_page;
-
 static void *get_highmem_page_buffer(struct page *page,
 				     struct chain_allocator *ca)
 {
@@ -2262,13 +2279,15 @@ static void *get_highmem_page_buffer(struct page *page,
 	void *kaddr;
 
 	if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
-		/* We have allocated the "original" page frame and we can
+		/*
+		 * We have allocated the "original" page frame and we can
 		 * use it directly to store the loaded page.
 		 */
 		last_highmem_page = page;
 		return buffer;
 	}
-	/* The "original" page frame has not been allocated and we have to
+	/*
+	 * The "original" page frame has not been allocated and we have to
 	 * use a "safe" page frame to store the loaded page.
 	 */
 	pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
@@ -2298,11 +2317,12 @@ static void *get_highmem_page_buffer(struct page *page,
 }
 
 /**
- *	copy_last_highmem_page - copy the contents of a highmem image from
- *	@buffer, where the caller of snapshot_write_next() has place them,
- *	to the right location represented by @last_highmem_page .
+ * copy_last_highmem_page - Copy most the most recent highmem image page.
+ *
+ * Copy the contents of a highmem image from @buffer, where the caller of
+ * snapshot_write_next() has stored them, to the right location represented by
+ * @last_highmem_page .
  */
-
 static void copy_last_highmem_page(void)
 {
 	if (last_highmem_page) {
@@ -2345,22 +2365,23 @@ static inline int last_highmem_page_copied(void) { return 1; }
 static inline void free_highmem_data(void) {}
 #endif /* CONFIG_HIGHMEM */
 
+#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
 /**
- *	prepare_image - use the memory bitmap @bm to mark the pages that will
- *	be overwritten in the process of restoring the system memory state
- *	from the suspend image ("unsafe" pages) and allocate memory for the
- *	image.
+ * prepare_image - Make room for loading hibernation image.
+ * @new_bm: Unitialized memory bitmap structure.
+ * @bm: Memory bitmap with unsafe pages marked.
+ *
+ * Use @bm to mark the pages that will be overwritten in the process of
+ * restoring the system memory state from the suspend image ("unsafe" pages)
+ * and allocate memory for the image.
  *
- *	The idea is to allocate a new memory bitmap first and then allocate
- *	as many pages as needed for the image data, but not to assign these
- *	pages to specific tasks initially.  Instead, we just mark them as
- *	allocated and create a lists of "safe" pages that will be used
- *	later.  On systems with high memory a list of "safe" highmem pages is
- *	also created.
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for image data, but without specifying what those
+ * pages will be used for just yet.  Instead, we mark them all as allocated and
+ * create a lists of "safe" pages to be used later.  On systems with high
+ * memory a list of "safe" highmem pages is created too.
  */
-
-#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-
 static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
 	unsigned int nr_pages, nr_highmem;
@@ -2385,7 +2406,8 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 		if (error)
 			goto Free;
 	}
-	/* Reserve some safe pages for potential later use.
+	/*
+	 * Reserve some safe pages for potential later use.
 	 *
 	 * NOTE: This way we make sure there will be enough safe pages for the
 	 * chain_alloc() in get_buffer().  It is a bit wasteful, but
@@ -2431,10 +2453,11 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 }
 
 /**
- *	get_buffer - compute the address that snapshot_write_next() should
- *	set for its caller to write to.
+ * get_buffer - Get the address to store the next image data page.
+ *
+ * Get the address that snapshot_write_next() should return to its caller to
+ * write to.
  */
-
 static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 {
 	struct pbe *pbe;
@@ -2449,12 +2472,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 		return get_highmem_page_buffer(page, ca);
 
 	if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
-		/* We have allocated the "original" page frame and we can
+		/*
+		 * We have allocated the "original" page frame and we can
 		 * use it directly to store the loaded page.
 		 */
 		return page_address(page);
 
-	/* The "original" page frame has not been allocated and we have to
+	/*
+	 * The "original" page frame has not been allocated and we have to
 	 * use a "safe" page frame to store the loaded page.
 	 */
 	pbe = chain_alloc(ca, sizeof(struct pbe));
@@ -2471,22 +2496,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 }
 
 /**
- *	snapshot_write_next - used for writing the system memory snapshot.
+ * snapshot_write_next - Get the address to store the next image page.
+ * @handle: Snapshot handle structure to guide the writing.
  *
- *	On the first call to it @handle should point to a zeroed
- *	snapshot_handle structure.  The structure gets updated and a pointer
- *	to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure.  The structure gets populated then and a pointer to it should be
+ * passed to this function every next time.
  *
- *	On success the function returns a positive number.  Then, the caller
- *	is allowed to write up to the returned number of bytes to the memory
- *	location computed by the data_of() macro.
+ * On success, the function returns a positive number.  Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro.
  *
- *	The function returns 0 to indicate the "end of file" condition,
- *	and a negative number is returned on error.  In such cases the
- *	structure pointed to by @handle is not updated and should not be used
- *	any more.
+ * The function returns 0 to indicate the "end of file" condition.  Negative
+ * numbers are returned on errors, in which cases the structure pointed to by
+ * @handle is not updated and should not be used any more.
  */
-
 int snapshot_write_next(struct snapshot_handle *handle)
 {
 	static struct chain_allocator ca;
@@ -2556,13 +2580,13 @@ int snapshot_write_next(struct snapshot_handle *handle)
 }
 
 /**
- *	snapshot_write_finalize - must be called after the last call to
- *	snapshot_write_next() in case the last page in the image happens
- *	to be a highmem page and its contents should be stored in the
- *	highmem.  Additionally, it releases the memory that will not be
- *	used any more.
+ * snapshot_write_finalize - Complete the loading of a hibernation image.
+ *
+ * Must be called after the last call to snapshot_write_next() in case the last
+ * page in the image happens to be a highmem page and its contents should be
+ * stored in highmem.  Additionally, it recycles bitmap memory that's not
+ * necessary any more.
  */
-
 void snapshot_write_finalize(struct snapshot_handle *handle)
 {
 	copy_last_highmem_page();
@@ -2599,15 +2623,15 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2,
 }
 
 /**
- *	restore_highmem - for each highmem page that was allocated before
- *	the suspend and included in the suspend image, and also has been
- *	allocated by the "resume" kernel swap its current (ie. "before
- *	resume") contents with the previous (ie. "before suspend") one.
+ * restore_highmem - Put highmem image pages into their original locations.
+ *
+ * For each highmem page that was in use before hibernation and is included in
+ * the image, and also has been allocated by the "restore" kernel, swap its
+ * current contents with the previous (ie. "before hibernation") ones.
  *
- *	If the resume eventually fails, we can call this function once
- *	again and restore the "before resume" highmem state.
+ * If the restore eventually fails, we can call this function once again and
+ * restore the highmem state as seen by the restore kernel.
  */
-
 int restore_highmem(void)
 {
 	struct highmem_pbe *pbe = highmem_pblist;
-- 
cgit v1.3-8-gc7d7


From d5f32af3100165cbd625855bd155b3aa9bd87ebf Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 6 Jul 2016 23:44:31 +0200
Subject: PM / hibernate: Add missing braces in __register_nosave_region()

One branch of an if/else statement in __register_nosave_region() is
formatted against the kernel coding style which causes the code to
look slightly odd.  To fix that, add missing braces to it.

No functional changes.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bd927d9efeb7..d64d5d0efa79 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -920,9 +920,10 @@ void __init __register_nosave_region(unsigned long start_pfn,
 		/* During init, this shouldn't fail */
 		region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
 		BUG_ON(!region);
-	} else
+	} else {
 		/* This allocation cannot fail */
 		region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+	}
 	region->start_pfn = start_pfn;
 	region->end_pfn = end_pfn;
 	list_add_tail(&region->list, &nosave_regions);
-- 
cgit v1.3-8-gc7d7


From 4c0b6c10fbaf0c82efe2a7ba6c236c633d4f2ed7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 10 Jul 2016 02:12:10 +0200
Subject: PM / hibernate: Image data protection during restoration

Make it possible to protect all pages holding image data during
hibernate image restoration by setting them read-only (so as to
catch attempts to write to those pages after image data have been
stored in them).

This adds overhead to image restoration code (it may cause large
page mappings to be split as a result of page flags changes) and
the errors it protects against should never happen in theory, so
the feature is only active after passing hibernate=protect_image
to the command line of the restore kernel.

Also it only is built if CONFIG_DEBUG_RODATA is set.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/kernel-parameters.txt |  3 +++
 kernel/power/hibernate.c            |  3 +++
 kernel/power/power.h                |  7 +++++++
 kernel/power/snapshot.c             | 42 +++++++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 82b42c958d1c..07960c24642d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3594,6 +3594,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 				present during boot.
 		nocompress	Don't compress/decompress hibernation images.
 		no		Disable hibernation and resume.
+		protect_image	Turn on image protection during restoration
+				(that will set all pages holding image data
+				during restoration read-only).
 
 	retain_initrd	[RAM] Keep initrd memory after extraction
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b00f270d328e..51441d87f0b6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1126,6 +1126,9 @@ static int __init hibernate_setup(char *str)
 	} else if (!strncmp(str, "no", 2)) {
 		noresume = 1;
 		nohibernate = 1;
+	} else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
+		   && !strncmp(str, "protect_image", 13)) {
+		enable_restore_image_protection();
 	}
 	return 1;
 }
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 51f02ecaf125..064963e89194 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -59,6 +59,13 @@ extern int hibernation_snapshot(int platform_mode);
 extern int hibernation_restore(int platform_mode);
 extern int hibernation_platform_enter(void);
 
+#ifdef CONFIG_DEBUG_RODATA
+/* kernel/power/snapshot.c */
+extern void enable_restore_image_protection(void);
+#else
+static inline void enable_restore_image_protection(void) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
 #else /* !CONFIG_HIBERNATION */
 
 static inline void hibernate_reserved_size_init(void) {}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d64d5d0efa79..d90df926b59f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -38,6 +38,43 @@
 
 #include "power.h"
 
+#ifdef CONFIG_DEBUG_RODATA
+static bool hibernate_restore_protection;
+static bool hibernate_restore_protection_active;
+
+void enable_restore_image_protection(void)
+{
+	hibernate_restore_protection = true;
+}
+
+static inline void hibernate_restore_protection_begin(void)
+{
+	hibernate_restore_protection_active = hibernate_restore_protection;
+}
+
+static inline void hibernate_restore_protection_end(void)
+{
+	hibernate_restore_protection_active = false;
+}
+
+static inline void hibernate_restore_protect_page(void *page_address)
+{
+	if (hibernate_restore_protection_active)
+		set_memory_ro((unsigned long)page_address, 1);
+}
+
+static inline void hibernate_restore_unprotect_page(void *page_address)
+{
+	if (hibernate_restore_protection_active)
+		set_memory_rw((unsigned long)page_address, 1);
+}
+#else
+static inline void hibernate_restore_protection_begin(void) {}
+static inline void hibernate_restore_protection_end(void) {}
+static inline void hibernate_restore_protect_page(void *page_address) {}
+static inline void hibernate_restore_unprotect_page(void *page_address) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
 static int swsusp_page_is_free(struct page *);
 static void swsusp_set_page_forbidden(struct page *);
 static void swsusp_unset_page_forbidden(struct page *);
@@ -1414,6 +1451,7 @@ loop:
 
 		memory_bm_clear_current(forbidden_pages_map);
 		memory_bm_clear_current(free_pages_map);
+		hibernate_restore_unprotect_page(page_address(page));
 		__free_page(page);
 		goto loop;
 	}
@@ -1425,6 +1463,7 @@ out:
 	buffer = NULL;
 	alloc_normal = 0;
 	alloc_highmem = 0;
+	hibernate_restore_protection_end();
 }
 
 /* Helper functions used for the shrinking of memory. */
@@ -2548,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
 		if (error)
 			return error;
 
+		hibernate_restore_protection_begin();
 	} else if (handle->cur <= nr_meta_pages + 1) {
 		error = unpack_orig_pfns(buffer, &copy_bm);
 		if (error)
@@ -2570,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
 		copy_last_highmem_page();
 		/* Restore page key for data page (s390 only). */
 		page_key_write(handle->buffer);
+		hibernate_restore_protect_page(handle->buffer);
 		handle->buffer = get_buffer(&orig_bm, &ca);
 		if (IS_ERR(handle->buffer))
 			return PTR_ERR(handle->buffer);
@@ -2594,6 +2635,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
 	/* Restore page key for data page (s390 only). */
 	page_key_write(handle->buffer);
 	page_key_free();
+	hibernate_restore_protect_page(handle->buffer);
 	/* Do that only if we have loaded the image entirely */
 	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
 		memory_bm_recycle(&orig_bm);
-- 
cgit v1.3-8-gc7d7


From 748c7201e622d1c24abb4f85072d2e74d12f295f Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Fri, 3 Jun 2016 17:10:18 -0300
Subject: sched/core: Panic on scheduling while atomic bugs if
 kernel.panic_on_warn is set

Currently, a schedule while atomic error prints the stack trace to the
kernel log and the system continue running.

Although it is possible to collect the kernel log messages and analyze
it, often more information are needed. Furthermore, keep the system
running is not always the best choice. For example, when the preempt
count underflows the system will not stop to complain about scheduling
while atomic, so the kernel log can wrap around overwriting the first
stack trace, tuning the analysis even more challenging.

This patch uses the kernel.panic_on_warn sysctl to help out on these
more complex situations.

When kernel.panic_on_warn is set to 1, the kernel will panic() in the
schedule while atomic detection.

The default value of the sysctl is 0, maintaining the current behavior.

Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reviewed-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/e8f7b80f353aa22c63bd8557208163989af8493d.1464983675.git.bristot@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 28da50a5bc76..4e9617a7e7d9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3168,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
 		pr_cont("\n");
 	}
 #endif
+	if (panic_on_warn)
+		panic("scheduling while atomic\n");
+
 	dump_stack();
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
-- 
cgit v1.3-8-gc7d7


From 44530d588e142a96cf0cd345a7cb8911c4f88720 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 10 Jul 2016 20:58:36 +0200
Subject: Revert "perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles
 on x86"

This reverts commit 2c95afc1e83d93fac3be6923465e1753c2c53b0a.

Stephane reported the following regression:

 > Since Andi added:
 >
 > commit 2c95afc1e83d93fac3be6923465e1753c2c53b0a
 > Author: Andi Kleen <ak@linux.intel.com>
 > Date:   Thu Jun 9 06:14:38 2016 -0700
 >
 >    perf/x86/intel, watchdog: Switch NMI watchdog to ref cycles on x86
 >
 > $ perf stat -e ref-cycles ls
 >   <not counted> ....
 >
 > fails systematically because the ref-cycles is now used by the
 > watchdog and given this is a system-wide pinned event, it monopolizes
 > the fixed counter 2 which is the only counter able to measure this event.

Since the next merge window is near, fix the regression for now
by reverting the commit.

Reported-by: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/hw_nmi.c | 8 --------
 include/linux/nmi.h           | 1 -
 kernel/watchdog.c             | 7 -------
 3 files changed, 16 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 016f4263fad4..7788ce643bf4 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,16 +18,8 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/perf_event.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-int hw_nmi_get_event(void)
-{
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		return PERF_COUNT_HW_REF_CPU_CYCLES;
-	return PERF_COUNT_HW_CPU_CYCLES;
-}
-
 u64 hw_nmi_get_sample_period(int watchdog_thresh)
 {
 	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 79858af27209..4630eeae18e0 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -66,7 +66,6 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
 
 #ifdef CONFIG_LOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
-int hw_nmi_get_event(void);
 extern int nmi_watchdog_enabled;
 extern int soft_watchdog_enabled;
 extern int watchdog_user_enabled;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8dd30fcd91be..9acb29f280ec 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -315,12 +315,6 @@ static int is_softlockup(unsigned long touch_ts)
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 
-/* Can be overriden by architecture */
-__weak int hw_nmi_get_event(void)
-{
-	return PERF_COUNT_HW_CPU_CYCLES;
-}
-
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
 	.config		= PERF_COUNT_HW_CPU_CYCLES,
@@ -610,7 +604,6 @@ static int watchdog_nmi_enable(unsigned int cpu)
 
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
-	wd_attr->config = hw_nmi_get_event();
 
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-- 
cgit v1.3-8-gc7d7


From a1b7b1a57b9919a0abb6c93fca04ac9cf840c992 Mon Sep 17 00:00:00 2001
From: Alexander Popov <alex.popov@linux.com>
Date: Sun, 3 Jul 2016 03:24:08 +0300
Subject: irqdomain: Fix irq_domain_alloc_irqs_recursive() error handling

If an irq_domain is auto-recursive and irq_domain_alloc_irqs_recursive()
for its parent has returned an error, then do return and avoid calling
irq_domain_free_irqs_recursive() uselessly, because:
- if domain->ops->alloc() had failed for an auto-recursive irq_domain,
   then irq_domain_free_irqs_recursive() had already been called;
- if domain->ops->alloc() had failed for a not auto-recursive irq_domain,
   then there is nothing to free at all.

Signed-off-by: Alexander Popov <alex.popov@linux.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1467505448-2850-1-git-send-email-alex.popov@linux.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdomain.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index a82853739694..4752b43662e0 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1192,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
 	if (recursive)
 		ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
 						      nr_irqs, arg);
-	if (ret >= 0)
-		ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+	if (ret < 0)
+		return ret;
+
+	ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
 	if (ret < 0 && recursive)
 		irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
 
-- 
cgit v1.3-8-gc7d7


From a536a6e13ecd0d6eb0ffc36c5d56555896617282 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 11 Jul 2016 12:51:01 -0400
Subject: bpf: make inode code explicitly non-modular

The Kconfig currently controlling compilation of this code is:

init/Kconfig:config BPF_SYSCALL
init/Kconfig:   bool "Enable bpf() system call"

...meaning that it currently is not being built as a module by anyone.

Lets remove the couple traces of modular infrastructure use, so that
when reading the driver there is no doubt it is builtin-only.

Note that MODULE_ALIAS is a no-op for non-modular code.

We replace module.h with init.h since the file does use __init.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: netdev@vger.kernel.org
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb1cd..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
  * version 2 as published by the Free Software Foundation.
  */
 
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/magic.h>
 #include <linux/major.h>
 #include <linux/mount.h>
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {
 	.kill_sb	= kill_litter_super,
 };
 
-MODULE_ALIAS_FS("bpf");
-
 static int __init bpf_init(void)
 {
 	int ret;
-- 
cgit v1.3-8-gc7d7


From 6a4e24518c8a10f78f44da219835239cb5aca90d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:03 +0000
Subject: cpu/hotplug: Handle early registration gracefully

We switched the hotplug machinery to smpboot threads. Early registration of
hotplug callbacks, i.e. from do_pre_smp_initcalls(), happens before the
threads are initialized. Instead of moving the thread init, we simply handle
it in the hotplug code itself and invoke the function directly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153332.896450738@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7b61887f7ccd..fe71ce4e60f1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
 	if (!cpu_online(cpu))
 		return 0;
 
+	/*
+	 * If we are up and running, use the hotplug thread. For early calls
+	 * we invoke the thread function directly.
+	 */
+	if (!st->thread)
+		return cpuhp_invoke_callback(cpu, state, cb);
+
 	st->cb_state = state;
 	st->cb = cb;
 	/*
-- 
cgit v1.3-8-gc7d7


From 00e16c3d68fce504e880f59c9bdf23b2a4759d6d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:09 +0000
Subject: perf/core: Convert to hotplug state machine

Actually a nice symmetric startup/teardown pair which fits properly into
the state machine concept. In the long run we should be able to invoke
the startup callback for the boot CPU via the state machine and get
rid of the init function which invokes it on the boot CPU.

Note: This comes actually before the perf hardware callbacks. In the notifier
model the hardware callbacks have a higher priority than the core
callback. But that's solely for CPU offline so that hardware migration of
events happens before the core is notified about the outgoing CPU.

With the symetric state array model we have the following ordering:

 UP:     core -> hardware
 DOWN:   hardware -> core

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153333.587514098@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpuhotplug.h |  2 ++
 include/linux/perf_event.h |  9 ++++++++
 kernel/cpu.c               | 11 +++++++++
 kernel/events/core.c       | 56 +++++++++-------------------------------------
 4 files changed, 32 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index d769ec941fd3..067082e3fd41 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -4,6 +4,7 @@
 enum cpuhp_state {
 	CPUHP_OFFLINE,
 	CPUHP_CREATE_THREADS,
+	CPUHP_PERF_PREPARE,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
@@ -22,6 +23,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE_IDLE,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
+	CPUHP_AP_PERF_ONLINE,
 	CPUHP_AP_NOTIFY_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a827cecd62f..9abeb6948e70 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1354,4 +1354,13 @@ _name##_show(struct device *dev,					\
 									\
 static struct device_attribute format_attr_##_name = __ATTR_RO(_name)
 
+/* Performance counter hotplug functions */
+#ifdef CONFIG_PERF_EVENTS
+int perf_event_init_cpu(unsigned int cpu);
+int perf_event_exit_cpu(unsigned int cpu);
+#else
+#define perf_event_init_cpu	NULL
+#define perf_event_exit_cpu	NULL
+#endif
+
 #endif /* _LINUX_PERF_EVENT_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe71ce4e60f1..3705d9043c08 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1180,6 +1180,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.teardown		= NULL,
 		.cant_stop		= true,
 	},
+	[CPUHP_PERF_PREPARE] = {
+		.name = "perf prepare",
+		.startup = perf_event_init_cpu,
+		.teardown = perf_event_exit_cpu,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
@@ -1257,6 +1262,12 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup		= smpboot_unpark_threads,
 		.teardown		= NULL,
 	},
+	[CPUHP_AP_PERF_ONLINE] = {
+		.name = "perf online",
+		.startup = perf_event_init_cpu,
+		.teardown = perf_event_exit_cpu,
+	},
+
 	/*
 	 * Online/down_prepare notifiers. Will be removed once the notifiers
 	 * are converted to states.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 43d43a2d5811..f3ef1c29a7c9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10255,7 +10255,7 @@ static void __init perf_event_init_all_cpus(void)
 	}
 }
 
-static void perf_event_init_cpu(int cpu)
+int perf_event_init_cpu(unsigned int cpu)
 {
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
@@ -10268,6 +10268,7 @@ static void perf_event_init_cpu(int cpu)
 		rcu_assign_pointer(swhash->swevent_hlist, hlist);
 	}
 	mutex_unlock(&swhash->hlist_mutex);
+	return 0;
 }
 
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10299,14 +10300,17 @@ static void perf_event_exit_cpu_context(int cpu)
 	}
 	srcu_read_unlock(&pmus_srcu, idx);
 }
+#else
+
+static void perf_event_exit_cpu_context(int cpu) { }
+
+#endif
 
-static void perf_event_exit_cpu(int cpu)
+int perf_event_exit_cpu(unsigned int cpu)
 {
 	perf_event_exit_cpu_context(cpu);
+	return 0;
 }
-#else
-static inline void perf_event_exit_cpu(int cpu) { }
-#endif
 
 static int
 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10328,46 +10332,6 @@ static struct notifier_block perf_reboot_notifier = {
 	.priority = INT_MIN,
 };
 
-static int
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-	unsigned int cpu = (long)hcpu;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-
-	case CPU_UP_PREPARE:
-		/*
-		 * This must be done before the CPU comes alive, because the
-		 * moment we can run tasks we can encounter (software) events.
-		 *
-		 * Specifically, someone can have inherited events on kthreadd
-		 * or a pre-existing worker thread that gets re-bound.
-		 */
-		perf_event_init_cpu(cpu);
-		break;
-
-	case CPU_DOWN_PREPARE:
-		/*
-		 * This must be done before the CPU dies because after that an
-		 * active event might want to IPI the CPU and that'll not work
-		 * so great for dead CPUs.
-		 *
-		 * XXX smp_call_function_single() return -ENXIO without a warn
-		 * so we could possibly deal with this.
-		 *
-		 * This is safe against new events arriving because
-		 * sys_perf_event_open() serializes against hotplug using
-		 * get_online_cpus().
-		 */
-		perf_event_exit_cpu(cpu);
-		break;
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
 void __init perf_event_init(void)
 {
 	int ret;
@@ -10380,7 +10344,7 @@ void __init perf_event_init(void)
 	perf_pmu_register(&perf_cpu_clock, NULL, -1);
 	perf_pmu_register(&perf_task_clock, NULL, -1);
 	perf_tp_register();
-	perf_cpu_notifier(perf_cpu_notify);
+	perf_event_init_cpu(smp_processor_id());
 	register_reboot_notifier(&perf_reboot_notifier);
 
 	ret = init_hw_breakpoint();
-- 
cgit v1.3-8-gc7d7


From 7ee681b25284782ecf380bf5ccf55f13c52fd0ce Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:29 +0000
Subject: workqueue: Convert to state machine callbacks

Get rid of the prio ordering of the separate notifiers and use a proper state
callback pair.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153335.197083890@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpu.h        |   9 ----
 include/linux/cpuhotplug.h |   2 +
 include/linux/workqueue.h  |   6 +++
 kernel/cpu.c               |  10 +++++
 kernel/workqueue.c         | 108 ++++++++++++++++++---------------------------
 5 files changed, 61 insertions(+), 74 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index ca2dd865a34e..797d9c8e9a1b 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -55,15 +55,6 @@ extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 struct notifier_block;
 
-/*
- * CPU notifier priorities.
- */
-enum {
-	/* bring up workqueues before normal notifiers and down after */
-	CPU_PRI_WORKQUEUE_UP	= 5,
-	CPU_PRI_WORKQUEUE_DOWN	= -5,
-};
-
 #define CPU_ONLINE		0x0002 /* CPU (unsigned)v is up */
 #define CPU_UP_PREPARE		0x0003 /* CPU (unsigned)v coming up */
 #define CPU_UP_CANCELED		0x0004 /* CPU (unsigned)v NOT coming up */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index acfeda137df8..60557a9e783d 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -12,6 +12,7 @@ enum cpuhp_state {
 	CPUHP_PERF_BFIN,
 	CPUHP_PERF_POWER,
 	CPUHP_PERF_SUPERH,
+	CPUHP_WORKQUEUE_PREP,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
@@ -49,6 +50,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
 	CPUHP_AP_PERF_ARM_CCN_ONLINE,
+	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_NOTIFY_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index ca73c503b92a..26cc1df280d6 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -625,4 +625,10 @@ void wq_watchdog_touch(int cpu);
 static inline void wq_watchdog_touch(int cpu) { }
 #endif	/* CONFIG_WQ_WATCHDOG */
 
+#ifdef CONFIG_SMP
+int workqueue_prepare_cpu(unsigned int cpu);
+int workqueue_online_cpu(unsigned int cpu);
+int workqueue_offline_cpu(unsigned int cpu);
+#endif
+
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3705d9043c08..af53f820fec9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1185,6 +1185,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = perf_event_init_cpu,
 		.teardown = perf_event_exit_cpu,
 	},
+	[CPUHP_WORKQUEUE_PREP] = {
+		.name = "workqueue prepare",
+		.startup = workqueue_prepare_cpu,
+		.teardown = NULL,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
@@ -1267,6 +1272,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup = perf_event_init_cpu,
 		.teardown = perf_event_exit_cpu,
 	},
+	[CPUHP_AP_WORKQUEUE_ONLINE] = {
+		.name = "workqueue online",
+		.startup = workqueue_online_cpu,
+		.teardown = workqueue_offline_cpu,
+	},
 
 	/*
 	 * Online/down_prepare notifiers. Will be removed once the notifiers
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e1c0e996b5ae..c9dd5fbdbf33 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4611,84 +4611,65 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 						  pool->attrs->cpumask) < 0);
 }
 
-/*
- * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
- */
-static int workqueue_cpu_up_callback(struct notifier_block *nfb,
-					       unsigned long action,
-					       void *hcpu)
+int workqueue_prepare_cpu(unsigned int cpu)
+{
+	struct worker_pool *pool;
+
+	for_each_cpu_worker_pool(pool, cpu) {
+		if (pool->nr_workers)
+			continue;
+		if (!create_worker(pool))
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+int workqueue_online_cpu(unsigned int cpu)
 {
-	int cpu = (unsigned long)hcpu;
 	struct worker_pool *pool;
 	struct workqueue_struct *wq;
 	int pi;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_UP_PREPARE:
-		for_each_cpu_worker_pool(pool, cpu) {
-			if (pool->nr_workers)
-				continue;
-			if (!create_worker(pool))
-				return NOTIFY_BAD;
-		}
-		break;
-
-	case CPU_DOWN_FAILED:
-	case CPU_ONLINE:
-		mutex_lock(&wq_pool_mutex);
+	mutex_lock(&wq_pool_mutex);
 
-		for_each_pool(pool, pi) {
-			mutex_lock(&pool->attach_mutex);
+	for_each_pool(pool, pi) {
+		mutex_lock(&pool->attach_mutex);
 
-			if (pool->cpu == cpu)
-				rebind_workers(pool);
-			else if (pool->cpu < 0)
-				restore_unbound_workers_cpumask(pool, cpu);
+		if (pool->cpu == cpu)
+			rebind_workers(pool);
+		else if (pool->cpu < 0)
+			restore_unbound_workers_cpumask(pool, cpu);
 
-			mutex_unlock(&pool->attach_mutex);
-		}
+		mutex_unlock(&pool->attach_mutex);
+	}
 
-		/* update NUMA affinity of unbound workqueues */
-		list_for_each_entry(wq, &workqueues, list)
-			wq_update_unbound_numa(wq, cpu, true);
+	/* update NUMA affinity of unbound workqueues */
+	list_for_each_entry(wq, &workqueues, list)
+		wq_update_unbound_numa(wq, cpu, true);
 
-		mutex_unlock(&wq_pool_mutex);
-		break;
-	}
-	return NOTIFY_OK;
+	mutex_unlock(&wq_pool_mutex);
+	return 0;
 }
 
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int workqueue_cpu_down_callback(struct notifier_block *nfb,
-						 unsigned long action,
-						 void *hcpu)
+int workqueue_offline_cpu(unsigned int cpu)
 {
-	int cpu = (unsigned long)hcpu;
 	struct work_struct unbind_work;
 	struct workqueue_struct *wq;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_PREPARE:
-		/* unbinding per-cpu workers should happen on the local CPU */
-		INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
-		queue_work_on(cpu, system_highpri_wq, &unbind_work);
-
-		/* update NUMA affinity of unbound workqueues */
-		mutex_lock(&wq_pool_mutex);
-		list_for_each_entry(wq, &workqueues, list)
-			wq_update_unbound_numa(wq, cpu, false);
-		mutex_unlock(&wq_pool_mutex);
-
-		/* wait for per-cpu unbinding to finish */
-		flush_work(&unbind_work);
-		destroy_work_on_stack(&unbind_work);
-		break;
-	}
-	return NOTIFY_OK;
+	/* unbinding per-cpu workers should happen on the local CPU */
+	INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+	queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+	/* update NUMA affinity of unbound workqueues */
+	mutex_lock(&wq_pool_mutex);
+	list_for_each_entry(wq, &workqueues, list)
+		wq_update_unbound_numa(wq, cpu, false);
+	mutex_unlock(&wq_pool_mutex);
+
+	/* wait for per-cpu unbinding to finish */
+	flush_work(&unbind_work);
+	destroy_work_on_stack(&unbind_work);
+	return 0;
 }
 
 #ifdef CONFIG_SMP
@@ -5490,9 +5471,6 @@ static int __init init_workqueues(void)
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
-
 	wq_numa_init();
 
 	/* initialize CPU pools */
-- 
cgit v1.3-8-gc7d7


From 57430218317e5b280a80582a139b26029c25de6c Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 13 Jul 2016 16:50:01 +0200
Subject: sched/cputime: Count actually elapsed irq & softirq time

Currently, if there was any irq or softirq time during 'ticks'
jiffies, the entire period will be accounted as irq or softirq
time.

This is inaccurate if only a subset of the time was actually spent
handling irqs, and could conceivably mis-count all of the ticks during
a period as irq time, when there was some irq and some softirq time.

This can actually happen when irqtime_account_process_tick is called
from account_idle_ticks, which can pass a larger number of ticks down
all at once.

Fix this by changing irqtime_account_hi_update(), irqtime_account_si_update(),
and steal_account_process_ticks() to work with cputime_t time units, and
return the amount of time spent in each mode.

Rename steal_account_process_ticks() to steal_account_process_time(), to
reflect that time is now accounted in cputime_t, instead of ticks.

Additionally, have irqtime_account_process_tick() take into account how
much time was spent in each of steal, irq, and softirq time.

The latter could help improve the accuracy of cputime
accounting when returning from idle on a NO_HZ_IDLE CPU.

Properly accounting how much time was spent in hardirq and
softirq time will also allow the NO_HZ_FULL code to re-use
these same functions for hardirq and softirq accounting.

Signed-off-by: Rik van Riel <riel@redhat.com>
[ Make nsecs_to_cputime64() actually return cputime64_t. ]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1468421405-20056-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/cputime_nsecs.h |   2 +
 kernel/sched/cputime.c              | 124 ++++++++++++++++++++++--------------
 2 files changed, 79 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index 0f1c6f315cdc..a84e28e0c634 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -50,6 +50,8 @@ typedef u64 __nocast cputime64_t;
 	(__force u64)(__ct)
 #define nsecs_to_cputime(__nsecs)	\
 	(__force cputime_t)(__nsecs)
+#define nsecs_to_cputime64(__nsecs)	\
+	(__force cputime64_t)(__nsecs)
 
 
 /*
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 3d60e5d76fdb..db82ae12cf01 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -79,40 +79,50 @@ void irqtime_account_irq(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	unsigned long flags;
-	u64 latest_ns;
-	int ret = 0;
+	cputime_t irq_cputime;
 
 	local_irq_save(flags);
-	latest_ns = this_cpu_read(cpu_hardirq_time);
-	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
-		ret = 1;
+	irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
+		      cpustat[CPUTIME_IRQ];
+	irq_cputime = min(irq_cputime, maxtime);
+	cpustat[CPUTIME_IRQ] += irq_cputime;
 	local_irq_restore(flags);
-	return ret;
+	return irq_cputime;
 }
 
-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	unsigned long flags;
-	u64 latest_ns;
-	int ret = 0;
+	cputime_t softirq_cputime;
 
 	local_irq_save(flags);
-	latest_ns = this_cpu_read(cpu_softirq_time);
-	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
-		ret = 1;
+	softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
+			  cpustat[CPUTIME_SOFTIRQ];
+	softirq_cputime = min(softirq_cputime, maxtime);
+	cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
 	local_irq_restore(flags);
-	return ret;
+	return softirq_cputime;
 }
 
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #define sched_clock_irqtime	(0)
 
+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+	return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 
 static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,31 +267,44 @@ void account_idle_time(cputime_t cputime)
 		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
-static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
+static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
 	if (static_key_false(&paravirt_steal_enabled)) {
+		cputime_t steal_cputime;
 		u64 steal;
-		unsigned long steal_jiffies;
 
 		steal = paravirt_steal_clock(smp_processor_id());
 		steal -= this_rq()->prev_steal_time;
 
-		/*
-		 * steal is in nsecs but our caller is expecting steal
-		 * time in jiffies. Lets cast the result to jiffies
-		 * granularity and account the rest on the next rounds.
-		 */
-		steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
-		this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
+		steal_cputime = min(nsecs_to_cputime(steal), maxtime);
+		account_steal_time(steal_cputime);
+		this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
 
-		account_steal_time(jiffies_to_cputime(steal_jiffies));
-		return steal_jiffies;
+		return steal_cputime;
 	}
 #endif
 	return 0;
 }
 
+/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline cputime_t account_other_time(cputime_t max)
+{
+	cputime_t accounted;
+
+	accounted = steal_account_process_time(max);
+
+	if (accounted < max)
+		accounted += irqtime_account_hi_update(max - accounted);
+
+	if (accounted < max)
+		accounted += irqtime_account_si_update(max - accounted);
+
+	return accounted;
+}
+
 /*
  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
  * tasks (sum on group iteration) belonging to @tsk's group.
@@ -342,21 +365,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 					 struct rq *rq, int ticks)
 {
-	cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
-	u64 cputime = (__force u64) cputime_one_jiffy;
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+	cputime_t scaled, other;
 
-	if (steal_account_process_tick(ULONG_MAX))
+	/*
+	 * When returning from idle, many ticks can get accounted at
+	 * once, including some ticks of steal, irq, and softirq time.
+	 * Subtract those ticks from the amount of time accounted to
+	 * idle, or potentially user or system time. Due to rounding,
+	 * other time can exceed ticks occasionally.
+	 */
+	other = account_other_time(cputime);
+	if (other >= cputime)
 		return;
+	cputime -= other;
+	scaled = cputime_to_scaled(cputime);
 
-	cputime *= ticks;
-	scaled *= ticks;
-
-	if (irqtime_account_hi_update()) {
-		cpustat[CPUTIME_IRQ] += cputime;
-	} else if (irqtime_account_si_update()) {
-		cpustat[CPUTIME_SOFTIRQ] += cputime;
-	} else if (this_cpu_ksoftirqd() == p) {
+	if (this_cpu_ksoftirqd() == p) {
 		/*
 		 * ksoftirqd time do not get accounted in cpu_softirq_time.
 		 * So, we have to handle it separately here.
@@ -466,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
  */
 void account_process_tick(struct task_struct *p, int user_tick)
 {
-	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime_t cputime, scaled, steal;
 	struct rq *rq = this_rq();
 
 	if (vtime_accounting_cpu_enabled())
@@ -477,16 +502,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		return;
 	}
 
-	if (steal_account_process_tick(ULONG_MAX))
+	cputime = cputime_one_jiffy;
+	steal = steal_account_process_time(cputime);
+
+	if (steal >= cputime)
 		return;
 
+	cputime -= steal;
+	scaled = cputime_to_scaled(cputime);
+
 	if (user_tick)
-		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+		account_user_time(p, cputime, scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
-				    one_jiffy_scaled);
+		account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
 	else
-		account_idle_time(cputime_one_jiffy);
+		account_idle_time(cputime);
 }
 
 /*
@@ -681,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
 	unsigned long now = READ_ONCE(jiffies);
-	unsigned long delta_jiffies, steal_jiffies;
+	cputime_t delta, steal;
 
-	delta_jiffies = now - tsk->vtime_snap;
-	steal_jiffies = steal_account_process_tick(delta_jiffies);
+	delta = jiffies_to_cputime(now - tsk->vtime_snap);
+	steal = steal_account_process_time(delta);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
 	tsk->vtime_snap = now;
 
-	return jiffies_to_cputime(delta_jiffies - steal_jiffies);
+	return delta - steal;
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
-- 
cgit v1.3-8-gc7d7


From b58c35840521bb02b150e1d0d34ca9197f8b7145 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 13 Jul 2016 16:50:02 +0200
Subject: sched/cputime: Replace VTIME_GEN irq time code with
 IRQ_TIME_ACCOUNTING code

The CONFIG_VIRT_CPU_ACCOUNTING_GEN irq time tracking code does not
appear to currently work right.

On CPUs without nohz_full=, only tick based irq time sampling is
done, which breaks down when dealing with a nohz_idle CPU.

On firewalls and similar systems, no ticks may happen on a CPU for a
while, and the irq time spent may never get accounted properly. This
can cause issues with capacity planning and power saving, which use
the CPU statistics as inputs in decision making.

Remove the VTIME_GEN vtime irq time code, and replace it with the
IRQ_TIME_ACCOUNTING code, when selected as a config option by the user.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1468421405-20056-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/vtime.h  | 32 ++++++++++++++------------------
 init/Kconfig           |  6 +++---
 kernel/sched/cputime.c | 16 +++-------------
 3 files changed, 20 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index fa2196990f84..d1977d84ebdf 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -14,6 +14,18 @@ struct task_struct;
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static inline bool vtime_accounting_cpu_enabled(void) { return true; }
+
+#ifdef __ARCH_HAS_VTIME_ACCOUNT
+extern void vtime_account_irq_enter(struct task_struct *tsk);
+#else
+extern void vtime_common_account_irq_enter(struct task_struct *tsk);
+static inline void vtime_account_irq_enter(struct task_struct *tsk)
+{
+	if (vtime_accounting_cpu_enabled())
+		vtime_common_account_irq_enter(tsk);
+}
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -64,17 +76,6 @@ extern void vtime_account_system(struct task_struct *tsk);
 extern void vtime_account_idle(struct task_struct *tsk);
 extern void vtime_account_user(struct task_struct *tsk);
 
-#ifdef __ARCH_HAS_VTIME_ACCOUNT
-extern void vtime_account_irq_enter(struct task_struct *tsk);
-#else
-extern void vtime_common_account_irq_enter(struct task_struct *tsk);
-static inline void vtime_account_irq_enter(struct task_struct *tsk)
-{
-	if (vtime_accounting_cpu_enabled())
-		vtime_common_account_irq_enter(tsk);
-}
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 static inline void vtime_task_switch(struct task_struct *prev) { }
@@ -85,13 +86,8 @@ static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_gen_account_irq_exit(struct task_struct *tsk);
-
-static inline void vtime_account_irq_exit(struct task_struct *tsk)
-{
-	if (vtime_accounting_cpu_enabled())
-		vtime_gen_account_irq_exit(tsk);
-}
+static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
+static inline void vtime_account_irq_exit(struct task_struct *tsk) { }
 
 extern void vtime_user_enter(struct task_struct *tsk);
 
diff --git a/init/Kconfig b/init/Kconfig
index c02d89777713..787dd76acf29 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -375,9 +375,11 @@ config VIRT_CPU_ACCOUNTING_GEN
 
 	  If unsure, say N.
 
+endchoice
+
 config IRQ_TIME_ACCOUNTING
 	bool "Fine granularity task level IRQ time accounting"
-	depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL
+	depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
 	help
 	  Select this option to enable fine granularity task irq time
 	  accounting. This is done by reading a timestamp on each
@@ -386,8 +388,6 @@ config IRQ_TIME_ACCOUNTING
 
 	  If in doubt, say N here.
 
-endchoice
-
 config BSD_PROCESS_ACCT
 	bool "BSD Process Accounting"
 	depends on MULTIUSER
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index db82ae12cf01..ca7e33cb0967 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -711,14 +711,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
 	unsigned long now = READ_ONCE(jiffies);
-	cputime_t delta, steal;
+	cputime_t delta, other;
 
 	delta = jiffies_to_cputime(now - tsk->vtime_snap);
-	steal = steal_account_process_time(delta);
+	other = account_other_time(delta);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
 	tsk->vtime_snap = now;
 
-	return delta - steal;
+	return delta - other;
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
@@ -738,16 +738,6 @@ void vtime_account_system(struct task_struct *tsk)
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
-void vtime_gen_account_irq_exit(struct task_struct *tsk)
-{
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
-		__vtime_account_system(tsk);
-	if (context_tracking_in_user())
-		tsk->vtime_snap_whence = VTIME_USER;
-	write_seqcount_end(&tsk->vtime_seqcount);
-}
-
 void vtime_account_user(struct task_struct *tsk)
 {
 	cputime_t delta_cpu;
-- 
cgit v1.3-8-gc7d7


From 0cfdf9a198b0d4f5ad6c87d894db7830b796b2cc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 13 Jul 2016 16:50:03 +0200
Subject: sched/cputime: Clean up the old vtime gen irqtime accounting
 completely

Vtime generic irqtime accounting has been removed but there are a few
remnants to clean up:

* The vtime_accounting_cpu_enabled() check in irq entry was only used
  by CONFIG_VIRT_CPU_ACCOUNTING_GEN. We can safely remove it.

* Without the vtime_accounting_cpu_enabled(), we no longer need to
  have a vtime_common_account_irq_enter() indirect function.

* Move vtime_account_irq_enter() implementation under
  CONFIG_VIRT_CPU_ACCOUNTING_NATIVE which is the last user.

* The vtime_account_user() call was only used on irq entry for
  CONFIG_VIRT_CPU_ACCOUNTING_GEN. We can remove that too.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1468421405-20056-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/vtime.h  | 11 -----------
 kernel/sched/cputime.c | 33 ++++++++++-----------------------
 2 files changed, 10 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index d1977d84ebdf..65aef5e9d04e 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -14,18 +14,7 @@ struct task_struct;
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static inline bool vtime_accounting_cpu_enabled(void) { return true; }
-
-#ifdef __ARCH_HAS_VTIME_ACCOUNT
 extern void vtime_account_irq_enter(struct task_struct *tsk);
-#else
-extern void vtime_common_account_irq_enter(struct task_struct *tsk);
-static inline void vtime_account_irq_enter(struct task_struct *tsk)
-{
-	if (vtime_accounting_cpu_enabled())
-		vtime_common_account_irq_enter(tsk);
-}
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ca7e33cb0967..16a873c203b1 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -431,6 +431,10 @@ void vtime_common_task_switch(struct task_struct *prev)
 }
 #endif
 
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Archs that account the whole time spent in the idle task
  * (outside irq) as idle time can rely on this and just implement
@@ -440,33 +444,16 @@ void vtime_common_task_switch(struct task_struct *prev)
  * vtime_account().
  */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_common_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
 {
-	if (!in_interrupt()) {
-		/*
-		 * If we interrupted user, context_tracking_in_user()
-		 * is 1 because the context tracking don't hook
-		 * on irq entry/exit. This way we know if
-		 * we need to flush user time on kernel entry.
-		 */
-		if (context_tracking_in_user()) {
-			vtime_account_user(tsk);
-			return;
-		}
-
-		if (is_idle_task(tsk)) {
-			vtime_account_idle(tsk);
-			return;
-		}
-	}
-	vtime_account_system(tsk);
+	if (!in_interrupt() && is_idle_task(tsk))
+		vtime_account_idle(tsk);
+	else
+		vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	*ut = p->utime;
-- 
cgit v1.3-8-gc7d7


From 553bf6bbfd8a540c70aee28eb50e24caff456a03 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 13 Jul 2016 16:50:05 +0200
Subject: sched/cputime: Drop local_irq_save/restore from irqtime_account_irq()

Paolo pointed out that irqs are already blocked when irqtime_account_irq()
is called. That means there is no reason to call local_irq_save/restore()
again.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1468421405-20056-6-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 16a873c203b1..ea0f6f31a244 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
  */
 void irqtime_account_irq(struct task_struct *curr)
 {
-	unsigned long flags;
 	s64 delta;
 	int cpu;
 
 	if (!sched_clock_irqtime)
 		return;
 
-	local_irq_save(flags);
-
 	cpu = smp_processor_id();
 	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
 	__this_cpu_add(irq_start_time, delta);
@@ -75,7 +72,6 @@ void irqtime_account_irq(struct task_struct *curr)
 		__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
-	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
-- 
cgit v1.3-8-gc7d7


From 0b7a0fdb29715e38641beb39db4d01695b22b5aa Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Thu, 14 Jul 2016 10:59:19 -0400
Subject: audit: fix whitespace in CWD record

Fix the whitespace in the CWD record

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
[PM: fixed subject line]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ec4c552876a7..aa3feec4df14 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1430,7 +1430,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	if (context->pwd.dentry && context->pwd.mnt) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
 		if (ab) {
-			audit_log_d_path(ab, " cwd=", &context->pwd);
+			audit_log_d_path(ab, "cwd=", &context->pwd);
 			audit_log_end(ab);
 		}
 	}
-- 
cgit v1.3-8-gc7d7


From 27590dc17b34aedc4f3e14bd107ee59b9db9b0a6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 15 Jul 2016 10:41:04 +0200
Subject: hrtimer: Convert to hotplug state machine

Split out the clockevents callbacks instead of piggybacking them on
hrtimers.

This gets rid of a POST_DEAD user. See commit:

  54e88fad223c ("sched: Make sure timers have migrated before killing the migration_thread")

We just move the callback state to the proper place in the state machine.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.485419196@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpuhotplug.h |  1 +
 include/linux/hrtimer.h    |  7 +++++++
 kernel/cpu.c               |  5 +++++
 kernel/time/hrtimer.c      | 40 +++++-----------------------------------
 4 files changed, 18 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 070cc7f28862..04ecc55009ed 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -15,6 +15,7 @@ enum cpuhp_state {
 	CPUHP_X86_HPET_DEAD,
 	CPUHP_X86_APB_DEAD,
 	CPUHP_WORKQUEUE_PREP,
+	CPUHP_HRTIMERS_PREPARE,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index c98c6539e2c2..5e00f80b1535 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -494,4 +494,11 @@ extern void __init hrtimers_init(void);
 /* Show pending timers: */
 extern void sysrq_timer_list_show(void);
 
+int hrtimers_prepare_cpu(unsigned int cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+int hrtimers_dead_cpu(unsigned int cpu);
+#else
+#define hrtimers_dead_cpu	NULL
+#endif
+
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index af53f820fec9..85500e7fe238 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1190,6 +1190,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = workqueue_prepare_cpu,
 		.teardown = NULL,
 	},
+	[CPUHP_HRTIMERS_PREPARE] = {
+		.name = "hrtimers prepare",
+		.startup = hrtimers_prepare_cpu,
+		.teardown = hrtimers_dead_cpu,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d13c9aebf7a3..9ba7c820fc23 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 /*
  * Functions related to boot-time initialization:
  */
-static void init_hrtimers_cpu(int cpu)
+int hrtimers_prepare_cpu(unsigned int cpu)
 {
 	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
 	int i;
@@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu)
 
 	cpu_base->cpu = cpu;
 	hrtimer_init_hres(cpu_base);
+	return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }
 
-static void migrate_hrtimers(int scpu)
+int hrtimers_dead_cpu(unsigned int scpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
 	int i;
@@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu)
 	/* Check, if we got expired work to do */
 	__hrtimer_peek_ahead_timers();
 	local_irq_enable();
+	return 0;
 }
 
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int hrtimer_cpu_notify(struct notifier_block *self,
-					unsigned long action, void *hcpu)
-{
-	int scpu = (long)hcpu;
-
-	switch (action) {
-
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		init_hrtimers_cpu(scpu);
-		break;
-
-#ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		migrate_hrtimers(scpu);
-		break;
-#endif
-
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block hrtimers_nb = {
-	.notifier_call = hrtimer_cpu_notify,
-};
-
 void __init hrtimers_init(void)
 {
-	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
-			  (void *)(long)smp_processor_id());
-	register_cpu_notifier(&hrtimers_nb);
+	hrtimers_prepare_cpu(smp_processor_id());
 }
 
 /**
-- 
cgit v1.3-8-gc7d7


From 24f73b99716a9cd8cbb345c41ced6b3b5ed94006 Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:59 +0000
Subject: timers/core: Convert to hotplug state machine

When tearing down, call timers_dead_cpu() before notify_dead().
There is a hidden dependency between:

 - timers
 - block multiqueue
 - rcutree

If timers_dead_cpu() comes later than blk_mq_queue_reinit_notify()
that latter function causes a RCU stall.

Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.566790058@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpuhotplug.h |  1 +
 include/linux/timer.h      |  6 ++++++
 kernel/cpu.c               |  5 +++++
 kernel/time/timer.c        | 25 ++-----------------------
 4 files changed, 14 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 04ecc55009ed..15d46d2a1d16 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -16,6 +16,7 @@ enum cpuhp_state {
 	CPUHP_X86_APB_DEAD,
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_HRTIMERS_PREPARE,
+	CPUHP_TIMERS_DEAD,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 4419506b564e..51d601f192d4 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -273,4 +273,10 @@ unsigned long __round_jiffies_up_relative(unsigned long j, int cpu);
 unsigned long round_jiffies_up(unsigned long j);
 unsigned long round_jiffies_up_relative(unsigned long j);
 
+#ifdef CONFIG_HOTPLUG_CPU
+int timers_dead_cpu(unsigned int cpu);
+#else
+#define timers_dead_cpu NULL
+#endif
+
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 85500e7fe238..e1017d92d308 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1195,6 +1195,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = hrtimers_prepare_cpu,
 		.teardown = hrtimers_dead_cpu,
 	},
+	[CPUHP_TIMERS_DEAD] = {
+		.name = "timers dead",
+		.startup = NULL,
+		.teardown = timers_dead_cpu,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index cb9ab401e2d9..555670a5143c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1804,7 +1804,7 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
 	}
 }
 
-static void migrate_timers(int cpu)
+int timers_dead_cpu(unsigned int cpu)
 {
 	struct timer_base *old_base;
 	struct timer_base *new_base;
@@ -1831,29 +1831,9 @@ static void migrate_timers(int cpu)
 		spin_unlock_irq(&new_base->lock);
 		put_cpu_ptr(&timer_bases);
 	}
+	return 0;
 }
 
-static int timer_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *hcpu)
-{
-	switch (action) {
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		migrate_timers((long)hcpu);
-		break;
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static inline void timer_register_cpu_notifier(void)
-{
-	cpu_notifier(timer_cpu_notify, 0);
-}
-#else
-static inline void timer_register_cpu_notifier(void) { }
 #endif /* CONFIG_HOTPLUG_CPU */
 
 static void __init init_timer_cpu(int cpu)
@@ -1881,7 +1861,6 @@ void __init init_timers(void)
 {
 	init_timer_cpus();
 	init_timer_stats();
-	timer_register_cpu_notifier();
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 
-- 
cgit v1.3-8-gc7d7


From e722d8daafb974b9ad1bbaf42f384a5ea5929f5f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:59 +0000
Subject: profile: Convert to hotplug state machine

Install the callbacks via the state machine and let the core invoke
the callbacks on the already online CPUs. A lot of code is removed because
the for-loop is used and create_hash_tables() is removed since its purpose
is covered by the startup / teardown hooks.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.649867675@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpuhotplug.h |   1 +
 kernel/profile.c           | 181 ++++++++++++++++-----------------------------
 2 files changed, 66 insertions(+), 116 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 15d46d2a1d16..ace5ad0fc3ec 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -16,6 +16,7 @@ enum cpuhp_state {
 	CPUHP_X86_APB_DEAD,
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_HRTIMERS_PREPARE,
+	CPUHP_PROFILE_PREPARE,
 	CPUHP_TIMERS_DEAD,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e9901c9..2dbccf2d806c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -328,68 +328,57 @@ out:
 	put_cpu();
 }
 
-static int profile_cpu_callback(struct notifier_block *info,
-					unsigned long action, void *__cpu)
+static int profile_dead_cpu(unsigned int cpu)
 {
-	int node, cpu = (unsigned long)__cpu;
 	struct page *page;
+	int i;
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		node = cpu_to_mem(cpu);
-		per_cpu(cpu_profile_flip, cpu) = 0;
-		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = __alloc_pages_node(node,
-					GFP_KERNEL | __GFP_ZERO,
-					0);
-			if (!page)
-				return notifier_from_errno(-ENOMEM);
-			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
-		}
-		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = __alloc_pages_node(node,
-					GFP_KERNEL | __GFP_ZERO,
-					0);
-			if (!page)
-				goto out_free;
-			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
-		}
-		break;
-out_free:
-		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
-		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
-		__free_page(page);
-		return notifier_from_errno(-ENOMEM);
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		if (prof_cpu_mask != NULL)
-			cpumask_set_cpu(cpu, prof_cpu_mask);
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		if (prof_cpu_mask != NULL)
-			cpumask_clear_cpu(cpu, prof_cpu_mask);
-		if (per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
-			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+	if (prof_cpu_mask != NULL)
+		cpumask_clear_cpu(cpu, prof_cpu_mask);
+
+	for (i = 0; i < 2; i++) {
+		if (per_cpu(cpu_profile_hits, cpu)[i]) {
+			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
+			per_cpu(cpu_profile_hits, cpu)[i] = NULL;
 			__free_page(page);
 		}
-		if (per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
-			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
-			__free_page(page);
+	}
+	return 0;
+}
+
+static int profile_prepare_cpu(unsigned int cpu)
+{
+	int i, node = cpu_to_mem(cpu);
+	struct page *page;
+
+	per_cpu(cpu_profile_flip, cpu) = 0;
+
+	for (i = 0; i < 2; i++) {
+		if (per_cpu(cpu_profile_hits, cpu)[i])
+			continue;
+
+		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		if (!page) {
+			profile_dead_cpu(cpu);
+			return -ENOMEM;
 		}
-		break;
+		per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
+
 	}
-	return NOTIFY_OK;
+	return 0;
+}
+
+static int profile_online_cpu(unsigned int cpu)
+{
+	if (prof_cpu_mask != NULL)
+		cpumask_set_cpu(cpu, prof_cpu_mask);
+
+	return 0;
 }
+
 #else /* !CONFIG_SMP */
 #define profile_flip_buffers()		do { } while (0)
 #define profile_discard_flip_buffers()	do { } while (0)
-#define profile_cpu_callback		NULL
 
 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
@@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = {
 	.llseek		= default_llseek,
 };
 
-#ifdef CONFIG_SMP
-static void profile_nop(void *unused)
-{
-}
-
-static int create_hash_tables(void)
+int __ref create_proc_profile(void)
 {
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		int node = cpu_to_mem(cpu);
-		struct page *page;
-
-		page = __alloc_pages_node(node,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
-				0);
-		if (!page)
-			goto out_cleanup;
-		per_cpu(cpu_profile_hits, cpu)[1]
-				= (struct profile_hit *)page_address(page);
-		page = __alloc_pages_node(node,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
-				0);
-		if (!page)
-			goto out_cleanup;
-		per_cpu(cpu_profile_hits, cpu)[0]
-				= (struct profile_hit *)page_address(page);
-	}
-	return 0;
-out_cleanup:
-	prof_on = 0;
-	smp_mb();
-	on_each_cpu(profile_nop, NULL, 1);
-	for_each_online_cpu(cpu) {
-		struct page *page;
-
-		if (per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
-			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
-			__free_page(page);
-		}
-		if (per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
-			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
-			__free_page(page);
-		}
-	}
-	return -1;
-}
-#else
-#define create_hash_tables()			({ 0; })
+	struct proc_dir_entry *entry;
+#ifdef CONFIG_SMP
+	enum cpuhp_state online_state;
 #endif
 
-int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
-{
-	struct proc_dir_entry *entry;
 	int err = 0;
 
 	if (!prof_on)
 		return 0;
-
-	cpu_notifier_register_begin();
-
-	if (create_hash_tables()) {
-		err = -ENOMEM;
-		goto out;
-	}
-
+#ifdef CONFIG_SMP
+	err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
+				profile_prepare_cpu, profile_dead_cpu);
+	if (err)
+		return err;
+
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
+				profile_online_cpu, NULL);
+	if (err < 0)
+		goto err_state_prep;
+	online_state = err;
+	err = 0;
+#endif
 	entry = proc_create("profile", S_IWUSR | S_IRUGO,
 			    NULL, &proc_profile_operations);
 	if (!entry)
-		goto out;
+		goto err_state_onl;
 	proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
-	__hotcpu_notifier(profile_cpu_callback, 0);
 
-out:
-	cpu_notifier_register_done();
+	return err;
+err_state_onl:
+#ifdef CONFIG_SMP
+	cpuhp_remove_state(online_state);
+err_state_prep:
+	cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
+#endif
 	return err;
 }
 subsys_initcall(create_proc_profile);
-- 
cgit v1.3-8-gc7d7


From 31487f8328f20fdb302430b020a5d6e8446c1971 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 13 Jul 2016 17:17:01 +0000
Subject: smp/cfd: Convert core to hotplug state machine

Install the callbacks via the state machine. They are installed at runtime so
smpcfd_prepare_cpu() needs to be invoked by the boot-CPU.

Signed-off-by: Richard Weinberger <richard@nod.at>
[ Added the dropped CPU dying case back in. ]
Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Davidlohr Bueso <dave@stgolabs>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.818376366@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpuhotplug.h |  2 ++
 include/linux/smp.h        |  5 +++
 kernel/cpu.c               |  9 ++++++
 kernel/smp.c               | 79 +++++++++++++++++++---------------------------
 4 files changed, 48 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 78170827a776..b5cf01ace71b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -18,6 +18,7 @@ enum cpuhp_state {
 	CPUHP_HRTIMERS_PREPARE,
 	CPUHP_PROFILE_PREPARE,
 	CPUHP_X2APIC_PREPARE,
+	CPUHP_SMPCFD_PREPARE,
 	CPUHP_TIMERS_DEAD,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
@@ -57,6 +58,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_CORESIGHT4_STARTING,
 	CPUHP_AP_ARM64_ISNDEP_STARTING,
 	CPUHP_AP_LEDTRIG_STARTING,
+	CPUHP_AP_SMPCFD_DYING,
 	CPUHP_AP_X86_TBOOT_DYING,
 	CPUHP_AP_NOTIFY_STARTING,
 	CPUHP_AP_ONLINE,
diff --git a/include/linux/smp.h b/include/linux/smp.h
index c4414074bd88..eccae4690f41 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -196,4 +196,9 @@ extern void arch_enable_nonboot_cpus_end(void);
 
 void smp_setup_processor_id(void);
 
+/* SMP core functions */
+int smpcfd_prepare_cpu(unsigned int cpu);
+int smpcfd_dead_cpu(unsigned int cpu);
+int smpcfd_dying_cpu(unsigned int cpu);
+
 #endif /* __LINUX_SMP_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e1017d92d308..008e2fd40cb1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1195,6 +1195,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = hrtimers_prepare_cpu,
 		.teardown = hrtimers_dead_cpu,
 	},
+	[CPUHP_SMPCFD_PREPARE] = {
+		.name = "SMPCFD prepare",
+		.startup = smpcfd_prepare_cpu,
+		.teardown = smpcfd_dead_cpu,
+	},
 	[CPUHP_TIMERS_DEAD] = {
 		.name = "timers dead",
 		.startup = NULL,
@@ -1218,6 +1223,10 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.teardown		= NULL,
 		.cant_stop		= true,
 	},
+	[CPUHP_AP_SMPCFD_DYING] = {
+		.startup = NULL,
+		.teardown = smpcfd_dying_cpu,
+	},
 	/*
 	 * Handled on controll processor until the plugged processor manages
 	 * this itself.
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c240..7180491c9678 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
 static void flush_smp_call_function_queue(bool warn_cpu_offline);
 
-static int
-hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+int smpcfd_prepare_cpu(unsigned int cpu)
 {
-	long cpu = (long)hcpu;
 	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
-				cpu_to_node(cpu)))
-			return notifier_from_errno(-ENOMEM);
-		cfd->csd = alloc_percpu(struct call_single_data);
-		if (!cfd->csd) {
-			free_cpumask_var(cfd->cpumask);
-			return notifier_from_errno(-ENOMEM);
-		}
-		break;
-
-#ifdef CONFIG_HOTPLUG_CPU
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		/* Fall-through to the CPU_DEAD[_FROZEN] case. */
-
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
+	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+				     cpu_to_node(cpu)))
+		return -ENOMEM;
+	cfd->csd = alloc_percpu(struct call_single_data);
+	if (!cfd->csd) {
 		free_cpumask_var(cfd->cpumask);
-		free_percpu(cfd->csd);
-		break;
+		return -ENOMEM;
+	}
 
-	case CPU_DYING:
-	case CPU_DYING_FROZEN:
-		/*
-		 * The IPIs for the smp-call-function callbacks queued by other
-		 * CPUs might arrive late, either due to hardware latencies or
-		 * because this CPU disabled interrupts (inside stop-machine)
-		 * before the IPIs were sent. So flush out any pending callbacks
-		 * explicitly (without waiting for the IPIs to arrive), to
-		 * ensure that the outgoing CPU doesn't go offline with work
-		 * still pending.
-		 */
-		flush_smp_call_function_queue(false);
-		break;
-#endif
-	};
+	return 0;
+}
+
+int smpcfd_dead_cpu(unsigned int cpu)
+{
+	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
 
-	return NOTIFY_OK;
+	free_cpumask_var(cfd->cpumask);
+	free_percpu(cfd->csd);
+	return 0;
 }
 
-static struct notifier_block hotplug_cfd_notifier = {
-	.notifier_call		= hotplug_cfd,
-};
+int smpcfd_dying_cpu(unsigned int cpu)
+{
+	/*
+	 * The IPIs for the smp-call-function callbacks queued by other
+	 * CPUs might arrive late, either due to hardware latencies or
+	 * because this CPU disabled interrupts (inside stop-machine)
+	 * before the IPIs were sent. So flush out any pending callbacks
+	 * explicitly (without waiting for the IPIs to arrive), to
+	 * ensure that the outgoing CPU doesn't go offline with work
+	 * still pending.
+	 */
+	flush_smp_call_function_queue(false);
+	return 0;
+}
 
 void __init call_function_init(void)
 {
-	void *cpu = (void *)(long)smp_processor_id();
 	int i;
 
 	for_each_possible_cpu(i)
 		init_llist_head(&per_cpu(call_single_queue, i));
 
-	hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
-	register_cpu_notifier(&hotplug_cfd_notifier);
+	smpcfd_prepare_cpu(smp_processor_id());
 }
 
 /*
-- 
cgit v1.3-8-gc7d7


From 4df8374254ea9294dfe4b8c447a1b7eddc543dbf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jul 2016 17:17:03 +0000
Subject: rcu: Convert rcutree to hotplug state machine

Straight forward conversion to the state machine. Though the question arises
whether this needs really all these state transitions to work.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.982013161@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpuhotplug.h |   3 ++
 include/linux/rcutiny.h    |   7 +++
 include/linux/rcutree.h    |   7 +++
 kernel/cpu.c               |  14 ++++++
 kernel/rcu/tree.c          | 105 ++++++++++++++++++++++-----------------------
 5 files changed, 83 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 544b5563720e..201a2e23bc49 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -20,11 +20,13 @@ enum cpuhp_state {
 	CPUHP_X2APIC_PREPARE,
 	CPUHP_SMPCFD_PREPARE,
 	CPUHP_TIMERS_DEAD,
+	CPUHP_RCUTREE_PREP,
 	CPUHP_NOTIFY_PREPARE,
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
 	CPUHP_AP_SCHED_STARTING,
+	CPUHP_AP_RCUTREE_DYING,
 	CPUHP_AP_IRQ_GIC_STARTING,
 	CPUHP_AP_IRQ_GICV3_STARTING,
 	CPUHP_AP_IRQ_HIP04_STARTING,
@@ -80,6 +82,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
 	CPUHP_AP_PERF_ARM_CCN_ONLINE,
 	CPUHP_AP_WORKQUEUE_ONLINE,
+	CPUHP_AP_RCUTREE_ONLINE,
 	CPUHP_AP_NOTIFY_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 93aea75029fb..ac81e4063b40 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -243,4 +243,11 @@ static inline void rcu_all_qs(void)
 	barrier(); /* Avoid RCU read-side critical sections leaking across. */
 }
 
+/* RCUtree hotplug events */
+#define rcutree_prepare_cpu      NULL
+#define rcutree_online_cpu       NULL
+#define rcutree_offline_cpu      NULL
+#define rcutree_dead_cpu         NULL
+#define rcutree_dying_cpu        NULL
+
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 5043cb823fb2..63a4e4cf40a5 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -111,4 +111,11 @@ bool rcu_is_watching(void);
 
 void rcu_all_qs(void);
 
+/* RCUtree hotplug events */
+int rcutree_prepare_cpu(unsigned int cpu);
+int rcutree_online_cpu(unsigned int cpu);
+int rcutree_offline_cpu(unsigned int cpu);
+int rcutree_dead_cpu(unsigned int cpu);
+int rcutree_dying_cpu(unsigned int cpu);
+
 #endif /* __LINUX_RCUTREE_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 008e2fd40cb1..f24f45915b54 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1205,6 +1205,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = NULL,
 		.teardown = timers_dead_cpu,
 	},
+	[CPUHP_RCUTREE_PREP] = {
+		.name = "RCU-tree prepare",
+		.startup = rcutree_prepare_cpu,
+		.teardown = rcutree_dead_cpu,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
@@ -1263,6 +1268,10 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup		= sched_cpu_starting,
 		.teardown		= sched_cpu_dying,
 	},
+	[CPUHP_AP_RCUTREE_DYING] = {
+		.startup = NULL,
+		.teardown = rcutree_dying_cpu,
+	},
 	/*
 	 * Low level startup/teardown notifiers. Run with interrupts
 	 * disabled. Will be removed once the notifiers are converted to
@@ -1296,6 +1305,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup = workqueue_online_cpu,
 		.teardown = workqueue_offline_cpu,
 	},
+	[CPUHP_AP_RCUTREE_ONLINE] = {
+		.name = "RCU-tree online",
+		.startup = rcutree_online_cpu,
+		.teardown = rcutree_offline_cpu,
+	},
 
 	/*
 	 * Online/down_prepare notifiers. Will be removed once the notifiers
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f433959e9322..5d80925e7fc8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1073,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching);
  * offline to continue to use RCU for one jiffy after marking itself
  * offline in the cpu_online_mask.  This leniency is necessary given the
  * non-atomic nature of the online and offline processing, for example,
- * the fact that a CPU enters the scheduler after completing the CPU_DYING
- * notifiers.
+ * the fact that a CPU enters the scheduler after completing the teardown
+ * of the CPU.
  *
- * This is also why RCU internally marks CPUs online during the
- * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ * This is also why RCU internally marks CPUs online during in the
+ * preparation phase and offline after the CPU has been taken down.
  *
  * Disable checking if in an NMI handler because we cannot safely report
  * errors from NMI handlers anyway.
@@ -3806,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
-static void rcu_prepare_cpu(int cpu)
+int rcutree_prepare_cpu(unsigned int cpu)
 {
 	struct rcu_state *rsp;
 
 	for_each_rcu_flavor(rsp)
 		rcu_init_percpu_data(cpu, rsp);
+
+	rcu_prepare_kthreads(cpu);
+	rcu_spawn_all_nocb_kthreads(cpu);
+
+	return 0;
+}
+
+static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+{
+	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+
+	rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+}
+
+int rcutree_online_cpu(unsigned int cpu)
+{
+	sync_sched_exp_online_cleanup(cpu);
+	rcutree_affinity_setting(cpu, -1);
+	return 0;
+}
+
+int rcutree_offline_cpu(unsigned int cpu)
+{
+	rcutree_affinity_setting(cpu, cpu);
+	return 0;
+}
+
+
+int rcutree_dying_cpu(unsigned int cpu)
+{
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp)
+		rcu_cleanup_dying_cpu(rsp);
+	return 0;
+}
+
+int rcutree_dead_cpu(unsigned int cpu)
+{
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp) {
+		rcu_cleanup_dead_cpu(cpu, rsp);
+		do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+	}
+	return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -3851,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu)
 }
 #endif
 
-/*
- * Handle CPU online/offline notification events.
- */
-int rcu_cpu_notify(struct notifier_block *self,
-		   unsigned long action, void *hcpu)
-{
-	long cpu = (long)hcpu;
-	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
-	struct rcu_node *rnp = rdp->mynode;
-	struct rcu_state *rsp;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		rcu_prepare_cpu(cpu);
-		rcu_prepare_kthreads(cpu);
-		rcu_spawn_all_nocb_kthreads(cpu);
-		break;
-	case CPU_ONLINE:
-	case CPU_DOWN_FAILED:
-		sync_sched_exp_online_cleanup(cpu);
-		rcu_boost_kthread_setaffinity(rnp, -1);
-		break;
-	case CPU_DOWN_PREPARE:
-		rcu_boost_kthread_setaffinity(rnp, cpu);
-		break;
-	case CPU_DYING:
-	case CPU_DYING_FROZEN:
-		for_each_rcu_flavor(rsp)
-			rcu_cleanup_dying_cpu(rsp);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		for_each_rcu_flavor(rsp) {
-			rcu_cleanup_dead_cpu(cpu, rsp);
-			do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
-		}
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
 static int rcu_pm_notify(struct notifier_block *self,
 			 unsigned long action, void *hcpu)
 {
@@ -4208,10 +4208,9 @@ void __init rcu_init(void)
 	 * this is called early in boot, before either interrupts
 	 * or the scheduler are operational.
 	 */
-	cpu_notifier(rcu_cpu_notify, 0);
 	pm_notifier(rcu_pm_notify, 0);
 	for_each_online_cpu(cpu)
-		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+		rcutree_prepare_cpu(cpu);
 }
 
 #include "tree_exp.h"
-- 
cgit v1.3-8-gc7d7


From 7bd8830875bfa380c68f390efbad893293749324 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Jul 2016 06:35:24 -0500
Subject: cgroupns: Fix the locking in copy_cgroup_ns

If "clone(CLONE_NEWCGROUP...)" is called it results in a nice lockdep
valid splat.

In __cgroup_proc_write the lock ordering is:
     cgroup_mutex -- through cgroup_kn_lock_live
     cgroup_threadgroup_rwsem

In copy_process the guts of clone the lock ordering is:
     cgroup_threadgroup_rwsem -- through threadgroup_change_begin
     cgroup_mutex -- through copy_namespaces -- copy_cgroup_ns

lockdep reports some a different call chains for the first ordering of
cgroup_mutex and cgroup_threadgroup_rwsem but it is harder to trace.
This is most definitely deadlock potential under the right
circumstances.

Fix this by by skipping the cgroup_mutex and making the locking in
copy_cgroup_ns mirror the locking in cgroup_post_fork which also runs
during fork under the cgroup_threadgroup_rwsem.

Cc: stable@vger.kernel.org
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff00aca6..5f01e00cffc4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6309,14 +6309,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	mutex_lock(&cgroup_mutex);
+	/* It is not safe to take cgroup_mutex here */
 	spin_lock_irq(&css_set_lock);
-
 	cset = task_css_set(current);
 	get_css_set(cset);
-
 	spin_unlock_irq(&css_set_lock);
-	mutex_unlock(&cgroup_mutex);
 
 	new_ns = alloc_cgroup_ns();
 	if (IS_ERR(new_ns)) {
-- 
cgit v1.3-8-gc7d7


From eedd0f4cbf5f3b81e82649832091e1d9d53f0709 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Jul 2016 06:35:51 -0500
Subject: cgroupns: Close race between cgroup_post_fork and copy_cgroup_ns

In most code paths involving cgroup migration cgroup_threadgroup_rwsem
is taken.  There are two exceptions:

- remove_tasks_in_empty_cpuset calls cgroup_transfer_tasks
- vhost_attach_cgroups_work calls cgroup_attach_task_all

With cgroup_threadgroup_rwsem held it is guaranteed that cgroup_post_fork
and copy_cgroup_ns will reference the same css_set from the process calling
fork.

Without such an interlock there process after fork could reference one
css_set from it's new cgroup namespace and another css_set from
task->cgroups, which semantically is nonsensical.

Cc: stable@vger.kernel.org
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5f01e00cffc4..e75efa819911 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2962,6 +2962,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 	int retval = 0;
 
 	mutex_lock(&cgroup_mutex);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	for_each_root(root) {
 		struct cgroup *from_cgrp;
 
@@ -2976,6 +2977,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (retval)
 			break;
 	}
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	mutex_unlock(&cgroup_mutex);
 
 	return retval;
@@ -4343,6 +4345,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 
 	mutex_lock(&cgroup_mutex);
 
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
 	/* all tasks in @from are being moved, all csets are source */
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &from->cset_links, cset_link)
@@ -4371,6 +4375,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	} while (task && !ret);
 out_err:
 	cgroup_migrate_finish(&preloaded_csets);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
-- 
cgit v1.3-8-gc7d7


From 726a4994b05ff5b6f83d64b5b43c3251217366ce Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Jul 2016 06:36:44 -0500
Subject: cgroupns: Only allow creation of hierarchies in the initial cgroup
 namespace

Unprivileged users can't use hierarchies if they create them as they do not
have privilieges to the root directory.

Which means the only thing a hiearchy created by an unprivileged user
is good for is expanding the number of cgroup links in every css_set,
which is a DOS attack.

We could allow hierarchies to be created in namespaces in the initial
user namespace.  Unfortunately there is only a single namespace for
the names of heirarchies, so that is likely to create more confusion
than not.

So do the simple thing and restrict hiearchy creation to the initial
cgroup namespace.

Cc: stable@vger.kernel.org
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e75efa819911..e0be49fc382f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2215,12 +2215,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		goto out_unlock;
 	}
 
-	/*
-	 * We know this subsystem has not yet been bound.  Users in a non-init
-	 * user namespace may only mount hierarchies with no bound subsystems,
-	 * i.e. 'none,name=user1'
-	 */
-	if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+	/* Hierarchies may only be created in the initial cgroup namespace. */
+	if (ns != &init_cgroup_ns) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
-- 
cgit v1.3-8-gc7d7


From 406f992e4a372dafbe3c2cff7efbb2002a5c8ebd Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 14 Jul 2016 03:55:23 +0200
Subject: x86 / hibernate: Use hlt_play_dead() when resuming from hibernation

On Intel hardware, native_play_dead() uses mwait_play_dead() by
default and only falls back to the other methods if that fails.
That also happens during resume from hibernation, when the restore
(boot) kernel runs disable_nonboot_cpus() to take all of the CPUs
except for the boot one offline.

However, that is problematic, because the address passed to
__monitor() in mwait_play_dead() is likely to be written to in the
last phase of hibernate image restoration and that causes the "dead"
CPU to start executing instructions again.  Unfortunately, the page
containing the address in that CPU's instruction pointer may not be
valid any more at that point.

First, that page may have been overwritten with image kernel memory
contents already, so the instructions the CPU attempts to execute may
simply be invalid.  Second, the page tables previously used by that
CPU may have been overwritten by image kernel memory contents, so the
address in its instruction pointer is impossible to resolve then.

A report from Varun Koyyalagunta and investigation carried out by
Chen Yu show that the latter sometimes happens in practice.

To prevent it from happening, temporarily change the smp_ops.play_dead
pointer during resume from hibernation so that it points to a special
"play dead" routine which uses hlt_play_dead() and avoids the
inadvertent "revivals" of "dead" CPUs this way.

A slightly unpleasant consequence of this change is that if the
system is hibernated with one or more CPUs offline, it will generally
draw more power after resume than it did before hibernation, because
the physical state entered by CPUs via hlt_play_dead() is higher-power
than the mwait_play_dead() one in the majority of cases.  It is
possible to work around this, but it is unclear how much of a problem
that's going to be in practice, so the workaround will be implemented
later if it turns out to be necessary.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=106371
Reported-by: Varun Koyyalagunta <cpudebug@centtech.com>
Original-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/smp.h |  1 +
 arch/x86/kernel/smpboot.c  |  2 +-
 arch/x86/power/cpu.c       | 30 ++++++++++++++++++++++++++++++
 kernel/power/hibernate.c   |  7 ++++++-
 kernel/power/power.h       |  2 ++
 5 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 66b057306f40..7427ca895a27 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -135,6 +135,7 @@ int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
 int common_cpu_die(unsigned int cpu);
 void native_cpu_die(unsigned int cpu);
+void hlt_play_dead(void);
 void native_play_dead(void);
 void play_dead_common(void);
 void wbinvd_on_cpu(int cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fafe8b923cac..8264dfad9cf8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1622,7 +1622,7 @@ static inline void mwait_play_dead(void)
 	}
 }
 
-static inline void hlt_play_dead(void)
+void hlt_play_dead(void)
 {
 	if (__this_cpu_read(cpu_info.x86) >= 4)
 		wbinvd();
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index d5f64996394a..b12c26e2e309 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -12,6 +12,7 @@
 #include <linux/export.h>
 #include <linux/smp.h>
 #include <linux/perf_event.h>
+#include <linux/tboot.h>
 
 #include <asm/pgtable.h>
 #include <asm/proto.h>
@@ -266,6 +267,35 @@ void notrace restore_processor_state(void)
 EXPORT_SYMBOL(restore_processor_state);
 #endif
 
+#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU)
+static void resume_play_dead(void)
+{
+	play_dead_common();
+	tboot_shutdown(TB_SHUTDOWN_WFS);
+	hlt_play_dead();
+}
+
+int hibernate_resume_nonboot_cpu_disable(void)
+{
+	void (*play_dead)(void) = smp_ops.play_dead;
+	int ret;
+
+	/*
+	 * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop
+	 * during hibernate image restoration, because it is likely that the
+	 * monitored address will be actually written to at that time and then
+	 * the "dead" CPU will attempt to execute instructions again, but the
+	 * address in its instruction pointer may not be possible to resolve
+	 * any more at that point (the page tables used by it previously may
+	 * have been overwritten by hibernate image data).
+	 */
+	smp_ops.play_dead = resume_play_dead;
+	ret = disable_nonboot_cpus();
+	smp_ops.play_dead = play_dead;
+	return ret;
+}
+#endif
+
 /*
  * When bsp_check() is called in hibernate and suspend, cpu hotplug
  * is disabled already. So it's unnessary to handle race condition between
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 51441d87f0b6..5f3523e18e46 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -409,6 +409,11 @@ int hibernation_snapshot(int platform_mode)
 	goto Close;
 }
 
+int __weak hibernate_resume_nonboot_cpu_disable(void)
+{
+	return disable_nonboot_cpus();
+}
+
 /**
  * resume_target_kernel - Restore system state from a hibernation image.
  * @platform_mode: Whether or not to use the platform driver.
@@ -433,7 +438,7 @@ static int resume_target_kernel(bool platform_mode)
 	if (error)
 		goto Cleanup;
 
-	error = disable_nonboot_cpus();
+	error = hibernate_resume_nonboot_cpu_disable();
 	if (error)
 		goto Enable_cpus;
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 064963e89194..242d8b827dd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
 }
 #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
 
+extern int hibernate_resume_nonboot_cpu_disable(void);
+
 /*
  * Keep some memory free so that I/O operations can succeed without paging
  * [Might this be more than 4 MB?]
-- 
cgit v1.3-8-gc7d7


From 7e3f977edd0bd9ea6104156feba95bb5ae9bdd38 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Jul 2016 18:08:03 +0200
Subject: perf, events: add non-linear data support for raw records

This patch adds support for non-linear data on raw records. It
extends raw records to have one or multiple fragments that will
be written linearly into the ring slot, where each fragment can
optionally have a custom callback handler to walk and extract
complex, possibly non-linear data.

If a callback handler is provided for a fragment, then the new
__output_custom() will be used instead of __output_copy() for
the perf_output_sample() part. perf_prepare_sample() does all
the size calculation only once, so perf_output_sample() doesn't
need to redo the same work anymore, meaning real_size and padding
will be cached in the raw record. The raw record becomes 32 bytes
in size without holes; to not increase it further and to avoid
doing unnecessary recalculations in fast-path, we can reuse
next pointer of the last fragment, idea here is borrowed from
ZERO_OR_NULL_PTR(), which should keep the perf_output_sample()
path for PERF_SAMPLE_RAW minimal.

This facility is needed for BPF's event output helper as a first
user that will, in a follow-up, add an additional perf_raw_frag
to its perf_raw_record in order to be able to more efficiently
dump skb context after a linear head meta data related to it.
skbs can be non-linear and thus need a custom output function to
dump buffers. Currently, the skb data needs to be copied twice;
with the help of __output_custom() this work only needs to be
done once. Future users could be things like XDP/BPF programs
that work on different context though and would thus also have
a different callback function.

The few users of raw records are adapted to initialize their frag
data from the raw record itself, no change in behavior for them.
The code is based upon a PoC diff provided by Peter Zijlstra [1].

  [1] http://thread.gmane.org/gmane.linux.network/421294

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/s390/kernel/perf_cpum_sf.c |  9 ++++--
 arch/x86/events/amd/ibs.c       |  8 +++--
 include/linux/perf_event.h      | 20 ++++++++++++-
 kernel/events/core.c            | 66 ++++++++++++++++++++++++++++-------------
 kernel/events/internal.h        | 16 +++++++---
 kernel/trace/bpf_trace.c        |  6 ++--
 6 files changed, 93 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index a8e832166417..92619cce57ed 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -979,12 +979,15 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 	struct pt_regs regs;
 	struct perf_sf_sde_regs *sde_regs;
 	struct perf_sample_data data;
-	struct perf_raw_record raw;
+	struct perf_raw_record raw = {
+		.frag = {
+			.size = sfr->size,
+			.data = sfr,
+		},
+	};
 
 	/* Setup perf sample */
 	perf_sample_data_init(&data, 0, event->hw.last_period);
-	raw.size = sfr->size;
-	raw.data = sfr;
 	data.raw = &raw;
 
 	/* Setup pt_regs to look like an CPU-measurement external interrupt
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index feb90f6730e8..72dea2f40fc4 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -655,8 +655,12 @@ fail:
 	}
 
 	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		raw.size = sizeof(u32) + ibs_data.size;
-		raw.data = ibs_data.data;
+		raw = (struct perf_raw_record){
+			.frag = {
+				.size = sizeof(u32) + ibs_data.size,
+				.data = ibs_data.data,
+			},
+		};
 		data.raw = &raw;
 	}
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a827cecd62f..e79e6c6fed89 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -69,9 +69,22 @@ struct perf_callchain_entry_ctx {
 	bool			    contexts_maxed;
 };
 
+typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
+				     unsigned long len);
+
+struct perf_raw_frag {
+	union {
+		struct perf_raw_frag	*next;
+		unsigned long		pad;
+	};
+	perf_copy_f			copy;
+	void				*data;
+	u32				size;
+} __packed;
+
 struct perf_raw_record {
+	struct perf_raw_frag		frag;
 	u32				size;
-	void				*data;
 };
 
 /*
@@ -1283,6 +1296,11 @@ extern void perf_restore_debug_store(void);
 static inline void perf_restore_debug_store(void)			{ }
 #endif
 
+static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
+{
+	return frag->pad < sizeof(u64);
+}
+
 #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9c51ec3f0f44..b1891b6b5c1f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5553,16 +5553,26 @@ void perf_output_sample(struct perf_output_handle *handle,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		if (data->raw) {
-			u32 raw_size = data->raw->size;
-			u32 real_size = round_up(raw_size + sizeof(u32),
-						 sizeof(u64)) - sizeof(u32);
-			u64 zero = 0;
-
-			perf_output_put(handle, real_size);
-			__output_copy(handle, data->raw->data, raw_size);
-			if (real_size - raw_size)
-				__output_copy(handle, &zero, real_size - raw_size);
+		struct perf_raw_record *raw = data->raw;
+
+		if (raw) {
+			struct perf_raw_frag *frag = &raw->frag;
+
+			perf_output_put(handle, raw->size);
+			do {
+				if (frag->copy) {
+					__output_custom(handle, frag->copy,
+							frag->data, frag->size);
+				} else {
+					__output_copy(handle, frag->data,
+						      frag->size);
+				}
+				if (perf_raw_frag_last(frag))
+					break;
+				frag = frag->next;
+			} while (1);
+			if (frag->pad)
+				__output_skip(handle, NULL, frag->pad);
 		} else {
 			struct {
 				u32	size;
@@ -5687,14 +5697,28 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		int size = sizeof(u32);
-
-		if (data->raw)
-			size += data->raw->size;
-		else
-			size += sizeof(u32);
+		struct perf_raw_record *raw = data->raw;
+		int size;
+
+		if (raw) {
+			struct perf_raw_frag *frag = &raw->frag;
+			u32 sum = 0;
+
+			do {
+				sum += frag->size;
+				if (perf_raw_frag_last(frag))
+					break;
+				frag = frag->next;
+			} while (1);
+
+			size = round_up(sum + sizeof(u32), sizeof(u64));
+			raw->size = size - sizeof(u32);
+			frag->pad = raw->size - sum;
+		} else {
+			size = sizeof(u64);
+		}
 
-		header->size += round_up(size, sizeof(u64));
+		header->size += size;
 	}
 
 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -7331,7 +7355,7 @@ static struct pmu perf_swevent = {
 static int perf_tp_filter_match(struct perf_event *event,
 				struct perf_sample_data *data)
 {
-	void *record = data->raw->data;
+	void *record = data->raw->frag.data;
 
 	/* only top level events have filters set */
 	if (event->parent)
@@ -7387,8 +7411,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 	struct perf_event *event;
 
 	struct perf_raw_record raw = {
-		.size = entry_size,
-		.data = record,
+		.frag = {
+			.size = entry_size,
+			.data = record,
+		},
 	};
 
 	perf_sample_data_init(&data, 0, 0);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d626df..2417eb5512cd 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,10 +123,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 	return rb->aux_nr_pages << PAGE_SHIFT;
 }
 
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
-static inline unsigned long						\
-func_name(struct perf_output_handle *handle,				\
-	  const void *buf, unsigned long len)				\
+#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func)				\
 {									\
 	unsigned long size, written;					\
 									\
@@ -152,6 +149,17 @@ func_name(struct perf_output_handle *handle,				\
 	return len;							\
 }
 
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
+static inline unsigned long						\
+func_name(struct perf_output_handle *handle,				\
+	  const void *buf, unsigned long len)				\
+__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
+
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+		const void *buf, unsigned long len)
+__DEFINE_OUTPUT_COPY_BODY(copy_func)
+
 static inline unsigned long
 memcpy_common(void *dst, const void *src, unsigned long n)
 {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 094c716154ed..35ab1b2b041b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -245,8 +245,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 	struct perf_raw_record raw = {
-		.size = size,
-		.data = data,
+		.frag = {
+			.size = size,
+			.data = data,
+		},
 	};
 
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
-- 
cgit v1.3-8-gc7d7


From 8e7a3920ac277dd4e690c0e70c9750176e3acb83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Jul 2016 18:08:04 +0200
Subject: bpf, perf: split bpf_perf_event_output

Split the bpf_perf_event_output() helper as a preparation into
two parts. The new bpf_perf_event_output() will prepare the raw
record itself and test for unknown flags from BPF trace context,
where the __bpf_perf_event_output() does the core work. The
latter will be reused later on from bpf_event_output() directly.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 35ab1b2b041b..c35883a9bc11 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -233,26 +233,17 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+static __always_inline u64
+__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
+			u64 flags, struct perf_raw_record *raw)
 {
-	struct pt_regs *regs = (struct pt_regs *) (long) r1;
-	struct bpf_map *map = (struct bpf_map *) (long) r2;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
-	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
-	struct perf_raw_record raw = {
-		.frag = {
-			.size = size,
-			.data = data,
-		},
-	};
 
-	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
-		return -EINVAL;
 	if (index == BPF_F_CURRENT_CPU)
 		index = cpu;
 	if (unlikely(index >= array->map.max_entries))
@@ -271,11 +262,29 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 		return -EOPNOTSUPP;
 
 	perf_sample_data_init(&sample_data, 0, 0);
-	sample_data.raw = &raw;
+	sample_data.raw = raw;
 	perf_event_output(event, &sample_data, regs);
 	return 0;
 }
 
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+	struct pt_regs *regs = (struct pt_regs *)(long) r1;
+	struct bpf_map *map  = (struct bpf_map *)(long) r2;
+	void *data = (void *)(long) r4;
+	struct perf_raw_record raw = {
+		.frag = {
+			.size = size,
+			.data = data,
+		},
+	};
+
+	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+		return -EINVAL;
+
+	return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
 static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.func		= bpf_perf_event_output,
 	.gpl_only	= true,
-- 
cgit v1.3-8-gc7d7


From 555c8a8623a3a87b3c990ba30b7fd2e5914e41d2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Jul 2016 18:08:05 +0200
Subject: bpf: avoid stack copy and use skb ctx for event output

This work addresses a couple of issues bpf_skb_event_output()
helper currently has: i) We need two copies instead of just a
single one for the skb data when it should be part of a sample.
The data can be non-linear and thus needs to be extracted via
bpf_skb_load_bytes() helper first, and then copied once again
into the ring buffer slot. ii) Since bpf_skb_load_bytes()
currently needs to be used first, the helper needs to see a
constant size on the passed stack buffer to make sure BPF
verifier can do sanity checks on it during verification time.
Thus, just passing skb->len (or any other non-constant value)
wouldn't work, but changing bpf_skb_load_bytes() is also not
the proper solution, since the two copies are generally still
needed. iii) bpf_skb_load_bytes() is just for rather small
buffers like headers, since they need to sit on the limited
BPF stack anyway. Instead of working around in bpf_skb_load_bytes(),
this work improves the bpf_skb_event_output() helper to address
all 3 at once.

We can make use of the passed in skb context that we have in
the helper anyway, and use some of the reserved flag bits as
a length argument. The helper will use the new __output_custom()
facility from perf side with bpf_skb_copy() as callback helper
to walk and extract the data. It will pass the data for setup
to bpf_event_output(), which generates and pushes the raw record
with an additional frag part. The linear data used in the first
frag of the record serves as programmatically defined meta data
passed along with the appended sample.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  7 ++++++-
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/core.c        |  6 ++++--
 kernel/trace/bpf_trace.c | 33 +++++++++++++++------------------
 net/core/filter.c        | 43 ++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 69 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b3336b4f5d04..c13e92b00bf5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -209,7 +209,12 @@ u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
-const struct bpf_func_proto *bpf_get_event_output_proto(void);
+
+typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
+					unsigned long len);
+
+u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
 
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 262a7e883b19..c4d922439d20 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -401,6 +401,8 @@ enum bpf_func_id {
 /* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
+/* BPF_FUNC_perf_event_output for sk_buff input context. */
+#define BPF_F_CTXLEN_MASK		(0xfffffULL << 32)
 
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d638062f66d6..03fd23d4d587 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1054,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 	return NULL;
 }
 
-const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+		 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
-	return NULL;
+	return -ENOTSUPP;
 }
 
 /* Always built-in helper functions. */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c35883a9bc11..ebfbb7dd7033 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -298,29 +298,26 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
 
-static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
 	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+	struct perf_raw_frag frag = {
+		.copy		= ctx_copy,
+		.size		= ctx_size,
+		.data		= ctx,
+	};
+	struct perf_raw_record raw = {
+		.frag = {
+			.next	= ctx_size ? &frag : NULL,
+			.size	= meta_size,
+			.data	= meta,
+		},
+	};
 
 	perf_fetch_caller_regs(regs);
 
-	return bpf_perf_event_output((long)regs, r2, flags, r4, size);
-}
-
-static const struct bpf_func_proto bpf_event_output_proto = {
-	.func		= bpf_event_output,
-	.gpl_only	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_CONST_MAP_PTR,
-	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
-};
-
-const struct bpf_func_proto *bpf_get_event_output_proto(void)
-{
-	return &bpf_event_output_proto;
+	return __bpf_perf_event_output(regs, map, flags, &raw);
 }
 
 static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
diff --git a/net/core/filter.c b/net/core/filter.c
index 10c4a2f9e8bb..22e3992c8b48 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2025,6 +2025,47 @@ bool bpf_helper_changes_skb_data(void *func)
 	return false;
 }
 
+static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
+				  unsigned long len)
+{
+	void *ptr = skb_header_pointer(skb, 0, len, dst_buff);
+
+	if (unlikely(!ptr))
+		return len;
+	if (ptr != dst_buff)
+		memcpy(dst_buff, ptr, len);
+
+	return 0;
+}
+
+static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
+				u64 meta_size)
+{
+	struct sk_buff *skb = (struct sk_buff *)(long) r1;
+	struct bpf_map *map = (struct bpf_map *)(long) r2;
+	u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+	void *meta = (void *)(long) r4;
+
+	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+		return -EINVAL;
+	if (unlikely(skb_size > skb->len))
+		return -EFAULT;
+
+	return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
+				bpf_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_skb_event_output_proto = {
+	.func		= bpf_skb_event_output,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
 static unsigned short bpf_tunnel_key_af(u64 flags)
 {
 	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
@@ -2357,7 +2398,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_get_hash_recalc:
 		return &bpf_get_hash_recalc_proto;
 	case BPF_FUNC_perf_event_output:
-		return bpf_get_event_output_proto();
+		return &bpf_skb_event_output_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
 #ifdef CONFIG_SOCK_CGROUP_DATA
-- 
cgit v1.3-8-gc7d7


From 858d68f10238fdd1ebdd0096f912f063e97c6766 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 16 Jul 2016 01:15:55 +0200
Subject: bpf: bpf_event_entry_gen's alloc needs to be in atomic context

Should have been obvious, only called from bpf() syscall via map_update_elem()
that calls bpf_fd_array_map_update_elem() under RCU read lock and thus this
must also be in GFP_ATOMIC, of course.

Fixes: 3b1efb196eee ("bpf, maps: flush own entries on perf map release")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index db1a743e3db2..633a650d7aeb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -430,7 +430,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
 {
 	struct bpf_event_entry *ee;
 
-	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
 	if (ee) {
 		ee->event = perf_file->private_data;
 		ee->perf_file = perf_file;
-- 
cgit v1.3-8-gc7d7


From 775be506266a860f141f6b848c92c316c602a94f Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben.dooks@codethink.co.uk>
Date: Fri, 17 Jun 2016 16:56:14 +0100
Subject: clockevents: Make clockevents_subsys static

The clockevents_subsys struct is used for sysfs support and
is not declared or used outside the file it is defined in.
Fix the following warning by making it static:

kernel/time/clockevents.c:648:17: warning: symbol 'clockevents_subsys' was not declared. Should it be static?

Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
Cc: linux-kernel@lists.codethink.co.uk
Link: http://lkml.kernel.org/r/1466178974-7105-1-git-send-email-ben.dooks@codethink.co.uk
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a9b76a40319e..2c5bc77c0bb0 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu)
 #endif
 
 #ifdef CONFIG_SYSFS
-struct bus_type clockevents_subsys = {
+static struct bus_type clockevents_subsys = {
 	.name		= "clockevents",
 	.dev_name       = "clockevent",
 };
-- 
cgit v1.3-8-gc7d7


From eb0dc47ab6810c432e8193beccd9905ba0db8b22 Mon Sep 17 00:00:00 2001
From: Vincent Stehle <vincent.stehle@laposte.net>
Date: Mon, 18 Jul 2016 22:56:26 +0200
Subject: genirq: Fix missing irq allocation affinity hint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new affinity hint argument of __irq_domain_alloc_irqs() is missing in
irq_reserve_ipi(). Add it.

This fixes the following compilation error:

  kernel/irq/ipi.c: In function ‘irq_reserve_ipi’:
  kernel/irq/ipi.c:85:9: error: too few arguments to function ‘__irq_domain_alloc_irqs’
    virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
           ^
Fixes: 06ee6d571f0e ("genirq: Add affinity hint to irq allocation")
Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>
Cc: linux-pci@vger.kernel.org
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/ipi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 4fd23510d5f2..1a9abc1c8ea0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -83,7 +83,7 @@ int irq_reserve_ipi(struct irq_domain *domain,
 	}
 
 	virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
-				       (void *) dest, true);
+				       (void *) dest, true, NULL);
 
 	if (virq <= 0) {
 		pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
-- 
cgit v1.3-8-gc7d7


From 1f3b0f8243cb934307f59bd4d8e43b868e61d4d9 Mon Sep 17 00:00:00 2001
From: Gaurav Jindal <Gaurav.Jindal@spreadtrum.com>
Date: Thu, 14 Jul 2016 12:04:20 +0000
Subject: tick/nohz: Optimize nohz idle enter

tick_nohz_start_idle is called before checking whether the idle tick can be
stopped. If the tick cannot be stopped, calling tick_nohz_start_idle() is
pointless and just wasting CPU cycles.

Only invoke tick_nohz_start_idle() when can_stop_idle_tick() returns true. A
short one minute observation of the effect on ARM64 shows a reduction of calls
by 1.5% thus optimizing the idle entry sequence.

[tglx: Massaged changelog ]

Co-developed-by: Sanjeev Yadav<sanjeev.yadav@spreadtrum.com>
Signed-off-by: Gaurav Jindal<gaurav.jindal@spreadtrum.com>
Link: http://lkml.kernel.org/r/20160714120416.GB21099@gaurav.jindal@spreadtrum.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2ec7c00228f3..204fdc86863d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -908,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
 	ktime_t now, expires;
 	int cpu = smp_processor_id();
 
-	now = tick_nohz_start_idle(ts);
-
 	if (can_stop_idle_tick(cpu, ts)) {
 		int was_stopped = ts->tick_stopped;
 
+		now = tick_nohz_start_idle(ts);
 		ts->idle_calls++;
 
 		expires = tick_nohz_stop_sched_tick(ts, now, cpu);
-- 
cgit v1.3-8-gc7d7


From 55094f57535831883b60776de5eb78c6bfb3c16d Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Tue, 19 Jul 2016 12:02:39 +0000
Subject: cgroup: remove duplicated include from cgroup.c

Remove duplicated include.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dd26e1bb7222..861995c7fc3f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,6 @@
 #include <linux/cpuset.h>
 #include <linux/proc_ns.h>
 #include <linux/nsproxy.h>
-#include <linux/proc_ns.h>
 #include <net/sock.h>
 
 /*
-- 
cgit v1.3-8-gc7d7


From 183fc1537ec39be242dc8b619f71fc11b393d295 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 18 Jul 2016 15:50:58 -0700
Subject: kernel/trace/bpf_trace.c: work around gcc-4.4.4 anon union
 initialization bug

kernel/trace/bpf_trace.c: In function 'bpf_event_output':
kernel/trace/bpf_trace.c:312: error: unknown field 'next' specified in initializer
kernel/trace/bpf_trace.c:312: warning: missing braces around initializer
kernel/trace/bpf_trace.c:312: warning: (near initialization for 'raw.frag.<anonymous>')

Fixes: 555c8a8623a3a87 ("bpf: avoid stack copy and use skb ctx for event output")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ebfbb7dd7033..a12bbd32c0a6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -309,7 +309,9 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	};
 	struct perf_raw_record raw = {
 		.frag = {
-			.next	= ctx_size ? &frag : NULL,
+			{
+				.next	= ctx_size ? &frag : NULL,
+			},
 			.size	= meta_size,
 			.data	= meta,
 		},
-- 
cgit v1.3-8-gc7d7


From 59d3656d5bf504f771fc44fdbc7a9a8590795f22 Mon Sep 17 00:00:00 2001
From: Brenden Blanco <bblanco@plumgrid.com>
Date: Tue, 19 Jul 2016 12:16:46 -0700
Subject: bpf: add bpf_prog_add api for bulk prog refcnt

A subsystem may need to store many copies of a bpf program, each
deserving its own reference. Rather than requiring the caller to loop
one by one (with possible mid-loop failure), add a bulk bpf_prog_add
api.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h  |  1 +
 kernel/bpf/syscall.c | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c13e92b00bf5..75a5ae6bee07 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -224,6 +224,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
 struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 96d938a22050..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -670,14 +670,20 @@ static struct bpf_prog *____bpf_prog_get(struct fd f)
 	return f.file->private_data;
 }
 
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
 {
-	if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
-		atomic_dec(&prog->aux->refcnt);
+	if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
+		atomic_sub(i, &prog->aux->refcnt);
 		return ERR_PTR(-EBUSY);
 	}
 	return prog;
 }
+EXPORT_SYMBOL_GPL(bpf_prog_add);
+
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+	return bpf_prog_add(prog, 1);
+}
 
 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 {
-- 
cgit v1.3-8-gc7d7


From 6a773a15a1e8874e5eccd2f29190c31085912c95 Mon Sep 17 00:00:00 2001
From: Brenden Blanco <bblanco@plumgrid.com>
Date: Tue, 19 Jul 2016 12:16:47 -0700
Subject: bpf: add XDP prog type for early driver filter

Add a new bpf prog type that is intended to run in early stages of the
packet rx path. Only minimal packet metadata will be available, hence a
new context type, struct xdp_md, is exposed to userspace. So far only
expose the packet start and end pointers, and only in read mode.

An XDP program must return one of the well known enum values, all other
return codes are reserved for future use. Unfortunately, this
restriction is hard to enforce at verification time, so take the
approach of warning at runtime when such programs are encountered. Out
of bounds return codes should alias to XDP_ABORTED.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h   | 18 +++++++++++
 include/uapi/linux/bpf.h | 20 ++++++++++++
 kernel/bpf/verifier.c    |  1 +
 net/core/filter.c        | 79 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6fc31ef1da2d..15d816a8b755 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -368,6 +368,11 @@ struct bpf_skb_data_end {
 	void *data_end;
 };
 
+struct xdp_buff {
+	void *data;
+	void *data_end;
+};
+
 /* compute the linear packet data range [data, data_end) which
  * will be accessed by cls_bpf and act_bpf programs
  */
@@ -429,6 +434,18 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 	return BPF_PROG_RUN(prog, skb);
 }
 
+static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
+				   struct xdp_buff *xdp)
+{
+	u32 ret;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, (void *)xdp);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static inline unsigned int bpf_prog_size(unsigned int proglen)
 {
 	return max(sizeof(struct bpf_prog),
@@ -509,6 +526,7 @@ bool bpf_helper_changes_skb_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
+void bpf_warn_invalid_xdp_action(u32 act);
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c4d922439d20..a51786566c2f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -94,6 +94,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
 	BPF_PROG_TYPE_TRACEPOINT,
+	BPF_PROG_TYPE_XDP,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
@@ -439,4 +440,23 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* User return codes for XDP prog type.
+ * A valid XDP program must return one of these defined values. All other
+ * return codes are reserved for future use. Unknown return codes will result
+ * in packet drop.
+ */
+enum xdp_action {
+	XDP_ABORTED = 0,
+	XDP_DROP,
+	XDP_PASS,
+};
+
+/* user accessible metadata for XDP packet hook
+ * new fields must be added to the end of this structure
+ */
+struct xdp_md {
+	__u32 data;
+	__u32 data_end;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e206c2181412..a8d67d097b0d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -713,6 +713,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_XDP:
 		break;
 	default:
 		verbose("verifier is misconfigured\n");
diff --git a/net/core/filter.c b/net/core/filter.c
index 22e3992c8b48..6c627bc4be6e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2410,6 +2410,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+xdp_func_proto(enum bpf_func_id func_id)
+{
+	return sk_filter_func_proto(func_id);
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2477,6 +2483,44 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool __is_valid_xdp_access(int off, int size,
+				  enum bpf_access_type type)
+{
+	if (off < 0 || off >= sizeof(struct xdp_md))
+		return false;
+	if (off % size != 0)
+		return false;
+	if (size != 4)
+		return false;
+
+	return true;
+}
+
+static bool xdp_is_valid_access(int off, int size,
+				enum bpf_access_type type,
+				enum bpf_reg_type *reg_type)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case offsetof(struct xdp_md, data):
+		*reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct xdp_md, data_end):
+		*reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return __is_valid_xdp_access(off, size, type);
+}
+
+void bpf_warn_invalid_xdp_action(u32 act)
+{
+	WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+
 static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 				      int src_reg, int ctx_off,
 				      struct bpf_insn *insn_buf,
@@ -2628,6 +2672,29 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+				  int src_reg, int ctx_off,
+				  struct bpf_insn *insn_buf,
+				  struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct xdp_md, data):
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)),
+				      dst_reg, src_reg,
+				      offsetof(struct xdp_buff, data));
+		break;
+	case offsetof(struct xdp_md, data_end):
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)),
+				      dst_reg, src_reg,
+				      offsetof(struct xdp_buff, data_end));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static const struct bpf_verifier_ops sk_filter_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
@@ -2640,6 +2707,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
 	.convert_ctx_access	= bpf_net_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops xdp_ops = {
+	.get_func_proto		= xdp_func_proto,
+	.is_valid_access	= xdp_is_valid_access,
+	.convert_ctx_access	= xdp_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2655,11 +2728,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_SCHED_ACT,
 };
 
+static struct bpf_prog_type_list xdp_type __read_mostly = {
+	.ops	= &xdp_ops,
+	.type	= BPF_PROG_TYPE_XDP,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
 	bpf_register_prog_type(&sched_cls_type);
 	bpf_register_prog_type(&sched_act_type);
+	bpf_register_prog_type(&xdp_type);
 
 	return 0;
 }
-- 
cgit v1.3-8-gc7d7


From 4acf6c0b84c91243c705303cd9ff16421914150d Mon Sep 17 00:00:00 2001
From: Brenden Blanco <bblanco@plumgrid.com>
Date: Tue, 19 Jul 2016 12:16:56 -0700
Subject: bpf: enable direct packet data write for xdp progs

For forwarding to be effective, XDP programs should be allowed to
rewrite packet data.

This requires that the drivers supporting XDP must all map the packet
memory as TODEVICE or BIDIRECTIONAL before invoking the program.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8d67d097b0d..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
 
 #define MAX_PACKET_OFF 0xffff
 
+static bool may_write_pkt_data(enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_XDP:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 			       int size)
 {
@@ -806,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE) {
+		if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose("R%d leaks addr into packet\n", value_regno);
+			return -EACCES;
+		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
-- 
cgit v1.3-8-gc7d7


From 43761473c254b45883a64441dd0bc85a42f3645c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 19 Jul 2016 17:42:57 -0400
Subject: audit: fix a double fetch in audit_log_single_execve_arg()

There is a double fetch problem in audit_log_single_execve_arg()
where we first check the execve(2) argumnets for any "bad" characters
which would require hex encoding and then re-fetch the arguments for
logging in the audit record[1].  Of course this leaves a window of
opportunity for an unsavory application to munge with the data.

This patch reworks things by only fetching the argument data once[2]
into a buffer where it is scanned and logged into the audit
records(s).  In addition to fixing the double fetch, this patch
improves on the original code in a few other ways: better handling
of large arguments which require encoding, stricter record length
checking, and some performance improvements (completely unverified,
but we got rid of some strlen() calls, that's got to be a good
thing).

As part of the development of this patch, I've also created a basic
regression test for the audit-testsuite, the test can be tracked on
GitHub at the following link:

 * https://github.com/linux-audit/audit-testsuite/issues/25

[1] If you pay careful attention, there is actually a triple fetch
problem due to a strnlen_user() call at the top of the function.

[2] This is a tiny white lie, we do make a call to strnlen_user()
prior to fetching the argument data.  I don't like it, but due to the
way the audit record is structured we really have no choice unless we
copy the entire argument at once (which would require a rather
wasteful allocation).  The good news is that with this patch the
kernel no longer relies on this strnlen_user() value for anything
beyond recording it in the log, we also update it with a trustworthy
value whenever possible.

Reported-by: Pengfei Wang <wpengfeinudt@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 332 +++++++++++++++++++++++++++----------------------------
 1 file changed, 164 insertions(+), 168 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index aa3feec4df14..c65af21a12d6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
 #include <linux/compat.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
+#include <linux/uaccess.h>
 #include <uapi/linux/limits.h>
 
 #include "audit.h"
@@ -82,7 +83,8 @@
 #define AUDITSC_SUCCESS 1
 #define AUDITSC_FAILURE 2
 
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
 #define MAX_EXECVE_AUDIT_LEN 7500
 
 /* max length to print of cmdline/proctitle value during audit */
@@ -992,184 +994,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
 	return rc;
 }
 
-/*
- * to_send and len_sent accounting are very loose estimates.  We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf?  an int is up to 12 digits long.  if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message.  In one 7500 byte message we can log up to
- * about 1000 min size arguments.  That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
-					struct audit_buffer **ab,
-					int arg_num,
-					size_t *len_sent,
-					const char __user *p,
-					char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+				  struct audit_buffer **ab)
 {
-	char arg_num_len_buf[12];
-	const char __user *tmp_p = p;
-	/* how many digits are in arg_num? 5 is the length of ' a=""' */
-	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
-	size_t len, len_left, to_send;
-	size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
-	unsigned int i, has_cntl = 0, too_long = 0;
-	int ret;
-
-	/* strnlen_user includes the null we don't want to send */
-	len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
-	/*
-	 * We just created this mm, if we can't find the strings
-	 * we just copied into it something is _very_ wrong. Similar
-	 * for strings that are too long, we should not have created
-	 * any.
-	 */
-	if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
-		send_sig(SIGKILL, current, 0);
-		return -1;
+	long len_max;
+	long len_rem;
+	long len_full;
+	long len_buf;
+	long len_abuf;
+	long len_tmp;
+	bool require_data;
+	bool encode;
+	unsigned int iter;
+	unsigned int arg;
+	char *buf_head;
+	char *buf;
+	const char __user *p = (const char __user *)current->mm->arg_start;
+
+	/* NOTE: this buffer needs to be large enough to hold all the non-arg
+	 *       data we put in the audit record for this argument (see the
+	 *       code below) ... at this point in time 96 is plenty */
+	char abuf[96];
+
+	/* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+	 *       current value of 7500 is not as important as the fact that it
+	 *       is less than 8k, a setting of 7500 gives us plenty of wiggle
+	 *       room if we go over a little bit in the logging below */
+	WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+	len_max = MAX_EXECVE_AUDIT_LEN;
+
+	/* scratch buffer to hold the userspace args */
+	buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+	if (!buf_head) {
+		audit_panic("out of memory for argv string");
+		return;
 	}
+	buf = buf_head;
 
-	/* walk the whole argument looking for non-ascii chars */
+	audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+	len_rem = len_max;
+	len_buf = 0;
+	len_full = 0;
+	require_data = true;
+	encode = false;
+	iter = 0;
+	arg = 0;
 	do {
-		if (len_left > MAX_EXECVE_AUDIT_LEN)
-			to_send = MAX_EXECVE_AUDIT_LEN;
-		else
-			to_send = len_left;
-		ret = copy_from_user(buf, tmp_p, to_send);
-		/*
-		 * There is no reason for this copy to be short. We just
-		 * copied them here, and the mm hasn't been exposed to user-
-		 * space yet.
-		 */
-		if (ret) {
-			WARN_ON(1);
-			send_sig(SIGKILL, current, 0);
-			return -1;
-		}
-		buf[to_send] = '\0';
-		has_cntl = audit_string_contains_control(buf, to_send);
-		if (has_cntl) {
-			/*
-			 * hex messages get logged as 2 bytes, so we can only
-			 * send half as much in each message
-			 */
-			max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
-			break;
-		}
-		len_left -= to_send;
-		tmp_p += to_send;
-	} while (len_left > 0);
-
-	len_left = len;
-
-	if (len > max_execve_audit_len)
-		too_long = 1;
-
-	/* rewalk the argument actually logging the message */
-	for (i = 0; len_left > 0; i++) {
-		int room_left;
-
-		if (len_left > max_execve_audit_len)
-			to_send = max_execve_audit_len;
-		else
-			to_send = len_left;
-
-		/* do we have space left to send this argument in this ab? */
-		room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
-		if (has_cntl)
-			room_left -= (to_send * 2);
-		else
-			room_left -= to_send;
-		if (room_left < 0) {
-			*len_sent = 0;
-			audit_log_end(*ab);
-			*ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
-			if (!*ab)
-				return 0;
-		}
+		/* NOTE: we don't ever want to trust this value for anything
+		 *       serious, but the audit record format insists we
+		 *       provide an argument length for really long arguments,
+		 *       e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+		 *       to use strncpy_from_user() to obtain this value for
+		 *       recording in the log, although we don't use it
+		 *       anywhere here to avoid a double-fetch problem */
+		if (len_full == 0)
+			len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+		/* read more data from userspace */
+		if (require_data) {
+			/* can we make more room in the buffer? */
+			if (buf != buf_head) {
+				memmove(buf_head, buf, len_buf);
+				buf = buf_head;
+			}
+
+			/* fetch as much as we can of the argument */
+			len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+						    len_max - len_buf);
+			if (len_tmp == -EFAULT) {
+				/* unable to copy from userspace */
+				send_sig(SIGKILL, current, 0);
+				goto out;
+			} else if (len_tmp == (len_max - len_buf)) {
+				/* buffer is not large enough */
+				require_data = true;
+				/* NOTE: if we are going to span multiple
+				 *       buffers force the encoding so we stand
+				 *       a chance at a sane len_full value and
+				 *       consistent record encoding */
+				encode = true;
+				len_full = len_full * 2;
+				p += len_tmp;
+			} else {
+				require_data = false;
+				if (!encode)
+					encode = audit_string_contains_control(
+								buf, len_tmp);
+				/* try to use a trusted value for len_full */
+				if (len_full < len_max)
+					len_full = (encode ?
+						    len_tmp * 2 : len_tmp);
+				p += len_tmp + 1;
+			}
+			len_buf += len_tmp;
+			buf_head[len_buf] = '\0';
 
-		/*
-		 * first record needs to say how long the original string was
-		 * so we can be sure nothing was lost.
-		 */
-		if ((i == 0) && (too_long))
-			audit_log_format(*ab, " a%d_len=%zu", arg_num,
-					 has_cntl ? 2*len : len);
-
-		/*
-		 * normally arguments are small enough to fit and we already
-		 * filled buf above when we checked for control characters
-		 * so don't bother with another copy_from_user
-		 */
-		if (len >= max_execve_audit_len)
-			ret = copy_from_user(buf, p, to_send);
-		else
-			ret = 0;
-		if (ret) {
-			WARN_ON(1);
-			send_sig(SIGKILL, current, 0);
-			return -1;
+			/* length of the buffer in the audit record? */
+			len_abuf = (encode ? len_buf * 2 : len_buf + 2);
 		}
-		buf[to_send] = '\0';
-
-		/* actually log it */
-		audit_log_format(*ab, " a%d", arg_num);
-		if (too_long)
-			audit_log_format(*ab, "[%d]", i);
-		audit_log_format(*ab, "=");
-		if (has_cntl)
-			audit_log_n_hex(*ab, buf, to_send);
-		else
-			audit_log_string(*ab, buf);
-
-		p += to_send;
-		len_left -= to_send;
-		*len_sent += arg_num_len;
-		if (has_cntl)
-			*len_sent += to_send * 2;
-		else
-			*len_sent += to_send;
-	}
-	/* include the null we didn't log */
-	return len + 1;
-}
 
-static void audit_log_execve_info(struct audit_context *context,
-				  struct audit_buffer **ab)
-{
-	int i, len;
-	size_t len_sent = 0;
-	const char __user *p;
-	char *buf;
+		/* write as much as we can to the audit log */
+		if (len_buf > 0) {
+			/* NOTE: some magic numbers here - basically if we
+			 *       can't fit a reasonable amount of data into the
+			 *       existing audit buffer, flush it and start with
+			 *       a new buffer */
+			if ((sizeof(abuf) + 8) > len_rem) {
+				len_rem = len_max;
+				audit_log_end(*ab);
+				*ab = audit_log_start(context,
+						      GFP_KERNEL, AUDIT_EXECVE);
+				if (!*ab)
+					goto out;
+			}
 
-	p = (const char __user *)current->mm->arg_start;
+			/* create the non-arg portion of the arg record */
+			len_tmp = 0;
+			if (require_data || (iter > 0) ||
+			    ((len_abuf + sizeof(abuf)) > len_rem)) {
+				if (iter == 0) {
+					len_tmp += snprintf(&abuf[len_tmp],
+							sizeof(abuf) - len_tmp,
+							" a%d_len=%lu",
+							arg, len_full);
+				}
+				len_tmp += snprintf(&abuf[len_tmp],
+						    sizeof(abuf) - len_tmp,
+						    " a%d[%d]=", arg, iter++);
+			} else
+				len_tmp += snprintf(&abuf[len_tmp],
+						    sizeof(abuf) - len_tmp,
+						    " a%d=", arg);
+			WARN_ON(len_tmp >= sizeof(abuf));
+			abuf[sizeof(abuf) - 1] = '\0';
+
+			/* log the arg in the audit record */
+			audit_log_format(*ab, "%s", abuf);
+			len_rem -= len_tmp;
+			len_tmp = len_buf;
+			if (encode) {
+				if (len_abuf > len_rem)
+					len_tmp = len_rem / 2; /* encoding */
+				audit_log_n_hex(*ab, buf, len_tmp);
+				len_rem -= len_tmp * 2;
+				len_abuf -= len_tmp * 2;
+			} else {
+				if (len_abuf > len_rem)
+					len_tmp = len_rem - 2; /* quotes */
+				audit_log_n_string(*ab, buf, len_tmp);
+				len_rem -= len_tmp + 2;
+				/* don't subtract the "2" because we still need
+				 * to add quotes to the remaining string */
+				len_abuf -= len_tmp;
+			}
+			len_buf -= len_tmp;
+			buf += len_tmp;
+		}
 
-	audit_log_format(*ab, "argc=%d", context->execve.argc);
+		/* ready to move to the next argument? */
+		if ((len_buf == 0) && !require_data) {
+			arg++;
+			iter = 0;
+			len_full = 0;
+			require_data = true;
+			encode = false;
+		}
+	} while (arg < context->execve.argc);
 
-	/*
-	 * we need some kernel buffer to hold the userspace args.  Just
-	 * allocate one big one rather than allocating one of the right size
-	 * for every single argument inside audit_log_single_execve_arg()
-	 * should be <8k allocation so should be pretty safe.
-	 */
-	buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
-	if (!buf) {
-		audit_panic("out of memory for argv string");
-		return;
-	}
+	/* NOTE: the caller handles the final audit_log_end() call */
 
-	for (i = 0; i < context->execve.argc; i++) {
-		len = audit_log_single_execve_arg(context, ab, i,
-						  &len_sent, p, buf);
-		if (len <= 0)
-			break;
-		p += len;
-	}
-	kfree(buf);
+out:
+	kfree(buf_head);
 }
 
 static void show_special(struct audit_context *context, int *call_panic)
-- 
cgit v1.3-8-gc7d7


From 5cbea46984d67f614c74c4401b54b9d681861e80 Mon Sep 17 00:00:00 2001
From: Steve Muckle <steve.muckle@linaro.org>
Date: Wed, 13 Jul 2016 13:25:26 -0700
Subject: cpufreq: schedutil: map raw required frequency to driver frequency

The slow-path frequency transition path is relatively expensive as it
requires waking up a thread to do work. Should support be added for
remote CPU cpufreq updates that is also expensive since it requires an
IPI. These activities should be avoided if they are not necessary.

To that end, calculate the actual driver-supported frequency required by
the new utilization value in schedutil by using the recently added
cpufreq_driver_resolve_freq API. If it is the same as the previously
requested driver frequency then there is no need to continue with the
update assuming the cpu frequency limits have not changed. This will
have additional benefits should the semantics of the rate limit be
changed to apply solely to frequency transitions rather than to
frequency calculations in schedutil.

The last raw required frequency is cached. This allows the driver
frequency lookup to be skipped in the event that the new raw required
frequency matches the last one, assuming a frequency update has not been
forced due to limits changing (indicated by a next_freq value of
UINT_MAX, see sugov_should_update_freq).

Signed-off-by: Steve Muckle <smuckle@linaro.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 758efd7f3abe..a84641b222c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,6 +47,8 @@ struct sugov_cpu {
 	struct update_util_data update_util;
 	struct sugov_policy *sg_policy;
 
+	unsigned int cached_raw_freq;
+
 	/* The fields below are only needed when sharing a policy. */
 	unsigned long util;
 	unsigned long max;
@@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
 
 /**
  * get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @policy: cpufreq policy object to compute the new frequency for.
+ * @sg_cpu: schedutil cpu object to compute the new frequency for.
  * @util: Current CPU utilization.
  * @max: CPU capacity.
  *
@@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
  * next_freq = C * curr_freq * util_raw / max
  *
  * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
  */
-static unsigned int get_next_freq(struct cpufreq_policy *policy,
-				  unsigned long util, unsigned long max)
+static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
+				  unsigned long max)
 {
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	struct cpufreq_policy *policy = sg_policy->policy;
 	unsigned int freq = arch_scale_freq_invariant() ?
 				policy->cpuinfo.max_freq : policy->cur;
 
-	return (freq + (freq >> 2)) * util / max;
+	freq = (freq + (freq >> 2)) * util / max;
+
+	if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+		return sg_policy->next_freq;
+	sg_cpu->cached_raw_freq = freq;
+	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
 static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 		return;
 
 	next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
-			get_next_freq(policy, util, max);
+			get_next_freq(sg_cpu, util, max);
 	sugov_update_commit(sg_policy, time, next_f);
 }
 
-static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
 					   unsigned long util, unsigned long max)
 {
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	struct cpufreq_policy *policy = sg_policy->policy;
 	unsigned int max_f = policy->cpuinfo.max_freq;
 	u64 last_freq_update_time = sg_policy->last_freq_update_time;
@@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
 		}
 	}
 
-	return get_next_freq(policy, util, max);
+	return get_next_freq(sg_cpu, util, max);
 }
 
 static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 	sg_cpu->last_update = time;
 
 	if (sugov_should_update_freq(sg_policy, time)) {
-		next_f = sugov_next_freq_shared(sg_policy, util, max);
+		next_f = sugov_next_freq_shared(sg_cpu, util, max);
 		sugov_update_commit(sg_policy, time, next_f);
 	}
 
@@ -433,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 			sg_cpu->util = ULONG_MAX;
 			sg_cpu->max = 0;
 			sg_cpu->last_update = 0;
+			sg_cpu->cached_raw_freq = 0;
 			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
 						     sugov_update_shared);
 		} else {
-- 
cgit v1.3-8-gc7d7


From fe12c00d21bb4985fa8da282942250be21e7dd59 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Fri, 22 Jul 2016 10:30:47 +0800
Subject: PM / hibernate: Introduce test_resume mode for hibernation

test_resume mode is to verify if the snapshot data
written to swap device can be successfully restored
to memory. It is useful to ease the debugging process
on hibernation, since this mode can not only bypass
the BIOSes/bootloader, but also the system re-initialization.

To avoid the risk to break the filesystm on persistent storage,
this patch resumes the image with tasks frozen.

For example:
echo test_resume > /sys/power/disk
echo disk > /sys/power/state

[  187.306470] PM: Image saving progress:  70%
[  187.395298] PM: Image saving progress:  80%
[  187.476697] PM: Image saving progress:  90%
[  187.554641] PM: Image saving done.
[  187.558896] PM: Wrote 594600 kbytes in 0.90 seconds (660.66 MB/s)
[  187.566000] PM: S|
[  187.589742] PM: Basic memory bitmaps freed
[  187.594694] PM: Checking hibernation image
[  187.599865] PM: Image signature found, resuming
[  187.605209] PM: Loading hibernation image.
[  187.665753] PM: Basic memory bitmaps created
[  187.691397] PM: Using 3 thread(s) for decompression.
[  187.691397] PM: Loading and decompressing image data (148650 pages)...
[  187.889719] PM: Image loading progress:   0%
[  188.100452] PM: Image loading progress:  10%
[  188.244781] PM: Image loading progress:  20%
[  189.057305] PM: Image loading done.
[  189.068793] PM: Image successfully loaded

Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 65 ++++++++++++++++++++++++++++++++----------------
 kernel/power/swap.c      |  6 +++++
 2 files changed, 50 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 5f3523e18e46..0ee1df0a0bd6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -52,6 +52,7 @@ enum {
 #ifdef CONFIG_SUSPEND
 	HIBERNATION_SUSPEND,
 #endif
+	HIBERNATION_TEST_RESUME,
 	/* keep last */
 	__HIBERNATION_AFTER_LAST
 };
@@ -647,12 +648,39 @@ static void power_down(void)
 		cpu_relax();
 }
 
+static int load_image_and_restore(void)
+{
+	int error;
+	unsigned int flags;
+
+	pr_debug("PM: Loading hibernation image.\n");
+
+	lock_device_hotplug();
+	error = create_basic_memory_bitmaps();
+	if (error)
+		goto Unlock;
+
+	error = swsusp_read(&flags);
+	swsusp_close(FMODE_READ);
+	if (!error)
+		hibernation_restore(flags & SF_PLATFORM_MODE);
+
+	printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
+	swsusp_free();
+	free_basic_memory_bitmaps();
+ Unlock:
+	unlock_device_hotplug();
+
+	return error;
+}
+
 /**
  * hibernate - Carry out system hibernation, including saving the image.
  */
 int hibernate(void)
 {
 	int error, nr_calls = 0;
+	bool snapshot_test = false;
 
 	if (!hibernation_available()) {
 		pr_debug("PM: Hibernation not available.\n");
@@ -704,8 +732,12 @@ int hibernate(void)
 		pr_debug("PM: writing image.\n");
 		error = swsusp_write(flags);
 		swsusp_free();
-		if (!error)
-			power_down();
+		if (!error) {
+			if (hibernation_mode == HIBERNATION_TEST_RESUME)
+				snapshot_test = true;
+			else
+				power_down();
+		}
 		in_suspend = 0;
 		pm_restore_gfp_mask();
 	} else {
@@ -716,6 +748,12 @@ int hibernate(void)
 	free_basic_memory_bitmaps();
  Thaw:
 	unlock_device_hotplug();
+	if (snapshot_test) {
+		pr_debug("PM: Checking hibernation image\n");
+		error = swsusp_check();
+		if (!error)
+			error = load_image_and_restore();
+	}
 	thaw_processes();
 
 	/* Don't bother checking whether freezer_test_done is true */
@@ -748,7 +786,6 @@ int hibernate(void)
 static int software_resume(void)
 {
 	int error, nr_calls = 0;
-	unsigned int flags;
 
 	/*
 	 * If the user said "noresume".. bail out early.
@@ -844,24 +881,7 @@ static int software_resume(void)
 	error = freeze_processes();
 	if (error)
 		goto Close_Finish;
-
-	pr_debug("PM: Loading hibernation image.\n");
-
-	lock_device_hotplug();
-	error = create_basic_memory_bitmaps();
-	if (error)
-		goto Thaw;
-
-	error = swsusp_read(&flags);
-	swsusp_close(FMODE_READ);
-	if (!error)
-		hibernation_restore(flags & SF_PLATFORM_MODE);
-
-	printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
-	swsusp_free();
-	free_basic_memory_bitmaps();
- Thaw:
-	unlock_device_hotplug();
+	error = load_image_and_restore();
 	thaw_processes();
  Finish:
 	__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
@@ -887,6 +907,7 @@ static const char * const hibernation_modes[] = {
 #ifdef CONFIG_SUSPEND
 	[HIBERNATION_SUSPEND]	= "suspend",
 #endif
+	[HIBERNATION_TEST_RESUME]	= "test_resume",
 };
 
 /*
@@ -933,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
 #ifdef CONFIG_SUSPEND
 		case HIBERNATION_SUSPEND:
 #endif
+		case HIBERNATION_TEST_RESUME:
 			break;
 		case HIBERNATION_PLATFORM:
 			if (hibernation_ops)
@@ -979,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
 #ifdef CONFIG_SUSPEND
 		case HIBERNATION_SUSPEND:
 #endif
+		case HIBERNATION_TEST_RESUME:
 			hibernation_mode = mode;
 			break;
 		case HIBERNATION_PLATFORM:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e1006640d..51cef8432154 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -348,6 +348,12 @@ static int swsusp_swap_check(void)
 	if (res < 0)
 		blkdev_put(hib_resume_bdev, FMODE_WRITE);
 
+	/*
+	 * Update the resume device to the one actually used,
+	 * so the test_resume mode can use it in case it is
+	 * invoked from hibernate() to test the snapshot.
+	 */
+	swsusp_resume_device = hib_resume_bdev->bd_dev;
 	return res;
 }
 
-- 
cgit v1.3-8-gc7d7


From aa7145c16d6bf086538ad7eb20c807513bfa5efc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 22 Jul 2016 01:19:42 +0200
Subject: bpf, events: fix offset in skb copy handler

This patch fixes the __output_custom() routine we currently use with
bpf_skb_copy(). I missed that when len is larger than the size of the
current handle, we can issue multiple invocations of copy_func, and
__output_custom() advances destination but also source buffer by the
written amount of bytes. When we have __output_custom(), this is actually
wrong since in that case the source buffer points to a non-linear object,
in our case an skb, which the copy_func helper is supposed to walk.
Therefore, since this is non-linear we thus need to pass the offset into
the helper, so that copy_func can use it for extracting the data from
the source object.

Therefore, adjust the callback signatures properly and pass offset
into the skb_header_pointer() invoked from bpf_skb_copy() callback. The
__DEFINE_OUTPUT_COPY_BODY() is adjusted to accommodate for two things:
i) to pass in whether we should advance source buffer or not; this is
a compile-time constant condition, ii) to pass in the offset for
__output_custom(), which we do with help of __VA_ARGS__, so everything
can stay inlined as is currently. Both changes allow for adapting the
__output_* fast-path helpers w/o extra overhead.

Fixes: 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output")
Fixes: 7e3f977edd0b ("perf, events: add non-linear data support for raw records")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h        |  2 +-
 include/linux/perf_event.h |  2 +-
 kernel/events/internal.h   | 15 ++++++++++-----
 net/core/filter.c          |  4 ++--
 4 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 36da0749205a..11134238417d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -211,7 +211,7 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
 typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
-					unsigned long len);
+					unsigned long off, unsigned long len);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e79e6c6fed89..15e55b7ee096 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -70,7 +70,7 @@ struct perf_callchain_entry_ctx {
 };
 
 typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
-				     unsigned long len);
+				     unsigned long off, unsigned long len);
 
 struct perf_raw_frag {
 	union {
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2417eb5512cd..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,18 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 	return rb->aux_nr_pages << PAGE_SHIFT;
 }
 
-#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func)				\
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...)	\
 {									\
 	unsigned long size, written;					\
 									\
 	do {								\
 		size    = min(handle->size, len);			\
-		written = memcpy_func(handle->addr, buf, size);		\
+		written = memcpy_func(__VA_ARGS__);			\
 		written = size - written;				\
 									\
 		len -= written;						\
 		handle->addr += written;				\
-		buf += written;						\
+		if (advance_buf)					\
+			buf += written;					\
 		handle->size -= written;				\
 		if (!handle->size) {					\
 			struct ring_buffer *rb = handle->rb;		\
@@ -153,12 +154,16 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 static inline unsigned long						\
 func_name(struct perf_output_handle *handle,				\
 	  const void *buf, unsigned long len)				\
-__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
 
 static inline unsigned long
 __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
 		const void *buf, unsigned long len)
-__DEFINE_OUTPUT_COPY_BODY(copy_func)
+{
+	unsigned long orig_len = len;
+	__DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+				  orig_len - len, size)
+}
 
 static inline unsigned long
 memcpy_common(void *dst, const void *src, unsigned long n)
diff --git a/net/core/filter.c b/net/core/filter.c
index 0b521353008d..5708999f8a79 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2026,9 +2026,9 @@ bool bpf_helper_changes_skb_data(void *func)
 }
 
 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
-				  unsigned long len)
+				  unsigned long off, unsigned long len)
 {
-	void *ptr = skb_header_pointer(skb, 0, len, dst_buff);
+	void *ptr = skb_header_pointer(skb, off, len, dst_buff);
 
 	if (unlikely(!ptr))
 		return len;
-- 
cgit v1.3-8-gc7d7


From 96ae52279594470622ff0585621a13e96b700600 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Mon, 25 Jul 2016 05:54:46 -0700
Subject: bpf: Add bpf_probe_write_user BPF helper to be called in tracers

This allows user memory to be written to during the course of a kprobe.
It shouldn't be used to implement any kind of security mechanism
because of TOC-TOU attacks, but rather to debug, divert, and
manipulate execution of semi-cooperative processes.

Although it uses probe_kernel_write, we limit the address space
the probe can write into by checking the space with access_ok.
We do this as opposed to calling copy_to_user directly, in order
to avoid sleeping. In addition we ensure the threads's current fs
/ segment is USER_DS and the thread isn't exiting nor a kernel thread.

Given this feature is meant for experiments, and it has a risk of
crashing the system, and running programs, we print a warning on
when a proglet that attempts to use this helper is installed,
along with the pid and process name.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h  | 10 ++++++++++
 kernel/trace/bpf_trace.c  | 45 +++++++++++++++++++++++++++++++++++++++++++++
 samples/bpf/bpf_helpers.h |  2 ++
 3 files changed, 57 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2b7076f5b5ad..da218fec6056 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -365,6 +365,16 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_get_current_task,
 
+	/**
+	 * bpf_probe_write_user(void *dst, void *src, int len)
+	 * safely attempt to write to a location
+	 * @dst: destination address in userspace
+	 * @src: source address on stack
+	 * @len: number of bytes to copy
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_probe_write_user,
+
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a12bbd32c0a6..b20438fdb029 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *src = (void *) (long) r2;
+	int size = (int) r3;
+
+	/*
+	 * Ensure we're in user context which is safe for the helper to
+	 * run. This helper has no business in a kthread.
+	 *
+	 * access_ok() should prevent writing to non-user memory, but in
+	 * some situations (nommu, temporary switch, etc) access_ok() does
+	 * not provide enough validation, hence the check on KERNEL_DS.
+	 */
+
+	if (unlikely(in_interrupt() ||
+		     current->flags & (PF_KTHREAD | PF_EXITING)))
+		return -EPERM;
+	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+		return -EPERM;
+	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+		return -EPERM;
+
+	return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+	.func		= bpf_probe_write_user,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+			    current->comm, task_pid_nr(current));
+
+	return &bpf_probe_write_user_proto;
+}
+
 /*
  * limited trace_printk()
  * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -362,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	case BPF_FUNC_probe_write_user:
+		return bpf_get_probe_write_proto();
 	default:
 		return NULL;
 	}
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 84e3fd919a06..217c8d507f2e 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -41,6 +41,8 @@ static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data,
 	(void *) BPF_FUNC_perf_event_output;
 static int (*bpf_get_stackid)(void *ctx, void *map, int flags) =
 	(void *) BPF_FUNC_get_stackid;
+static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
+	(void *) BPF_FUNC_probe_write_user;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.3-8-gc7d7


From 4949148ad433f6f11cf837978b2907092ec99f3a Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Tue, 26 Jul 2016 15:24:24 -0700
Subject: mm: charge/uncharge kmemcg from generic page allocator paths

Currently, to charge a non-slab allocation to kmemcg one has to use
alloc_kmem_pages helper with __GFP_ACCOUNT flag.  A page allocated with
this helper should finally be freed using free_kmem_pages, otherwise it
won't be uncharged.

This API suits its current users fine, but it turns out to be impossible
to use along with page reference counting, i.e.  when an allocation is
supposed to be freed with put_page, as it is the case with pipe or unix
socket buffers.

To overcome this limitation, this patch moves charging/uncharging to
generic page allocator paths, i.e.  to __alloc_pages_nodemask and
free_pages_prepare, and zaps alloc/free_kmem_pages helpers.  This way,
one can use any of the available page allocation functions to get the
allocated page charged to kmemcg - it's enough to pass __GFP_ACCOUNT,
just like in case of kmalloc and friends.  A charged page will be
automatically uncharged on free.

To make it possible, we need to mark pages charged to kmemcg somehow.
To avoid introducing a new page flag, we make use of page->_mapcount for
marking such pages.  Since pages charged to kmemcg are not supposed to
be mapped to userspace, it should work just fine.  There are other
(ab)users of page->_mapcount - buddy and balloon pages - but we don't
conflict with them.

In case kmemcg is compiled out or not used at runtime, this patch
introduces no overhead to generic page allocator paths.  If kmemcg is
used, it will be plus one gfp flags check on alloc and plus one
page->_mapcount check on free, which shouldn't hurt performance, because
the data accessed are hot.

Link: http://lkml.kernel.org/r/a9736d856f895bcb465d9f257b54efe32eda6f99.1464079538.git.vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h        | 10 +------
 include/linux/page-flags.h |  7 +++++
 kernel/fork.c              |  6 ++---
 mm/page_alloc.c            | 66 +++++++++-------------------------------------
 mm/slab_common.c           |  2 +-
 mm/slub.c                  |  6 ++---
 mm/vmalloc.c               |  6 ++---
 7 files changed, 31 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 570383a41853..c29e9d347bc6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -78,8 +78,7 @@ struct vm_area_struct;
  * __GFP_THISNODE forces the allocation to be satisified from the requested
  *   node with no fallbacks or placement policy enforcements.
  *
- * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant
- *   to kmem allocations).
+ * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
  */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
@@ -486,10 +485,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 #define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
 	alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
 
-extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order);
-extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask,
-					  unsigned int order);
-
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
@@ -513,9 +508,6 @@ extern void *__alloc_page_frag(struct page_frag_cache *nc,
 			       unsigned int fragsz, gfp_t gfp_mask);
 extern void __free_page_frag(void *addr);
 
-extern void __free_kmem_pages(struct page *page, unsigned int order);
-extern void free_kmem_pages(unsigned long addr, unsigned int order);
-
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 96084ee74ee8..7c8e82ac2eb7 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -641,6 +641,13 @@ PAGE_MAPCOUNT_OPS(Buddy, BUDDY)
 #define PAGE_BALLOON_MAPCOUNT_VALUE		(-256)
 PAGE_MAPCOUNT_OPS(Balloon, BALLOON)
 
+/*
+ * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
+ * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
+ */
+#define PAGE_KMEMCG_MAPCOUNT_VALUE		(-512)
+PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG)
+
 extern bool is_free_buddy_page(struct page *page);
 
 __PAGEFLAG(Isolated, isolated, PF_ANY);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a7ec0c6c88c..de21f25e0d2c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -162,8 +162,8 @@ void __weak arch_release_thread_stack(unsigned long *stack)
 static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
-	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
-						  THREAD_SIZE_ORDER);
+	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+					     THREAD_SIZE_ORDER);
 
 	if (page)
 		memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
@@ -178,7 +178,7 @@ static inline void free_thread_stack(unsigned long *stack)
 
 	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
 				    -(1 << THREAD_SIZE_ORDER));
-	__free_kmem_pages(page, THREAD_SIZE_ORDER);
+	__free_pages(page, THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index de2491c42d4f..7023a31edc5c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -63,6 +63,7 @@
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
 #include <linux/kthread.h>
+#include <linux/memcontrol.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -1018,6 +1019,10 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	}
 	if (PageMappingFlags(page))
 		page->mapping = NULL;
+	if (memcg_kmem_enabled() && PageKmemcg(page)) {
+		memcg_kmem_uncharge(page, order);
+		__ClearPageKmemcg(page);
+	}
 	if (check_free)
 		bad += free_pages_check(page);
 	if (bad)
@@ -3841,6 +3846,14 @@ no_zone:
 	}
 
 out:
+	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) {
+		if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) {
+			__free_pages(page, order);
+			page = NULL;
+		} else
+			__SetPageKmemcg(page);
+	}
+
 	if (kmemcheck_enabled && page)
 		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
 
@@ -3996,59 +4009,6 @@ void __free_page_frag(void *addr)
 }
 EXPORT_SYMBOL(__free_page_frag);
 
-/*
- * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
- * equivalent to alloc_pages.
- *
- * It should be used when the caller would like to use kmalloc, but since the
- * allocation is large, it has to fall back to the page allocator.
- */
-struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
-{
-	struct page *page;
-
-	page = alloc_pages(gfp_mask, order);
-	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) &&
-	    page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
-		__free_pages(page, order);
-		page = NULL;
-	}
-	return page;
-}
-
-struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
-{
-	struct page *page;
-
-	page = alloc_pages_node(nid, gfp_mask, order);
-	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) &&
-	    page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
-		__free_pages(page, order);
-		page = NULL;
-	}
-	return page;
-}
-
-/*
- * __free_kmem_pages and free_kmem_pages will free pages allocated with
- * alloc_kmem_pages.
- */
-void __free_kmem_pages(struct page *page, unsigned int order)
-{
-	if (memcg_kmem_enabled())
-		memcg_kmem_uncharge(page, order);
-	__free_pages(page, order);
-}
-
-void free_kmem_pages(unsigned long addr, unsigned int order)
-{
-	if (addr != 0) {
-		VM_BUG_ON(!virt_addr_valid((void *)addr));
-		__free_kmem_pages(virt_to_page((void *)addr), order);
-	}
-}
-
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
 		size_t size)
 {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index da88c1588752..71f0b28a1bec 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1012,7 +1012,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	struct page *page;
 
 	flags |= __GFP_COMP;
-	page = alloc_kmem_pages(flags, order);
+	page = alloc_pages(flags, order);
 	ret = page ? page_address(page) : NULL;
 	kmemleak_alloc(ret, size, 1, flags);
 	kasan_kmalloc_large(ret, size, flags);
diff --git a/mm/slub.c b/mm/slub.c
index c0cfa2722539..f9da8716b8b3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2977,7 +2977,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
 		if (unlikely(!PageSlab(page))) {
 			BUG_ON(!PageCompound(page));
 			kfree_hook(object);
-			__free_kmem_pages(page, compound_order(page));
+			__free_pages(page, compound_order(page));
 			p[size] = NULL; /* mark object processed */
 			return size;
 		}
@@ -3693,7 +3693,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	void *ptr = NULL;
 
 	flags |= __GFP_COMP | __GFP_NOTRACK;
-	page = alloc_kmem_pages_node(node, flags, get_order(size));
+	page = alloc_pages_node(node, flags, get_order(size));
 	if (page)
 		ptr = page_address(page);
 
@@ -3774,7 +3774,7 @@ void kfree(const void *x)
 	if (unlikely(!PageSlab(page))) {
 		BUG_ON(!PageCompound(page));
 		kfree_hook(x);
-		__free_kmem_pages(page, compound_order(page));
+		__free_pages(page, compound_order(page));
 		return;
 	}
 	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e11475cdeb7a..91f44e78c516 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1501,7 +1501,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
-			__free_kmem_pages(page, 0);
+			__free_pages(page, 0);
 		}
 
 		kvfree(area->pages);
@@ -1629,9 +1629,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		struct page *page;
 
 		if (node == NUMA_NO_NODE)
-			page = alloc_kmem_pages(alloc_mask, order);
+			page = alloc_pages(alloc_mask, order);
 		else
-			page = alloc_kmem_pages_node(node, alloc_mask, order);
+			page = alloc_pages_node(node, alloc_mask, order);
 
 		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vunmap() */
-- 
cgit v1.3-8-gc7d7


From 1fe4d021acbc356723818a633fe0a10c59c2a4c1 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 26 Jul 2016 15:26:58 -0700
Subject: cgroup: fix idr leak for the first cgroup root

The valid cgroup hierarchy ID range includes 0, so we can't filter for
positive numbers when freeing it, or it'll leak the first ID.  No big
deal, just disruptive when reading the code.

The ID is freed during error handling and when the reference count hits
zero, so the double-free test is not necessary; remove it.

Link: http://lkml.kernel.org/r/20160617162359.GB19084@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff00aca6..3108150e47b1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1160,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 {
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (root->hierarchy_id) {
-		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
-		root->hierarchy_id = 0;
-	}
+	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 }
 
 static void cgroup_free_root(struct cgroup_root *root)
 {
 	if (root) {
-		/* hierarchy ID should already have been released */
-		WARN_ON_ONCE(root->hierarchy_id);
-
 		idr_destroy(&root->cgroup_idr);
 		kfree(root);
 	}
-- 
cgit v1.3-8-gc7d7


From cb773df88a737d7d7e05ca7ca516414d3fcdcab8 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 26 Jul 2016 15:27:01 -0700
Subject: cgroup: remove unnecessary 0 check from css_from_id()

css_idr allocation starts at 1, so index 0 will never point to an item.
css_from_id() currently filters that before asking idr_find(), but
idr_find() would also just return NULL, so this is not needed.

Link: http://lkml.kernel.org/r/20160617162427.GC19084@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3108150e47b1..fa943843a32f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6166,7 +6166,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
+	return idr_find(&ss->css_idr, id);
 }
 
 /**
-- 
cgit v1.3-8-gc7d7


From bf262dcec6383188a3324192c4a7e405b3b1ad23 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jikos@kernel.org>
Date: Tue, 12 Apr 2016 05:02:09 +0930
Subject: module: fix noreturn attribute for __module_put_and_exit()

__module_put_and_exit() is makred noreturn in module.h declaration, but is
lacking the attribute in the definition, which makes some tools (such as
sparse) unhappy. Amend the definition with the attribute as well (and
reformat the declaration so that it uses more common format).

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 4 ++--
 kernel/module.c        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 3daf2b3a09d2..f777164c238b 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -575,8 +575,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 					     struct module *, unsigned long),
 				   void *data);
 
-extern void __module_put_and_exit(struct module *mod, long code)
-	__attribute__((noreturn));
+extern void __noreturn __module_put_and_exit(struct module *mod,
+			long code);
 #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code)
 
 #ifdef CONFIG_MODULE_UNLOAD
diff --git a/kernel/module.c b/kernel/module.c
index 5f71aa63ed2a..5e876977844b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -336,7 +336,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
  * A thread that wants to hold a reference to a module only while it
  * is running can call this to safely exit.  nfsd and lockd use this.
  */
-void __module_put_and_exit(struct module *mod, long code)
+void __noreturn __module_put_and_exit(struct module *mod, long code)
 {
 	module_put(mod);
 	do_exit(code);
-- 
cgit v1.3-8-gc7d7


From c75b590d60ffa3e31bcb9608b68006a8bab9e0ed Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 12 Apr 2016 05:03:09 +0930
Subject: module: fix redundant test.

[linux-4.5-rc4/kernel/module.c:1692]: (style) Redundant condition: attr.test.
'!attr.test || (attr.test && attr.test(mod))' is equivalent to '!attr.test ||
attr.test(mod)'

This code was added like this ten years ago, in c988d2b284549
"modules: add version and srcversion to sysfs".

Reported-by: David Binderman <dcb314@hotmail.com>
Cc: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5e876977844b..9e04a4210a4a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1693,8 +1693,7 @@ static int module_add_modinfo_attrs(struct module *mod)
 
 	temp_attr = mod->modinfo_attrs;
 	for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
-		if (!attr->test ||
-		    (attr->test && attr->test(mod))) {
+		if (!attr->test || attr->test(mod)) {
 			memcpy(temp_attr, attr, sizeof(*temp_attr));
 			sysfs_attr_init(&temp_attr->attr);
 			error = sysfs_create_file(&mod->mkobj.kobj,
-- 
cgit v1.3-8-gc7d7


From 3205c36cf7d96024626f92d65f560035df1abcb2 Mon Sep 17 00:00:00 2001
From: Libor Pechacek <lpechacek@suse.com>
Date: Wed, 13 Apr 2016 11:06:12 +0930
Subject: module: Issue warnings when tainting kernel

While most of the locations where a kernel taint bit is set are accompanied
with a warning message, there are two which set their bits silently.  If
the tainting module gets unloaded later on, it is almost impossible to tell
what was the reason for setting the flag.

Signed-off-by: Libor Pechacek <lpechacek@suse.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 9e04a4210a4a..0b4f3a85d4fc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2919,8 +2919,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
 		return -ENOEXEC;
 	}
 
-	if (!get_modinfo(info, "intree"))
+	if (!get_modinfo(info, "intree")) {
+		if (!test_taint(TAINT_OOT_MODULE))
+			pr_warn("%s: loading out-of-tree module taints kernel.\n",
+				mod->name);
 		add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+	}
 
 	if (get_modinfo(info, "staging")) {
 		add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
@@ -3089,6 +3093,8 @@ static int move_module(struct module *mod, struct load_info *info)
 
 static int check_module_license_and_versions(struct module *mod)
 {
+	int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+
 	/*
 	 * ndiswrapper is under GPL by itself, but loads proprietary modules.
 	 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -3107,6 +3113,9 @@ static int check_module_license_and_versions(struct module *mod)
 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
 				 LOCKDEP_NOW_UNRELIABLE);
 
+	if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+		pr_warn("%s: module license taints kernel.\n", mod->name);
+
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
-- 
cgit v1.3-8-gc7d7


From bca014caaa6130e57f69b5bf527967aa8ee70fdd Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Thu, 28 Apr 2016 09:24:01 +0930
Subject: module: Invalidate signatures on force-loaded modules

Signing a module should only make it trusted by the specific kernel it
was built for, not anything else.  Loading a signed module meant for a
kernel with a different ABI could have interesting effects.
Therefore, treat all signatures as invalid when a module is
force-loaded.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 0b4f3a85d4fc..7f21ab238aa7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2686,13 +2686,18 @@ static inline void kmemleak_load_module(const struct module *mod,
 #endif
 
 #ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
 {
 	int err = -ENOKEY;
 	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
 	const void *mod = info->hdr;
 
-	if (info->len > markerlen &&
+	/*
+	 * Require flags == 0, as a module with version information
+	 * removed is no longer the module that was signed
+	 */
+	if (flags == 0 &&
+	    info->len > markerlen &&
 	    memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
 		/* We truncate the module to discard the signature */
 		info->len -= markerlen;
@@ -2711,7 +2716,7 @@ static int module_sig_check(struct load_info *info)
 	return err;
 }
 #else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
 {
 	return 0;
 }
@@ -3506,7 +3511,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	long err;
 	char *after_dashes;
 
-	err = module_sig_check(info);
+	err = module_sig_check(info, flags);
 	if (err)
 		goto free_copy;
 
-- 
cgit v1.3-8-gc7d7


From ce4f06dcbb5d6d04d202f1b81ac72d5679dcdfc0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 26 Jul 2016 20:57:36 +0200
Subject: stop_machine: Touch_nmi_watchdog() after MULTI_STOP_PREPARE

Suppose that stop_machine(fn) hangs because fn() hangs. In this case NMI
hard-lockup can be triggered on another CPU which does nothing wrong and
the trace from nmi_panic() won't help to investigate the problem.

And this change "fixes" the problem we (seem to) hit in practice.

 - stop_two_cpus(0, 1) races with show_state_filter() running on CPU_0.

 - CPU_1 already spins in MULTI_STOP_PREPARE state, it detects the soft
   lockup and tries to report the problem.

 - show_state_filter() enables preemption, CPU_0 calls multi_cpu_stop()
   which goes to MULTI_STOP_DISABLE_IRQ state and disables interrupts.

 - CPU_1 spends more than 10 seconds trying to flush the log buffer to
   the slow serial console.

 - NMI interrupt on CPU_0 (which now waits for CPU_1) calls nmi_panic().

Reported-by: Wang Shu <shuwang@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20160726185736.GB4088@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/stop_machine.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a467e6c28a3b..4a1ca5f6da7e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -21,6 +21,7 @@
 #include <linux/smpboot.h>
 #include <linux/atomic.h>
 #include <linux/lglock.h>
+#include <linux/nmi.h>
 
 /*
  * Structure to determine completion condition and record errors.  May
@@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data)
 				break;
 			}
 			ack_state(msdata);
+		} else if (curstate > MULTI_STOP_PREPARE) {
+			/*
+			 * At this stage all other CPUs we depend on must spin
+			 * in the same loop. Any reason for hard-lockup should
+			 * be detected and reported on their side.
+			 */
+			touch_nmi_watchdog();
 		}
 	} while (curstate != MULTI_STOP_EXIT);
 
-- 
cgit v1.3-8-gc7d7


From 4fae16dffb812f0e0d98a0b2b0856ca48ca63e6c Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Wed, 27 Jul 2016 11:08:18 +0200
Subject: timers/core: Correct callback order during CPU hot plug

On the tear-down path, the dead CPU callback for the timers was
misplaced within the 'cpuhp_state' enumeration.  There is a hidden
dependency between the timers and block multiqueue.  The timers
callback must happen before the block multiqueue callback otherwise a
RCU stall occurs.

Move the timers callback to the proper place in the state machine.

Reported-and-tested-by: Jon Hunter <jonathanh@nvidia.com>
Reported-by: kbuild test robot <lkp@intel.com>
Fixes: 24f73b99716a ("timers/core: Convert to hotplug state machine")
Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: John Stultz <john.stultz@linaro.org>
Cc: rt@linutronix.de
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1469610498-25914-1-git-send-email-rcochran@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpuhotplug.h |  2 +-
 kernel/cpu.c               | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 6d405db6fb12..242bf530edfc 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -20,9 +20,9 @@ enum cpuhp_state {
 	CPUHP_PROFILE_PREPARE,
 	CPUHP_X2APIC_PREPARE,
 	CPUHP_SMPCFD_PREPARE,
-	CPUHP_TIMERS_DEAD,
 	CPUHP_RCUTREE_PREP,
 	CPUHP_NOTIFY_PREPARE,
+	CPUHP_TIMERS_DEAD,
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f24f45915b54..341bf80f80bd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1200,11 +1200,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = smpcfd_prepare_cpu,
 		.teardown = smpcfd_dead_cpu,
 	},
-	[CPUHP_TIMERS_DEAD] = {
-		.name = "timers dead",
-		.startup = NULL,
-		.teardown = timers_dead_cpu,
-	},
 	[CPUHP_RCUTREE_PREP] = {
 		.name = "RCU-tree prepare",
 		.startup = rcutree_prepare_cpu,
@@ -1221,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.skip_onerr		= true,
 		.cant_stop		= true,
 	},
+	/*
+	 * On the tear-down path, timers_dead_cpu() must be invoked
+	 * before blk_mq_queue_reinit_notify() from notify_dead(),
+	 * otherwise a RCU stall occurs.
+	 */
+	[CPUHP_TIMERS_DEAD] = {
+		.name = "timers dead",
+		.startup = NULL,
+		.teardown = timers_dead_cpu,
+	},
 	/* Kicks the plugged cpu into life */
 	[CPUHP_BRINGUP_CPU] = {
 		.name			= "cpu:bringup",
-- 
cgit v1.3-8-gc7d7


From a34c80a7294e34ba213c285dff38b1137745f94b Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 28 Jul 2016 15:45:16 -0700
Subject: freezer, oom: check TIF_MEMDIE on the correct task

freezing_slow_path() is checking TIF_MEMDIE to skip OOM killed tasks.
It is, however, checking the flag on the current task rather than the
given one.  This is really confusing because freezing() can be called
also on !current tasks.  It would end up working correctly for its main
purpose because __refrigerator will be always called on the current task
so the oom victim will never get frozen.  But it could lead to
surprising results when a task which is freezing a cgroup got oom killed
because only part of the cgroup would get frozen.  This is highly
unlikely but worth fixing as the resulting code would be more clear
anyway.

Link: http://lkml.kernel.org/r/1467029719-17602-2-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/freezer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..6f56a9e219fa 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
 	if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
 		return false;
 
-	if (test_thread_flag(TIF_MEMDIE))
+	if (test_tsk_thread_flag(p, TIF_MEMDIE))
 		return false;
 
 	if (pm_nosig_freezing || cgroup_freezing(p))
-- 
cgit v1.3-8-gc7d7


From fec1e5f987bfc41f9f08cbd206e7302e6ac2ab0c Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 28 Jul 2016 15:45:19 -0700
Subject: cpuset, mm: fix TIF_MEMDIE check in cpuset_change_task_nodemask

Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when
changing cpuset's mems") has added TIF_MEMDIE and PF_EXITING check but
it is checking the flag on the current task rather than the given one.

This doesn't make much sense and it is actually wrong.  If the current
task which updates the nodemask of a cpuset got killed by the OOM killer
then a part of the cpuset cgroup processes would have incompatible
nodemask which is surprising to say the least.

The comment suggests the intention was to skip oom victim or an exiting
task so we should be checking the given task.  But even then it would be
layering violation because it is the memory allocator to interpret the
TIF_MEMDIE meaning.  Simply drop both checks.  All tasks in the cpuset
should simply follow the same mask.

Link: http://lkml.kernel.org/r/1467029719-17602-3-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 73e93e53884d..c7fd2778ed50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 {
 	bool need_loop;
 
-	/*
-	 * Allow tasks that have access to memory reserves because they have
-	 * been OOM killed to get memory anywhere.
-	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return;
-	if (current->flags & PF_EXITING) /* Let dying task have memory */
-		return;
-
 	task_lock(tsk);
 	/*
 	 * Determine if a loop is necessary if another thread is doing
-- 
cgit v1.3-8-gc7d7


From 599d0c954f91d0689c9bb421b5bc04ea02437a41 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 28 Jul 2016 15:45:31 -0700
Subject: mm, vmscan: move LRU lists to node

This moves the LRU lists from the zone to the node and related data such
as counters, tracing, congestion tracking and writeback tracking.

Unfortunately, due to reclaim and compaction retry logic, it is
necessary to account for the number of LRU pages on both zone and node
logic.  Most reclaim logic is based on the node counters but the retry
logic uses the zone counters which do not distinguish inactive and
active sizes.  It would be possible to leave the LRU counters on a
per-zone basis but it's a heavier calculation across multiple cache
lines that is much more frequent than the retry checks.

Other than the LRU counters, this is mostly a mechanical patch but note
that it introduces a number of anomalies.  For example, the scans are
per-zone but using per-node counters.  We also mark a node as congested
when a zone is congested.  This causes weird problems that are fixed
later but is easier to review.

In the event that there is excessive overhead on 32-bit systems due to
the nodes being on LRU then there are two potential solutions

1. Long-term isolation of highmem pages when reclaim is lowmem

   When pages are skipped, they are immediately added back onto the LRU
   list. If lowmem reclaim persisted for long periods of time, the same
   highmem pages get continually scanned. The idea would be that lowmem
   keeps those pages on a separate list until a reclaim for highmem pages
   arrives that splices the highmem pages back onto the LRU. It potentially
   could be implemented similar to the UNEVICTABLE list.

   That would reduce the skip rate with the potential corner case is that
   highmem pages have to be scanned and reclaimed to free lowmem slab pages.

2. Linear scan lowmem pages if the initial LRU shrink fails

   This will break LRU ordering but may be preferable and faster during
   memory pressure than skipping LRU pages.

Link: http://lkml.kernel.org/r/1467970510-21195-4-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/tile/mm/pgtable.c                    |   8 +-
 drivers/base/node.c                       |  19 +--
 drivers/staging/android/lowmemorykiller.c |   8 +-
 include/linux/backing-dev.h               |   2 +-
 include/linux/memcontrol.h                |  18 +--
 include/linux/mm_inline.h                 |  21 ++-
 include/linux/mmzone.h                    |  68 +++++----
 include/linux/swap.h                      |   1 +
 include/linux/vm_event_item.h             |  10 +-
 include/linux/vmstat.h                    |  17 +++
 include/trace/events/vmscan.h             |  12 +-
 kernel/power/snapshot.c                   |  10 +-
 mm/backing-dev.c                          |  15 +-
 mm/compaction.c                           |  18 +--
 mm/huge_memory.c                          |   2 +-
 mm/internal.h                             |   2 +-
 mm/khugepaged.c                           |   4 +-
 mm/memcontrol.c                           |  17 +--
 mm/memory-failure.c                       |   4 +-
 mm/memory_hotplug.c                       |   2 +-
 mm/mempolicy.c                            |   2 +-
 mm/migrate.c                              |  21 +--
 mm/mlock.c                                |   2 +-
 mm/page-writeback.c                       |   8 +-
 mm/page_alloc.c                           |  68 +++++----
 mm/swap.c                                 |  50 +++----
 mm/vmscan.c                               | 226 +++++++++++++++++-------------
 mm/vmstat.c                               |  47 ++++---
 mm/workingset.c                           |   4 +-
 29 files changed, 386 insertions(+), 300 deletions(-)

(limited to 'kernel')

diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index c4d5bf841a7f..9e389213580d 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -45,10 +45,10 @@ void show_mem(unsigned int filter)
 	struct zone *zone;
 
 	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n",
-	       (global_page_state(NR_ACTIVE_ANON) +
-		global_page_state(NR_ACTIVE_FILE)),
-	       (global_page_state(NR_INACTIVE_ANON) +
-		global_page_state(NR_INACTIVE_FILE)),
+	       (global_node_page_state(NR_ACTIVE_ANON) +
+		global_node_page_state(NR_ACTIVE_FILE)),
+	       (global_node_page_state(NR_INACTIVE_ANON) +
+		global_node_page_state(NR_INACTIVE_FILE)),
 	       global_page_state(NR_FILE_DIRTY),
 	       global_page_state(NR_WRITEBACK),
 	       global_page_state(NR_UNSTABLE_NFS),
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0a1b6433a76c..d4698f096209 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -56,6 +56,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 {
 	int n;
 	int nid = dev->id;
+	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct sysinfo i;
 
 	si_meminfo_node(&i, nid);
@@ -74,15 +75,15 @@ static ssize_t node_read_meminfo(struct device *dev,
 		       nid, K(i.totalram),
 		       nid, K(i.freeram),
 		       nid, K(i.totalram - i.freeram),
-		       nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_ANON) +
-				sum_zone_node_page_state(nid, NR_ACTIVE_FILE)),
-		       nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_ANON) +
-				sum_zone_node_page_state(nid, NR_INACTIVE_FILE)),
-		       nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_ANON)),
-		       nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_ANON)),
-		       nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_FILE)),
-		       nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_FILE)),
-		       nid, K(sum_zone_node_page_state(nid, NR_UNEVICTABLE)),
+		       nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
+				node_page_state(pgdat, NR_ACTIVE_FILE)),
+		       nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
+				node_page_state(pgdat, NR_INACTIVE_FILE)),
+		       nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+		       nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+		       nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+		       nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+		       nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
 		       nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));
 
 #ifdef CONFIG_HIGHMEM
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index 24d2745e9437..93dbcc38eb0f 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -72,10 +72,10 @@ static unsigned long lowmem_deathpending_timeout;
 static unsigned long lowmem_count(struct shrinker *s,
 				  struct shrink_control *sc)
 {
-	return global_page_state(NR_ACTIVE_ANON) +
-		global_page_state(NR_ACTIVE_FILE) +
-		global_page_state(NR_INACTIVE_ANON) +
-		global_page_state(NR_INACTIVE_FILE);
+	return global_node_page_state(NR_ACTIVE_ANON) +
+		global_node_page_state(NR_ACTIVE_FILE) +
+		global_node_page_state(NR_INACTIVE_ANON) +
+		global_node_page_state(NR_INACTIVE_FILE);
 }
 
 static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c82794f20110..491a91717788 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 }
 
 long congestion_wait(int sync, long timeout);
-long wait_iff_congested(struct zone *zone, int sync, long timeout);
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos);
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1c4df4420258..6d2321c148cd 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -339,7 +339,7 @@ static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 	struct lruvec *lruvec;
 
 	if (mem_cgroup_disabled()) {
-		lruvec = &zone->lruvec;
+		lruvec = zone_lruvec(zone);
 		goto out;
 	}
 
@@ -348,15 +348,15 @@ static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 out:
 	/*
 	 * Since a node can be onlined after the mem_cgroup was created,
-	 * we have to be prepared to initialize lruvec->zone here;
+	 * we have to be prepared to initialize lruvec->pgdat here;
 	 * and if offlined then reonlined, we need to reinitialize it.
 	 */
-	if (unlikely(lruvec->zone != zone))
-		lruvec->zone = zone;
+	if (unlikely(lruvec->pgdat != zone->zone_pgdat))
+		lruvec->pgdat = zone->zone_pgdat;
 	return lruvec;
 }
 
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
+struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -437,7 +437,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-		int nr_pages);
+		enum zone_type zid, int nr_pages);
 
 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 					   int nid, unsigned int lru_mask);
@@ -612,13 +612,13 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
 static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 						    struct mem_cgroup *memcg)
 {
-	return &zone->lruvec;
+	return zone_lruvec(zone);
 }
 
 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-						    struct zone *zone)
+						    struct pglist_data *pgdat)
 {
-	return &zone->lruvec;
+	return &pgdat->lruvec;
 }
 
 static inline bool mm_match_cgroup(struct mm_struct *mm,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 5bd29ba4f174..9aadcc781857 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -23,25 +23,32 @@ static inline int page_is_file_cache(struct page *page)
 }
 
 static __always_inline void __update_lru_size(struct lruvec *lruvec,
-				enum lru_list lru, int nr_pages)
+				enum lru_list lru, enum zone_type zid,
+				int nr_pages)
 {
-	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	__mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+	__mod_zone_page_state(&pgdat->node_zones[zid],
+		NR_ZONE_LRU_BASE + !!is_file_lru(lru),
+		nr_pages);
 }
 
 static __always_inline void update_lru_size(struct lruvec *lruvec,
-				enum lru_list lru, int nr_pages)
+				enum lru_list lru, enum zone_type zid,
+				int nr_pages)
 {
 #ifdef CONFIG_MEMCG
-	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 #else
-	__update_lru_size(lruvec, lru, nr_pages);
+	__update_lru_size(lruvec, lru, zid, nr_pages);
 #endif
 }
 
 static __always_inline void add_page_to_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	update_lru_size(lruvec, lru, hpage_nr_pages(page));
+	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
 }
 
@@ -49,7 +56,7 @@ static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
 	list_del(&page->lru);
-	update_lru_size(lruvec, lru, -hpage_nr_pages(page));
+	update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
 }
 
 /**
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cfa870107abe..d4f5cac0a8c3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -111,12 +111,9 @@ enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
 	NR_ALLOC_BATCH,
-	NR_LRU_BASE,
-	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
-	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
-	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
-	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
-	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+	NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
+	NR_ZONE_LRU_ANON = NR_ZONE_LRU_BASE,
+	NR_ZONE_LRU_FILE,
 	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
@@ -134,12 +131,9 @@ enum zone_stat_item {
 	NR_VMSCAN_WRITE,
 	NR_VMSCAN_IMMEDIATE,	/* Prioritise for reclaim when writeback ends */
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
-	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
-	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 	NR_DIRTIED,		/* page dirtyings since bootup */
 	NR_WRITTEN,		/* page writings since bootup */
-	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	NR_ZSPAGES,		/* allocated in zsmalloc */
 #endif
@@ -161,6 +155,15 @@ enum zone_stat_item {
 	NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
+	NR_LRU_BASE,
+	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
+	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
+	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
+	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
+	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
+	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
 	NR_VM_NODE_STAT_ITEMS
 };
 
@@ -219,7 +222,7 @@ struct lruvec {
 	/* Evictions & activations on the inactive file list */
 	atomic_long_t			inactive_age;
 #ifdef CONFIG_MEMCG
-	struct zone			*zone;
+	struct pglist_data *pgdat;
 #endif
 };
 
@@ -357,13 +360,6 @@ struct zone {
 #ifdef CONFIG_NUMA
 	int node;
 #endif
-
-	/*
-	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-	 * this zone's LRU.  Maintained by the pageout code.
-	 */
-	unsigned int inactive_ratio;
-
 	struct pglist_data	*zone_pgdat;
 	struct per_cpu_pageset __percpu *pageset;
 
@@ -495,9 +491,6 @@ struct zone {
 
 	/* Write-intensive fields used by page reclaim */
 
-	/* Fields commonly accessed by the page reclaim scanner */
-	struct lruvec		lruvec;
-
 	/*
 	 * When free pages are below this point, additional steps are taken
 	 * when reading the number of free pages to avoid per-cpu counter
@@ -537,17 +530,20 @@ struct zone {
 
 enum zone_flags {
 	ZONE_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
-	ZONE_CONGESTED,			/* zone has many dirty pages backed by
+	ZONE_FAIR_DEPLETED,		/* fair zone policy batch depleted */
+};
+
+enum pgdat_flags {
+	PGDAT_CONGESTED,		/* pgdat has many dirty pages backed by
 					 * a congested BDI
 					 */
-	ZONE_DIRTY,			/* reclaim scanning has recently found
+	PGDAT_DIRTY,			/* reclaim scanning has recently found
 					 * many dirty file pages at the tail
 					 * of the LRU.
 					 */
-	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+	PGDAT_WRITEBACK,		/* reclaim scanning has recently found
 					 * many pages under writeback
 					 */
-	ZONE_FAIR_DEPLETED,		/* fair zone policy batch depleted */
 };
 
 static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -707,6 +703,19 @@ typedef struct pglist_data {
 	unsigned long split_queue_len;
 #endif
 
+	/* Fields commonly accessed by the page reclaim scanner */
+	struct lruvec		lruvec;
+
+	/*
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this node's LRU.  Maintained by the pageout code.
+	 */
+	unsigned int inactive_ratio;
+
+	unsigned long		flags;
+
+	ZONE_PADDING(_pad2_)
+
 	/* Per-node vmstats */
 	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
 	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
@@ -728,6 +737,11 @@ static inline spinlock_t *zone_lru_lock(struct zone *zone)
 	return &zone->zone_pgdat->lru_lock;
 }
 
+static inline struct lruvec *zone_lruvec(struct zone *zone)
+{
+	return &zone->zone_pgdat->lruvec;
+}
+
 static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
 {
 	return pgdat->node_start_pfn + pgdat->node_spanned_pages;
@@ -779,12 +793,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
 
 extern void lruvec_init(struct lruvec *lruvec);
 
-static inline struct zone *lruvec_zone(struct lruvec *lruvec)
+static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 {
 #ifdef CONFIG_MEMCG
-	return lruvec->zone;
+	return lruvec->pgdat;
 #else
-	return container_of(lruvec, struct zone, lruvec);
+	return container_of(lruvec, struct pglist_data, lruvec);
 #endif
 }
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0af2bb2028fd..c82f916008b7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -317,6 +317,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 42604173f122..1798ff542517 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -26,11 +26,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGFREE, PGACTIVATE, PGDEACTIVATE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
-		FOR_ALL_ZONES(PGREFILL),
-		FOR_ALL_ZONES(PGSTEAL_KSWAPD),
-		FOR_ALL_ZONES(PGSTEAL_DIRECT),
-		FOR_ALL_ZONES(PGSCAN_KSWAPD),
-		FOR_ALL_ZONES(PGSCAN_DIRECT),
+		PGREFILL,
+		PGSTEAL_KSWAPD,
+		PGSTEAL_DIRECT,
+		PGSCAN_KSWAPD,
+		PGSCAN_DIRECT,
 		PGSCAN_DIRECT_THROTTLE,
 #ifdef CONFIG_NUMA
 		PGSCAN_ZONE_RECLAIM_FAILED,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index d1744aa3ab9c..fee321c98550 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -178,6 +178,23 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 	return x;
 }
 
+static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
+					enum node_stat_item item)
+{
+	long x = atomic_long_read(&pgdat->vm_stat[item]);
+
+#ifdef CONFIG_SMP
+	int cpu;
+	for_each_online_cpu(cpu)
+		x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item];
+
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
+
 #ifdef CONFIG_NUMA
 extern unsigned long sum_zone_node_page_state(int node,
 						enum zone_stat_item item);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 0101ef37f1ee..897f1aa1ee5f 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -352,15 +352,14 @@ TRACE_EVENT(mm_vmscan_writepage,
 
 TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 
-	TP_PROTO(struct zone *zone,
+	TP_PROTO(int nid,
 		unsigned long nr_scanned, unsigned long nr_reclaimed,
 		int priority, int file),
 
-	TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file),
+	TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file),
 
 	TP_STRUCT__entry(
 		__field(int, nid)
-		__field(int, zid)
 		__field(unsigned long, nr_scanned)
 		__field(unsigned long, nr_reclaimed)
 		__field(int, priority)
@@ -368,16 +367,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 	),
 
 	TP_fast_assign(
-		__entry->nid = zone_to_nid(zone);
-		__entry->zid = zone_idx(zone);
+		__entry->nid = nid;
 		__entry->nr_scanned = nr_scanned;
 		__entry->nr_reclaimed = nr_reclaimed;
 		__entry->priority = priority;
 		__entry->reclaim_flags = trace_shrink_flags(file);
 	),
 
-	TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
-		__entry->nid, __entry->zid,
+	TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
+		__entry->nid,
 		__entry->nr_scanned, __entry->nr_reclaimed,
 		__entry->priority,
 		show_reclaim_flags(__entry->reclaim_flags))
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d90df926b59f..9a0178c2ac1d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable)
 	unsigned long size;
 
 	size = global_page_state(NR_SLAB_RECLAIMABLE)
-		+ global_page_state(NR_ACTIVE_ANON)
-		+ global_page_state(NR_INACTIVE_ANON)
-		+ global_page_state(NR_ACTIVE_FILE)
-		+ global_page_state(NR_INACTIVE_FILE)
-		- global_page_state(NR_FILE_MAPPED);
+		+ global_node_page_state(NR_ACTIVE_ANON)
+		+ global_node_page_state(NR_INACTIVE_ANON)
+		+ global_node_page_state(NR_ACTIVE_FILE)
+		+ global_node_page_state(NR_INACTIVE_FILE)
+		- global_node_page_state(NR_FILE_MAPPED);
 
 	return saveable <= size ? 0 : saveable - size;
 }
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ed173b8ae8f2..efe237742074 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout)
 EXPORT_SYMBOL(congestion_wait);
 
 /**
- * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
- * @zone: A zone to check if it is heavily congested
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
+ * @pgdat: A pgdat to check if it is heavily congested
  * @sync: SYNC or ASYNC IO
  * @timeout: timeout in jiffies
  *
  * In the event of a congested backing_dev (any backing_dev) and the given
- * @zone has experienced recent congestion, this waits for up to @timeout
+ * @pgdat has experienced recent congestion, this waits for up to @timeout
  * jiffies for either a BDI to exit congestion of the given @sync queue
  * or a write to complete.
  *
- * In the absence of zone congestion, cond_resched() is called to yield
+ * In the absence of pgdat congestion, cond_resched() is called to yield
  * the processor if necessary but otherwise does not sleep.
  *
  * The return value is 0 if the sleep is for the full timeout. Otherwise,
  * it is the number of jiffies that were still remaining when the function
  * returned. return_value == timeout implies the function did not sleep.
  */
-long wait_iff_congested(struct zone *zone, int sync, long timeout)
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
 {
 	long ret;
 	unsigned long start = jiffies;
@@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 
 	/*
 	 * If there is no congestion, or heavy congestion is not being
-	 * encountered in the current zone, yield if necessary instead
+	 * encountered in the current pgdat, yield if necessary instead
 	 * of sleeping on the congestion queue
 	 */
 	if (atomic_read(&nr_wb_congested[sync]) == 0 ||
-	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
+	    !test_bit(PGDAT_CONGESTED, &pgdat->flags)) {
 		cond_resched();
+
 		/* In case we scheduled, work out time remaining */
 		ret = timeout - (jiffies - start);
 		if (ret < 0)
diff --git a/mm/compaction.c b/mm/compaction.c
index 5c65fad3f330..e5995f38d677 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -646,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
 	list_for_each_entry(page, &cc->migratepages, lru)
 		count[!!page_is_file_cache(page)]++;
 
-	mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
-	mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
+	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -655,12 +655,12 @@ static bool too_many_isolated(struct zone *zone)
 {
 	unsigned long active, inactive, isolated;
 
-	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
-					zone_page_state(zone, NR_INACTIVE_ANON);
-	active = zone_page_state(zone, NR_ACTIVE_FILE) +
-					zone_page_state(zone, NR_ACTIVE_ANON);
-	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
-					zone_page_state(zone, NR_ISOLATED_ANON);
+	inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
+			node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
+	active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
+			node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
+	isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
+			node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
 
 	return isolated > (inactive + active) / 2;
 }
@@ -856,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			}
 		}
 
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 
 		/* Try isolate the page */
 		if (__isolate_lru_page(page, isolate_mode) != 0)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 99578b63814b..481fb0128d21 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1818,7 +1818,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	pgoff_t end = -1;
 	int i;
 
-	lruvec = mem_cgroup_page_lruvec(head, zone);
+	lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
 
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
diff --git a/mm/internal.h b/mm/internal.h
index 9b6a6c43ac39..2f80d0343c56 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -78,7 +78,7 @@ extern unsigned long highest_memmap_pfn;
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
-extern bool zone_reclaimable(struct zone *zone);
+extern bool pgdat_reclaimable(struct pglist_data *pgdat);
 
 /*
  * in mm/rmap.c:
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7dbee698d6aa..374237bb059d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -480,7 +480,7 @@ void __khugepaged_exit(struct mm_struct *mm)
 static void release_pte_page(struct page *page)
 {
 	/* 0 stands for page_is_file_cache(page) == false */
-	dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+	dec_node_page_state(page, NR_ISOLATED_ANON + 0);
 	unlock_page(page);
 	putback_lru_page(page);
 }
@@ -576,7 +576,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 		/* 0 stands for page_is_file_cache(page) == false */
-		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+		inc_node_page_state(page, NR_ISOLATED_ANON + 0);
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9b70f9ca8ddf..50c86ad121bc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -943,14 +943,14 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  * and putback protocol: the LRU lock must be held, and the page must
  * either be PageLRU() or the caller must have isolated/allocated it.
  */
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 {
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
 	if (mem_cgroup_disabled()) {
-		lruvec = &zone->lruvec;
+		lruvec = &pgdat->lruvec;
 		goto out;
 	}
 
@@ -970,8 +970,8 @@ out:
 	 * we have to be prepared to initialize lruvec->zone here;
 	 * and if offlined then reonlined, we need to reinitialize it.
 	 */
-	if (unlikely(lruvec->zone != zone))
-		lruvec->zone = zone;
+	if (unlikely(lruvec->pgdat != pgdat))
+		lruvec->pgdat = pgdat;
 	return lruvec;
 }
 
@@ -979,6 +979,7 @@ out:
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
  * @lruvec: mem_cgroup per zone lru vector
  * @lru: index of lru list the page is sitting on
+ * @zid: Zone ID of the zone pages have been added to
  * @nr_pages: positive when adding or negative when removing
  *
  * This function must be called under lru_lock, just before a page is added
@@ -986,14 +987,14 @@ out:
  * so as to allow it to check that lru_size 0 is consistent with list_empty).
  */
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-				int nr_pages)
+				enum zone_type zid, int nr_pages)
 {
 	struct mem_cgroup_per_zone *mz;
 	unsigned long *lru_size;
 	long size;
 	bool empty;
 
-	__update_lru_size(lruvec, lru, nr_pages);
+	__update_lru_size(lruvec, lru, zid, nr_pages);
 
 	if (mem_cgroup_disabled())
 		return;
@@ -2069,7 +2070,7 @@ static void lock_page_lru(struct page *page, int *isolated)
 	if (PageLRU(page)) {
 		struct lruvec *lruvec;
 
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		ClearPageLRU(page);
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 		*isolated = 1;
@@ -2084,7 +2085,7 @@ static void unlock_page_lru(struct page *page, int isolated)
 	if (isolated) {
 		struct lruvec *lruvec;
 
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 		SetPageLRU(page);
 		add_page_to_lru_list(page, lruvec, page_lru(page));
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2fcca6b0e005..11de752ccaf5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1663,7 +1663,7 @@ static int __soft_offline_page(struct page *page, int flags)
 	put_hwpoison_page(page);
 	if (!ret) {
 		LIST_HEAD(pagelist);
-		inc_zone_page_state(page, NR_ISOLATED_ANON +
+		inc_node_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
@@ -1671,7 +1671,7 @@ static int __soft_offline_page(struct page *page, int flags)
 		if (ret) {
 			if (!list_empty(&pagelist)) {
 				list_del(&page->lru);
-				dec_zone_page_state(page, NR_ISOLATED_ANON +
+				dec_node_page_state(page, NR_ISOLATED_ANON +
 						page_is_file_cache(page));
 				putback_lru_page(page);
 			}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 82d0b98d27f8..c5278360ca66 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1586,7 +1586,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			put_page(page);
 			list_add_tail(&page->lru, &source);
 			move_pages--;
-			inc_zone_page_state(page, NR_ISOLATED_ANON +
+			inc_node_page_state(page, NR_ISOLATED_ANON +
 					    page_is_file_cache(page));
 
 		} else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 53e40d3f3933..d8c4e38fb5f4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -962,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 		if (!isolate_lru_page(page)) {
 			list_add_tail(&page->lru, pagelist);
-			inc_zone_page_state(page, NR_ISOLATED_ANON +
+			inc_node_page_state(page, NR_ISOLATED_ANON +
 					    page_is_file_cache(page));
 		}
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index 2232f6923cc7..3033dae33a0a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -168,7 +168,7 @@ void putback_movable_pages(struct list_head *l)
 			continue;
 		}
 		list_del(&page->lru);
-		dec_zone_page_state(page, NR_ISOLATED_ANON +
+		dec_node_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
 		/*
 		 * We isolated non-lru movable page so here we can use
@@ -1119,7 +1119,7 @@ out:
 		 * restored.
 		 */
 		list_del(&page->lru);
-		dec_zone_page_state(page, NR_ISOLATED_ANON +
+		dec_node_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
 	}
 
@@ -1460,7 +1460,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
 		err = isolate_lru_page(page);
 		if (!err) {
 			list_add_tail(&page->lru, &pagelist);
-			inc_zone_page_state(page, NR_ISOLATED_ANON +
+			inc_node_page_state(page, NR_ISOLATED_ANON +
 					    page_is_file_cache(page));
 		}
 put_and_set:
@@ -1726,15 +1726,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
 				   unsigned long nr_migrate_pages)
 {
 	int z;
+
+	if (!pgdat_reclaimable(pgdat))
+		return false;
+
 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
 		struct zone *zone = pgdat->node_zones + z;
 
 		if (!populated_zone(zone))
 			continue;
 
-		if (!zone_reclaimable(zone))
-			continue;
-
 		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
 		if (!zone_watermark_ok(zone, 0,
 				       high_wmark_pages(zone) +
@@ -1828,7 +1829,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 	}
 
 	page_lru = page_is_file_cache(page);
-	mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
 				hpage_nr_pages(page));
 
 	/*
@@ -1886,7 +1887,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
-			dec_zone_page_state(page, NR_ISOLATED_ANON +
+			dec_node_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
 			putback_lru_page(page);
 		}
@@ -1979,7 +1980,7 @@ fail_putback:
 		/* Retake the callers reference and putback on LRU */
 		get_page(page);
 		putback_lru_page(page);
-		mod_zone_page_state(page_zone(page),
+		mod_node_page_state(page_pgdat(page),
 			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
 
 		goto out_unlock;
@@ -2030,7 +2031,7 @@ fail_putback:
 	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 
-	mod_zone_page_state(page_zone(page),
+	mod_node_page_state(page_pgdat(page),
 			NR_ISOLATED_ANON + page_lru,
 			-HPAGE_PMD_NR);
 	return isolated;
diff --git a/mm/mlock.c b/mm/mlock.c
index 997f63082ff5..14645be06e30 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
 	if (PageLRU(page)) {
 		struct lruvec *lruvec;
 
-		lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 		if (getpage)
 			get_page(page);
 		ClearPageLRU(page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d578d2a56b19..0ada2b2954b0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -285,8 +285,8 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
 	 */
 	nr_pages -= min(nr_pages, zone->totalreserve_pages);
 
-	nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
-	nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+	nr_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE);
+	nr_pages += node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE);
 
 	return nr_pages;
 }
@@ -348,8 +348,8 @@ static unsigned long global_dirtyable_memory(void)
 	 */
 	x -= min(x, totalreserve_pages);
 
-	x += global_page_state(NR_INACTIVE_FILE);
-	x += global_page_state(NR_ACTIVE_FILE);
+	x += global_node_page_state(NR_INACTIVE_FILE);
+	x += global_node_page_state(NR_ACTIVE_FILE);
 
 	if (!vm_highmem_is_dirtyable)
 		x -= highmem_dirtyable_memory(x);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5760c626c309..35e2d0f9d44f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1078,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 
 	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
-	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
 	if (nr_scanned)
-		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
 	while (count) {
 		struct page *page;
@@ -1135,9 +1135,9 @@ static void free_one_page(struct zone *zone,
 {
 	unsigned long nr_scanned;
 	spin_lock(&zone->lock);
-	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
 	if (nr_scanned)
-		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
 	if (unlikely(has_isolate_pageblock(zone) ||
 		is_migrate_isolate(migratetype))) {
@@ -4288,6 +4288,7 @@ void show_free_areas(unsigned int filter)
 	unsigned long free_pcp = 0;
 	int cpu;
 	struct zone *zone;
+	pg_data_t *pgdat;
 
 	for_each_populated_zone(zone) {
 		if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4306,13 +4307,13 @@ void show_free_areas(unsigned int filter)
 		" anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n"
 #endif
 		" free:%lu free_pcp:%lu free_cma:%lu\n",
-		global_page_state(NR_ACTIVE_ANON),
-		global_page_state(NR_INACTIVE_ANON),
-		global_page_state(NR_ISOLATED_ANON),
-		global_page_state(NR_ACTIVE_FILE),
-		global_page_state(NR_INACTIVE_FILE),
-		global_page_state(NR_ISOLATED_FILE),
-		global_page_state(NR_UNEVICTABLE),
+		global_node_page_state(NR_ACTIVE_ANON),
+		global_node_page_state(NR_INACTIVE_ANON),
+		global_node_page_state(NR_ISOLATED_ANON),
+		global_node_page_state(NR_ACTIVE_FILE),
+		global_node_page_state(NR_INACTIVE_FILE),
+		global_node_page_state(NR_ISOLATED_FILE),
+		global_node_page_state(NR_UNEVICTABLE),
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
 		global_page_state(NR_UNSTABLE_NFS),
@@ -4331,6 +4332,28 @@ void show_free_areas(unsigned int filter)
 		free_pcp,
 		global_page_state(NR_FREE_CMA_PAGES));
 
+	for_each_online_pgdat(pgdat) {
+		printk("Node %d"
+			" active_anon:%lukB"
+			" inactive_anon:%lukB"
+			" active_file:%lukB"
+			" inactive_file:%lukB"
+			" unevictable:%lukB"
+			" isolated(anon):%lukB"
+			" isolated(file):%lukB"
+			" all_unreclaimable? %s"
+			"\n",
+			pgdat->node_id,
+			K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+			K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+			K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+			K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+			K(node_page_state(pgdat, NR_UNEVICTABLE)),
+			K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+			K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+			!pgdat_reclaimable(pgdat) ? "yes" : "no");
+	}
+
 	for_each_populated_zone(zone) {
 		int i;
 
@@ -4347,13 +4370,6 @@ void show_free_areas(unsigned int filter)
 			" min:%lukB"
 			" low:%lukB"
 			" high:%lukB"
-			" active_anon:%lukB"
-			" inactive_anon:%lukB"
-			" active_file:%lukB"
-			" inactive_file:%lukB"
-			" unevictable:%lukB"
-			" isolated(anon):%lukB"
-			" isolated(file):%lukB"
 			" present:%lukB"
 			" managed:%lukB"
 			" mlocked:%lukB"
@@ -4376,21 +4392,13 @@ void show_free_areas(unsigned int filter)
 			" local_pcp:%ukB"
 			" free_cma:%lukB"
 			" writeback_tmp:%lukB"
-			" pages_scanned:%lu"
-			" all_unreclaimable? %s"
+			" node_pages_scanned:%lu"
 			"\n",
 			zone->name,
 			K(zone_page_state(zone, NR_FREE_PAGES)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
-			K(zone_page_state(zone, NR_ACTIVE_ANON)),
-			K(zone_page_state(zone, NR_INACTIVE_ANON)),
-			K(zone_page_state(zone, NR_ACTIVE_FILE)),
-			K(zone_page_state(zone, NR_INACTIVE_FILE)),
-			K(zone_page_state(zone, NR_UNEVICTABLE)),
-			K(zone_page_state(zone, NR_ISOLATED_ANON)),
-			K(zone_page_state(zone, NR_ISOLATED_FILE)),
 			K(zone->present_pages),
 			K(zone->managed_pages),
 			K(zone_page_state(zone, NR_MLOCK)),
@@ -4415,9 +4423,7 @@ void show_free_areas(unsigned int filter)
 			K(this_cpu_read(zone->pageset->pcp.count)),
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-			K(zone_page_state(zone, NR_PAGES_SCANNED)),
-			(!zone_reclaimable(zone) ? "yes" : "no")
-			);
+			K(node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED)));
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
 			printk(" %ld", zone->lowmem_reserve[i]);
@@ -5967,7 +5973,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		/* For bootup, initialized properly in watermark setup */
 		mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
 
-		lruvec_init(&zone->lruvec);
+		lruvec_init(zone_lruvec(zone));
 		if (!size)
 			continue;
 
diff --git a/mm/swap.c b/mm/swap.c
index bf37e5cfae81..77af473635fe 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -63,7 +63,7 @@ static void __page_cache_release(struct page *page)
 		unsigned long flags;
 
 		spin_lock_irqsave(zone_lru_lock(zone), flags);
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		VM_BUG_ON_PAGE(!PageLRU(page), page);
 		__ClearPageLRU(page);
 		del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -194,7 +194,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 			spin_lock_irqsave(zone_lru_lock(zone), flags);
 		}
 
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		(*move_fn)(page, lruvec, arg);
 	}
 	if (zone)
@@ -319,7 +319,7 @@ void activate_page(struct page *page)
 
 	page = compound_head(page);
 	spin_lock_irq(zone_lru_lock(zone));
-	__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
+	__activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
 	spin_unlock_irq(zone_lru_lock(zone));
 }
 #endif
@@ -445,16 +445,16 @@ void lru_cache_add(struct page *page)
  */
 void add_page_to_unevictable_list(struct page *page)
 {
-	struct zone *zone = page_zone(page);
+	struct pglist_data *pgdat = page_pgdat(page);
 	struct lruvec *lruvec;
 
-	spin_lock_irq(zone_lru_lock(zone));
-	lruvec = mem_cgroup_page_lruvec(page, zone);
+	spin_lock_irq(&pgdat->lru_lock);
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
 	ClearPageActive(page);
 	SetPageUnevictable(page);
 	SetPageLRU(page);
 	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
-	spin_unlock_irq(zone_lru_lock(zone));
+	spin_unlock_irq(&pgdat->lru_lock);
 }
 
 /**
@@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold)
 {
 	int i;
 	LIST_HEAD(pages_to_free);
-	struct zone *zone = NULL;
+	struct pglist_data *locked_pgdat = NULL;
 	struct lruvec *lruvec;
 	unsigned long uninitialized_var(flags);
 	unsigned int uninitialized_var(lock_batch);
@@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold)
 		/*
 		 * Make sure the IRQ-safe lock-holding time does not get
 		 * excessive with a continuous string of pages from the
-		 * same zone. The lock is held only if zone != NULL.
+		 * same pgdat. The lock is held only if pgdat != NULL.
 		 */
-		if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
-			spin_unlock_irqrestore(zone_lru_lock(zone), flags);
-			zone = NULL;
+		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
+			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+			locked_pgdat = NULL;
 		}
 
 		if (is_huge_zero_page(page)) {
@@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold)
 			continue;
 
 		if (PageCompound(page)) {
-			if (zone) {
-				spin_unlock_irqrestore(zone_lru_lock(zone), flags);
-				zone = NULL;
+			if (locked_pgdat) {
+				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+				locked_pgdat = NULL;
 			}
 			__put_compound_page(page);
 			continue;
 		}
 
 		if (PageLRU(page)) {
-			struct zone *pagezone = page_zone(page);
+			struct pglist_data *pgdat = page_pgdat(page);
 
-			if (pagezone != zone) {
-				if (zone)
-					spin_unlock_irqrestore(zone_lru_lock(zone),
+			if (pgdat != locked_pgdat) {
+				if (locked_pgdat)
+					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
 									flags);
 				lock_batch = 0;
-				zone = pagezone;
-				spin_lock_irqsave(zone_lru_lock(zone), flags);
+				locked_pgdat = pgdat;
+				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
 			}
 
-			lruvec = mem_cgroup_page_lruvec(page, zone);
+			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
 			VM_BUG_ON_PAGE(!PageLRU(page), page);
 			__ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold)
 
 		list_add(&page->lru, &pages_to_free);
 	}
-	if (zone)
-		spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+	if (locked_pgdat)
+		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
 
 	mem_cgroup_uncharge_list(&pages_to_free);
 	free_hot_cold_page_list(&pages_to_free, cold);
@@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
 	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
 	VM_BUG_ON(NR_CPUS != 1 &&
-		  !spin_is_locked(zone_lru_lock(lruvec_zone(lruvec))));
+		  !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
 
 	if (!list)
 		SetPageLRU(page_tail);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e7ffcd259cc4..86a523a761c9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,26 +191,42 @@ static bool sane_reclaim(struct scan_control *sc)
 }
 #endif
 
+/*
+ * This misses isolated pages which are not accounted for to save counters.
+ * As the data only determines if reclaim or compaction continues, it is
+ * not expected that isolated pages will be a dominating factor.
+ */
 unsigned long zone_reclaimable_pages(struct zone *zone)
 {
 	unsigned long nr;
 
-	nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
-	     zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
-	     zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
+	nr = zone_page_state_snapshot(zone, NR_ZONE_LRU_FILE);
+	if (get_nr_swap_pages() > 0)
+		nr += zone_page_state_snapshot(zone, NR_ZONE_LRU_ANON);
+
+	return nr;
+}
+
+unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
+{
+	unsigned long nr;
+
+	nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
+	     node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
+	     node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
 
 	if (get_nr_swap_pages() > 0)
-		nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
-		      zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
-		      zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+		nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
+		      node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
+		      node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
 
 	return nr;
 }
 
-bool zone_reclaimable(struct zone *zone)
+bool pgdat_reclaimable(struct pglist_data *pgdat)
 {
-	return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
-		zone_reclaimable_pages(zone) * 6;
+	return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
+		pgdat_reclaimable_pages(pgdat) * 6;
 }
 
 unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -218,7 +234,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
 	if (!mem_cgroup_disabled())
 		return mem_cgroup_get_lru_size(lruvec, lru);
 
-	return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
+	return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 }
 
 /*
@@ -877,7 +893,7 @@ static void page_check_dirty_writeback(struct page *page,
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-				      struct zone *zone,
+				      struct pglist_data *pgdat,
 				      struct scan_control *sc,
 				      enum ttu_flags ttu_flags,
 				      unsigned long *ret_nr_dirty,
@@ -917,7 +933,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep;
 
 		VM_BUG_ON_PAGE(PageActive(page), page);
-		VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 
 		sc->nr_scanned++;
 
@@ -996,7 +1011,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Case 1 above */
 			if (current_is_kswapd() &&
 			    PageReclaim(page) &&
-			    test_bit(ZONE_WRITEBACK, &zone->flags)) {
+			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
 				nr_immediate++;
 				goto keep_locked;
 
@@ -1092,7 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 */
 			if (page_is_file_cache(page) &&
 					(!current_is_kswapd() ||
-					 !test_bit(ZONE_DIRTY, &zone->flags))) {
+					 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
 				/*
 				 * Immediately reclaim when written back.
 				 * Similar in principal to deactivate_page()
@@ -1266,11 +1281,11 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 		}
 	}
 
-	ret = shrink_page_list(&clean_pages, zone, &sc,
+	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
 			TTU_UNMAP|TTU_IGNORE_ACCESS,
 			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
 	list_splice(&clean_pages, page_list);
-	mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
 	return ret;
 }
 
@@ -1375,7 +1390,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 {
 	struct list_head *src = &lruvec->lists[lru];
 	unsigned long nr_taken = 0;
-	unsigned long scan;
+	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+	unsigned long scan, nr_pages;
 
 	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
 					!list_empty(src); scan++) {
@@ -1388,7 +1404,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 		switch (__isolate_lru_page(page, mode)) {
 		case 0:
-			nr_taken += hpage_nr_pages(page);
+			nr_pages = hpage_nr_pages(page);
+			nr_taken += nr_pages;
+			nr_zone_taken[page_zonenum(page)] += nr_pages;
 			list_move(&page->lru, dst);
 			break;
 
@@ -1405,6 +1423,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	*nr_scanned = scan;
 	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
 				    nr_taken, mode, is_file_lru(lru));
+	for (scan = 0; scan < MAX_NR_ZONES; scan++) {
+		nr_pages = nr_zone_taken[scan];
+		if (!nr_pages)
+			continue;
+
+		update_lru_size(lruvec, lru, scan, -nr_pages);
+	}
 	return nr_taken;
 }
 
@@ -1445,7 +1470,7 @@ int isolate_lru_page(struct page *page)
 		struct lruvec *lruvec;
 
 		spin_lock_irq(zone_lru_lock(zone));
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 		if (PageLRU(page)) {
 			int lru = page_lru(page);
 			get_page(page);
@@ -1465,7 +1490,7 @@ int isolate_lru_page(struct page *page)
  * the LRU list will go small and be scanned faster than necessary, leading to
  * unnecessary swapping, thrashing and OOM.
  */
-static int too_many_isolated(struct zone *zone, int file,
+static int too_many_isolated(struct pglist_data *pgdat, int file,
 		struct scan_control *sc)
 {
 	unsigned long inactive, isolated;
@@ -1477,11 +1502,11 @@ static int too_many_isolated(struct zone *zone, int file,
 		return 0;
 
 	if (file) {
-		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
+		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
 	} else {
-		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
+		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
 	}
 
 	/*
@@ -1499,7 +1524,7 @@ static noinline_for_stack void
 putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 {
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-	struct zone *zone = lruvec_zone(lruvec);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	LIST_HEAD(pages_to_free);
 
 	/*
@@ -1512,13 +1537,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 		list_del(&page->lru);
 		if (unlikely(!page_evictable(page))) {
-			spin_unlock_irq(zone_lru_lock(zone));
+			spin_unlock_irq(&pgdat->lru_lock);
 			putback_lru_page(page);
-			spin_lock_irq(zone_lru_lock(zone));
+			spin_lock_irq(&pgdat->lru_lock);
 			continue;
 		}
 
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
 		SetPageLRU(page);
 		lru = page_lru(page);
@@ -1535,10 +1560,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 			del_page_from_lru_list(page, lruvec, lru);
 
 			if (unlikely(PageCompound(page))) {
-				spin_unlock_irq(zone_lru_lock(zone));
+				spin_unlock_irq(&pgdat->lru_lock);
 				mem_cgroup_uncharge(page);
 				(*get_compound_page_dtor(page))(page);
-				spin_lock_irq(zone_lru_lock(zone));
+				spin_lock_irq(&pgdat->lru_lock);
 			} else
 				list_add(&page->lru, &pages_to_free);
 		}
@@ -1582,10 +1607,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	unsigned long nr_immediate = 0;
 	isolate_mode_t isolate_mode = 0;
 	int file = is_file_lru(lru);
-	struct zone *zone = lruvec_zone(lruvec);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
-	while (unlikely(too_many_isolated(zone, file, sc))) {
+	while (unlikely(too_many_isolated(pgdat, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/* We are about to die and free our memory. Return now. */
@@ -1600,48 +1625,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (!sc->may_writepage)
 		isolate_mode |= ISOLATE_CLEAN;
 
-	spin_lock_irq(zone_lru_lock(zone));
+	spin_lock_irq(&pgdat->lru_lock);
 
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
 				     &nr_scanned, sc, isolate_mode, lru);
 
-	update_lru_size(lruvec, lru, -nr_taken);
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	if (global_reclaim(sc)) {
-		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+		__mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
 		if (current_is_kswapd())
-			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+			__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
 		else
-			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
+			__count_vm_events(PGSCAN_DIRECT, nr_scanned);
 	}
-	spin_unlock_irq(zone_lru_lock(zone));
+	spin_unlock_irq(&pgdat->lru_lock);
 
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
 				&nr_dirty, &nr_unqueued_dirty, &nr_congested,
 				&nr_writeback, &nr_immediate,
 				false);
 
-	spin_lock_irq(zone_lru_lock(zone));
+	spin_lock_irq(&pgdat->lru_lock);
 
 	if (global_reclaim(sc)) {
 		if (current_is_kswapd())
-			__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
-					       nr_reclaimed);
+			__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
 		else
-			__count_zone_vm_events(PGSTEAL_DIRECT, zone,
-					       nr_reclaimed);
+			__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
 	}
 
 	putback_inactive_pages(lruvec, &page_list);
 
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 
-	spin_unlock_irq(zone_lru_lock(zone));
+	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&page_list);
 	free_hot_cold_page_list(&page_list, true);
@@ -1661,7 +1683,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * are encountered in the nr_immediate check below.
 	 */
 	if (nr_writeback && nr_writeback == nr_taken)
-		set_bit(ZONE_WRITEBACK, &zone->flags);
+		set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
 	/*
 	 * Legacy memcg will stall in page writeback so avoid forcibly
@@ -1673,16 +1695,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		 * backed by a congested BDI and wait_iff_congested will stall.
 		 */
 		if (nr_dirty && nr_dirty == nr_congested)
-			set_bit(ZONE_CONGESTED, &zone->flags);
+			set_bit(PGDAT_CONGESTED, &pgdat->flags);
 
 		/*
 		 * If dirty pages are scanned that are not queued for IO, it
 		 * implies that flushers are not keeping up. In this case, flag
-		 * the zone ZONE_DIRTY and kswapd will start writing pages from
+		 * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
 		 * reclaim context.
 		 */
 		if (nr_unqueued_dirty == nr_taken)
-			set_bit(ZONE_DIRTY, &zone->flags);
+			set_bit(PGDAT_DIRTY, &pgdat->flags);
 
 		/*
 		 * If kswapd scans pages marked marked for immediate
@@ -1701,9 +1723,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 */
 	if (!sc->hibernation_mode && !current_is_kswapd() &&
 	    current_may_throttle())
-		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
 
-	trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
+	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+			nr_scanned, nr_reclaimed,
 			sc->priority, file);
 	return nr_reclaimed;
 }
@@ -1731,20 +1754,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 				     struct list_head *pages_to_free,
 				     enum lru_list lru)
 {
-	struct zone *zone = lruvec_zone(lruvec);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	unsigned long pgmoved = 0;
 	struct page *page;
 	int nr_pages;
 
 	while (!list_empty(list)) {
 		page = lru_to_page(list);
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 		SetPageLRU(page);
 
 		nr_pages = hpage_nr_pages(page);
-		update_lru_size(lruvec, lru, nr_pages);
+		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
 		list_move(&page->lru, &lruvec->lists[lru]);
 		pgmoved += nr_pages;
 
@@ -1754,10 +1777,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 			del_page_from_lru_list(page, lruvec, lru);
 
 			if (unlikely(PageCompound(page))) {
-				spin_unlock_irq(zone_lru_lock(zone));
+				spin_unlock_irq(&pgdat->lru_lock);
 				mem_cgroup_uncharge(page);
 				(*get_compound_page_dtor(page))(page);
-				spin_lock_irq(zone_lru_lock(zone));
+				spin_lock_irq(&pgdat->lru_lock);
 			} else
 				list_add(&page->lru, pages_to_free);
 		}
@@ -1783,7 +1806,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	unsigned long nr_rotated = 0;
 	isolate_mode_t isolate_mode = 0;
 	int file = is_file_lru(lru);
-	struct zone *zone = lruvec_zone(lruvec);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
 	lru_add_drain();
 
@@ -1792,20 +1815,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	if (!sc->may_writepage)
 		isolate_mode |= ISOLATE_CLEAN;
 
-	spin_lock_irq(zone_lru_lock(zone));
+	spin_lock_irq(&pgdat->lru_lock);
 
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
 				     &nr_scanned, sc, isolate_mode, lru);
 
-	update_lru_size(lruvec, lru, -nr_taken);
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	if (global_reclaim(sc))
-		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
-	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
+		__mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
+	__count_vm_events(PGREFILL, nr_scanned);
 
-	spin_unlock_irq(zone_lru_lock(zone));
+	spin_unlock_irq(&pgdat->lru_lock);
 
 	while (!list_empty(&l_hold)) {
 		cond_resched();
@@ -1850,7 +1872,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	/*
 	 * Move pages back to the lru list.
 	 */
-	spin_lock_irq(zone_lru_lock(zone));
+	spin_lock_irq(&pgdat->lru_lock);
 	/*
 	 * Count referenced pages from currently used mappings as rotated,
 	 * even though only some of them are actually re-activated.  This
@@ -1861,8 +1883,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
 	move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
 	move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
-	spin_unlock_irq(zone_lru_lock(zone));
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&l_hold);
 	free_hot_cold_page_list(&l_hold, true);
@@ -1956,7 +1978,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 	u64 fraction[2];
 	u64 denominator = 0;	/* gcc */
-	struct zone *zone = lruvec_zone(lruvec);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	unsigned long anon_prio, file_prio;
 	enum scan_balance scan_balance;
 	unsigned long anon, file;
@@ -1977,7 +1999,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * well.
 	 */
 	if (current_is_kswapd()) {
-		if (!zone_reclaimable(zone))
+		if (!pgdat_reclaimable(pgdat))
 			force_scan = true;
 		if (!mem_cgroup_online(memcg))
 			force_scan = true;
@@ -2023,14 +2045,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * anon pages.  Try to detect this based on file LRU size.
 	 */
 	if (global_reclaim(sc)) {
-		unsigned long zonefile;
-		unsigned long zonefree;
+		unsigned long pgdatfile;
+		unsigned long pgdatfree;
+		int z;
+		unsigned long total_high_wmark = 0;
 
-		zonefree = zone_page_state(zone, NR_FREE_PAGES);
-		zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
-			   zone_page_state(zone, NR_INACTIVE_FILE);
+		pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+		pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
+			   node_page_state(pgdat, NR_INACTIVE_FILE);
+
+		for (z = 0; z < MAX_NR_ZONES; z++) {
+			struct zone *zone = &pgdat->node_zones[z];
+			if (!populated_zone(zone))
+				continue;
+
+			total_high_wmark += high_wmark_pages(zone);
+		}
 
-		if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
+		if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
 			scan_balance = SCAN_ANON;
 			goto out;
 		}
@@ -2077,7 +2109,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
 		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
 
-	spin_lock_irq(zone_lru_lock(zone));
+	spin_lock_irq(&pgdat->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
 		reclaim_stat->recent_scanned[0] /= 2;
 		reclaim_stat->recent_rotated[0] /= 2;
@@ -2098,7 +2130,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 
 	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
 	fp /= reclaim_stat->recent_rotated[1] + 1;
-	spin_unlock_irq(zone_lru_lock(zone));
+	spin_unlock_irq(&pgdat->lru_lock);
 
 	fraction[0] = ap;
 	fraction[1] = fp;
@@ -2352,9 +2384,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	 * inactive lists are large enough, continue reclaiming
 	 */
 	pages_for_compaction = (2UL << sc->order);
-	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+	inactive_lru_pages = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE);
 	if (get_nr_swap_pages() > 0)
-		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
+		inactive_lru_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
 	if (sc->nr_reclaimed < pages_for_compaction &&
 			inactive_lru_pages > pages_for_compaction)
 		return true;
@@ -2554,7 +2586,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 				continue;
 
 			if (sc->priority != DEF_PRIORITY &&
-			    !zone_reclaimable(zone))
+			    !pgdat_reclaimable(zone->zone_pgdat))
 				continue;	/* Let kswapd poll it */
 
 			/*
@@ -2692,7 +2724,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 	for (i = 0; i <= ZONE_NORMAL; i++) {
 		zone = &pgdat->node_zones[i];
 		if (!populated_zone(zone) ||
-		    zone_reclaimable_pages(zone) == 0)
+		    pgdat_reclaimable_pages(pgdat) == 0)
 			continue;
 
 		pfmemalloc_reserve += min_wmark_pages(zone);
@@ -3000,7 +3032,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		 * DEF_PRIORITY. Effectively, it considers them balanced so
 		 * they must be considered balanced here as well!
 		 */
-		if (!zone_reclaimable(zone)) {
+		if (!pgdat_reclaimable(zone->zone_pgdat)) {
 			balanced_pages += zone->managed_pages;
 			continue;
 		}
@@ -3063,6 +3095,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 {
 	unsigned long balance_gap;
 	bool lowmem_pressure;
+	struct pglist_data *pgdat = zone->zone_pgdat;
 
 	/* Reclaim above the high watermark. */
 	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
@@ -3087,7 +3120,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 
 	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 
-	clear_bit(ZONE_WRITEBACK, &zone->flags);
+	/* TODO: ANOMALY */
+	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
 	/*
 	 * If a zone reaches its high watermark, consider it to be no longer
@@ -3095,10 +3129,10 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * BDIs but as pressure is relieved, speculatively avoid congestion
 	 * waits.
 	 */
-	if (zone_reclaimable(zone) &&
+	if (pgdat_reclaimable(zone->zone_pgdat) &&
 	    zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
-		clear_bit(ZONE_CONGESTED, &zone->flags);
-		clear_bit(ZONE_DIRTY, &zone->flags);
+		clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+		clear_bit(PGDAT_DIRTY, &pgdat->flags);
 	}
 
 	return sc->nr_scanned >= sc->nr_to_reclaim;
@@ -3157,7 +3191,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 				continue;
 
 			if (sc.priority != DEF_PRIORITY &&
-			    !zone_reclaimable(zone))
+			    !pgdat_reclaimable(zone->zone_pgdat))
 				continue;
 
 			/*
@@ -3184,9 +3218,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 				/*
 				 * If balanced, clear the dirty and congested
 				 * flags
+				 *
+				 * TODO: ANOMALY
 				 */
-				clear_bit(ZONE_CONGESTED, &zone->flags);
-				clear_bit(ZONE_DIRTY, &zone->flags);
+				clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
+				clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
 			}
 		}
 
@@ -3216,7 +3252,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 				continue;
 
 			if (sc.priority != DEF_PRIORITY &&
-			    !zone_reclaimable(zone))
+			    !pgdat_reclaimable(zone->zone_pgdat))
 				continue;
 
 			sc.nr_scanned = 0;
@@ -3612,8 +3648,8 @@ int sysctl_min_slab_ratio = 5;
 static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
 {
 	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
-	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
-		zone_page_state(zone, NR_ACTIVE_FILE);
+	unsigned long file_lru = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
+		node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE);
 
 	/*
 	 * It's possible for there to be more file mapped pages than
@@ -3716,7 +3752,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
 		return ZONE_RECLAIM_FULL;
 
-	if (!zone_reclaimable(zone))
+	if (!pgdat_reclaimable(zone->zone_pgdat))
 		return ZONE_RECLAIM_FULL;
 
 	/*
@@ -3795,7 +3831,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 			zone = pagezone;
 			spin_lock_irq(zone_lru_lock(zone));
 		}
-		lruvec = mem_cgroup_page_lruvec(page, zone);
+		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3345d396a99b..de0c17076270 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -936,11 +936,8 @@ const char * const vmstat_text[] = {
 	/* enum zone_stat_item countes */
 	"nr_free_pages",
 	"nr_alloc_batch",
-	"nr_inactive_anon",
-	"nr_active_anon",
-	"nr_inactive_file",
-	"nr_active_file",
-	"nr_unevictable",
+	"nr_zone_anon_lru",
+	"nr_zone_file_lru",
 	"nr_mlock",
 	"nr_anon_pages",
 	"nr_mapped",
@@ -956,12 +953,9 @@ const char * const vmstat_text[] = {
 	"nr_vmscan_write",
 	"nr_vmscan_immediate_reclaim",
 	"nr_writeback_temp",
-	"nr_isolated_anon",
-	"nr_isolated_file",
 	"nr_shmem",
 	"nr_dirtied",
 	"nr_written",
-	"nr_pages_scanned",
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	"nr_zspages",
 #endif
@@ -981,6 +975,16 @@ const char * const vmstat_text[] = {
 	"nr_shmem_pmdmapped",
 	"nr_free_cma",
 
+	/* Node-based counters */
+	"nr_inactive_anon",
+	"nr_active_anon",
+	"nr_inactive_file",
+	"nr_active_file",
+	"nr_unevictable",
+	"nr_isolated_anon",
+	"nr_isolated_file",
+	"nr_pages_scanned",
+
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
 	"nr_dirty_background_threshold",
@@ -1002,11 +1006,11 @@ const char * const vmstat_text[] = {
 	"pgmajfault",
 	"pglazyfreed",
 
-	TEXTS_FOR_ZONES("pgrefill")
-	TEXTS_FOR_ZONES("pgsteal_kswapd")
-	TEXTS_FOR_ZONES("pgsteal_direct")
-	TEXTS_FOR_ZONES("pgscan_kswapd")
-	TEXTS_FOR_ZONES("pgscan_direct")
+	"pgrefill",
+	"pgsteal_kswapd",
+	"pgsteal_direct",
+	"pgscan_kswapd",
+	"pgscan_direct",
 	"pgscan_direct_throttle",
 
 #ifdef CONFIG_NUMA
@@ -1434,7 +1438,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        min      %lu"
 		   "\n        low      %lu"
 		   "\n        high     %lu"
-		   "\n        scanned  %lu"
+		   "\n   node_scanned  %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu"
 		   "\n        managed  %lu",
@@ -1442,13 +1446,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   zone_page_state(zone, NR_PAGES_SCANNED),
+		   node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
 		   zone->spanned_pages,
 		   zone->present_pages,
 		   zone->managed_pages);
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
+		seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
 				zone_page_state(zone, i));
 
 	seq_printf(m,
@@ -1478,12 +1482,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 #endif
 	}
 	seq_printf(m,
-		   "\n  all_unreclaimable: %u"
-		   "\n  start_pfn:         %lu"
-		   "\n  inactive_ratio:    %u",
-		   !zone_reclaimable(zone),
+		   "\n  node_unreclaimable:  %u"
+		   "\n  start_pfn:           %lu"
+		   "\n  node_inactive_ratio: %u",
+		   !pgdat_reclaimable(zone->zone_pgdat),
 		   zone->zone_start_pfn,
-		   zone->inactive_ratio);
+		   zone->zone_pgdat->inactive_ratio);
 	seq_putc(m, '\n');
 }
 
@@ -1574,7 +1578,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
 {
 	unsigned long *l = arg;
 	unsigned long off = l - (unsigned long *)m->private;
-
 	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
 	return 0;
 }
diff --git a/mm/workingset.c b/mm/workingset.c
index 5ffba0c0adc6..7820a7e1ca98 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -355,8 +355,8 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 		pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
 						     LRU_ALL_FILE);
 	} else {
-		pages = sum_zone_node_page_state(sc->nid, NR_ACTIVE_FILE) +
-			sum_zone_node_page_state(sc->nid, NR_INACTIVE_FILE);
+		pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
+			node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
 	}
 
 	/*
-- 
cgit v1.3-8-gc7d7


From a5f5f91da6ad647fb0cc7fce0e17343c0d1c5a9a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 28 Jul 2016 15:46:32 -0700
Subject: mm: convert zone_reclaim to node_reclaim

As reclaim is now per-node based, convert zone_reclaim to be
node_reclaim.  It is possible that a node will be reclaimed multiple
times if it has multiple zones but this is unavoidable without caching
all nodes traversed so far.  The documentation and interface to
userspace is the same from a configuration perspective and will will be
similar in behaviour unless the node-local allocation requests were also
limited to lower zones.

Link: http://lkml.kernel.org/r/1467970510-21195-24-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h   | 18 +++++------
 include/linux/swap.h     |  9 +++---
 include/linux/topology.h |  2 +-
 kernel/sysctl.c          |  4 +--
 mm/internal.h            |  8 ++---
 mm/khugepaged.c          |  4 +--
 mm/page_alloc.c          | 24 ++++++++++-----
 mm/vmscan.c              | 77 ++++++++++++++++++++++++------------------------
 8 files changed, 77 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e3d6d42722a0..e19c081c794e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -372,14 +372,6 @@ struct zone {
 	unsigned long		*pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
 
-#ifdef CONFIG_NUMA
-	/*
-	 * zone reclaim becomes active if more unmapped pages exist.
-	 */
-	unsigned long		min_unmapped_pages;
-	unsigned long		min_slab_pages;
-#endif /* CONFIG_NUMA */
-
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
@@ -525,7 +517,6 @@ struct zone {
 } ____cacheline_internodealigned_in_smp;
 
 enum zone_flags {
-	ZONE_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
 	ZONE_FAIR_DEPLETED,		/* fair zone policy batch depleted */
 };
 
@@ -540,6 +531,7 @@ enum pgdat_flags {
 	PGDAT_WRITEBACK,		/* reclaim scanning has recently found
 					 * many pages under writeback
 					 */
+	PGDAT_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
 };
 
 static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -688,6 +680,14 @@ typedef struct pglist_data {
 	 */
 	unsigned long		totalreserve_pages;
 
+#ifdef CONFIG_NUMA
+	/*
+	 * zone reclaim becomes active if more unmapped pages exist.
+	 */
+	unsigned long		min_unmapped_pages;
+	unsigned long		min_slab_pages;
+#endif /* CONFIG_NUMA */
+
 	/* Write-intensive fields used by page reclaim */
 	ZONE_PADDING(_pad1_)
 	spinlock_t		lru_lock;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2a23ddc96edd..b17cc4830fa6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -326,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
 #ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
+extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
 extern int sysctl_min_slab_ratio;
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
 #else
-#define zone_reclaim_mode 0
-static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+#define node_reclaim_mode 0
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+				unsigned int order)
 {
 	return 0;
 }
diff --git a/include/linux/topology.h b/include/linux/topology.h
index afce69296ac0..cb0775e1ee4b 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
 /*
  * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
  * (in whatever arch specific measurement units returned by node_distance())
- * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
  * on nodes within this distance.
  */
 #define RECLAIM_DISTANCE 30
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35f0dcb1cb4f..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_NUMA
 	{
 		.procname	= "zone_reclaim_mode",
-		.data		= &zone_reclaim_mode,
-		.maxlen		= sizeof(zone_reclaim_mode),
+		.data		= &node_reclaim_mode,
+		.maxlen		= sizeof(node_reclaim_mode),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
diff --git a/mm/internal.h b/mm/internal.h
index 2f80d0343c56..1e21b2d3838d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -433,10 +433,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
-#define ZONE_RECLAIM_NOSCAN	-2
-#define ZONE_RECLAIM_FULL	-1
-#define ZONE_RECLAIM_SOME	0
-#define ZONE_RECLAIM_SUCCESS	1
+#define NODE_RECLAIM_NOSCAN	-2
+#define NODE_RECLAIM_FULL	-1
+#define NODE_RECLAIM_SOME	0
+#define NODE_RECLAIM_SUCCESS	1
 
 extern int hwpoison_filter(struct page *p);
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d03b14a6ef5e..d1423d790f6d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -672,10 +672,10 @@ static bool khugepaged_scan_abort(int nid)
 	int i;
 
 	/*
-	 * If zone_reclaim_mode is disabled, then no extra effort is made to
+	 * If node_reclaim_mode is disabled, then no extra effort is made to
 	 * allocate memory locally.
 	 */
-	if (!zone_reclaim_mode)
+	if (!node_reclaim_mode)
 		return false;
 
 	/* If there is a count for this node already, it must be acceptable */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f2c56a13b065..c9d1720c58a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2942,16 +2942,16 @@ zonelist_scan:
 			if (alloc_flags & ALLOC_NO_WATERMARKS)
 				goto try_this_zone;
 
-			if (zone_reclaim_mode == 0 ||
+			if (node_reclaim_mode == 0 ||
 			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
 				continue;
 
-			ret = zone_reclaim(zone, gfp_mask, order);
+			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
 			switch (ret) {
-			case ZONE_RECLAIM_NOSCAN:
+			case NODE_RECLAIM_NOSCAN:
 				/* did not scan */
 				continue;
-			case ZONE_RECLAIM_FULL:
+			case NODE_RECLAIM_FULL:
 				/* scanned but unreclaimable */
 				continue;
 			default:
@@ -5948,9 +5948,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
 		zone->node = nid;
-		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+		pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
 						/ 100;
-		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+		pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		zone->zone_pgdat = pgdat;
@@ -6922,6 +6922,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
+	struct pglist_data *pgdat;
 	struct zone *zone;
 	int rc;
 
@@ -6929,8 +6930,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 	if (rc)
 		return rc;
 
+	for_each_online_pgdat(pgdat)
+		pgdat->min_slab_pages = 0;
+
 	for_each_zone(zone)
-		zone->min_unmapped_pages = (zone->managed_pages *
+		zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
 				sysctl_min_unmapped_ratio) / 100;
 	return 0;
 }
@@ -6938,6 +6942,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
+	struct pglist_data *pgdat;
 	struct zone *zone;
 	int rc;
 
@@ -6945,8 +6950,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
 	if (rc)
 		return rc;
 
+	for_each_online_pgdat(pgdat)
+		pgdat->min_slab_pages = 0;
+
 	for_each_zone(zone)
-		zone->min_slab_pages = (zone->managed_pages *
+		zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
 				sysctl_min_slab_ratio) / 100;
 	return 0;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 31edd7776289..1013f37cd815 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3565,12 +3565,12 @@ module_init(kswapd_init)
 
 #ifdef CONFIG_NUMA
 /*
- * Zone reclaim mode
+ * Node reclaim mode
  *
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
  * the watermarks.
  */
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
 
 #define RECLAIM_OFF 0
 #define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
@@ -3578,14 +3578,14 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_UNMAP (1<<2)	/* Unmap pages during reclaim */
 
 /*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of
  * a zone.
  */
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
 
 /*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
  * occur.
  */
 int sysctl_min_unmapped_ratio = 1;
@@ -3611,7 +3611,7 @@ static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
 }
 
 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
 {
 	unsigned long nr_pagecache_reclaimable;
 	unsigned long delta = 0;
@@ -3622,14 +3622,14 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
 	 * pages like swapcache and node_unmapped_file_pages() provides
 	 * a better estimate
 	 */
-	if (zone_reclaim_mode & RECLAIM_UNMAP)
-		nr_pagecache_reclaimable = node_page_state(zone->zone_pgdat, NR_FILE_PAGES);
+	if (node_reclaim_mode & RECLAIM_UNMAP)
+		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
 	else
-		nr_pagecache_reclaimable = node_unmapped_file_pages(zone->zone_pgdat);
+		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
 
 	/* If we can't clean pages, remove dirty pages from consideration */
-	if (!(zone_reclaim_mode & RECLAIM_WRITE))
-		delta += node_page_state(zone->zone_pgdat, NR_FILE_DIRTY);
+	if (!(node_reclaim_mode & RECLAIM_WRITE))
+		delta += node_page_state(pgdat, NR_FILE_DIRTY);
 
 	/* Watch for any possible underflows due to delta */
 	if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3639,23 +3639,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
 }
 
 /*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
  */
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 {
 	/* Minimum pages needed in order to stay on node */
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
+	int classzone_idx = gfp_zone(gfp_mask);
 	struct scan_control sc = {
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
 		.order = order,
-		.priority = ZONE_RECLAIM_PRIORITY,
-		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-		.may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+		.priority = NODE_RECLAIM_PRIORITY,
+		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
-		.reclaim_idx = zone_idx(zone),
+		.reclaim_idx = classzone_idx,
 	};
 
 	cond_resched();
@@ -3669,13 +3670,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
 		/*
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
 		 */
 		do {
-			shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
+			shrink_node(pgdat, &sc, classzone_idx);
 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 
@@ -3685,49 +3686,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	return sc.nr_reclaimed >= nr_pages;
 }
 
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 {
-	int node_id;
 	int ret;
 
 	/*
-	 * Zone reclaim reclaims unmapped file backed pages and
+	 * Node reclaim reclaims unmapped file backed pages and
 	 * slab pages if we are over the defined limits.
 	 *
 	 * A small portion of unmapped file backed pages is needed for
 	 * file I/O otherwise pages read by file I/O will be immediately
-	 * thrown out if the zone is overallocated. So we do not reclaim
-	 * if less than a specified percentage of the zone is used by
+	 * thrown out if the node is overallocated. So we do not reclaim
+	 * if less than a specified percentage of the node is used by
 	 * unmapped file backed pages.
 	 */
-	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
-	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
-		return ZONE_RECLAIM_FULL;
+	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+	    sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+		return NODE_RECLAIM_FULL;
 
-	if (!pgdat_reclaimable(zone->zone_pgdat))
-		return ZONE_RECLAIM_FULL;
+	if (!pgdat_reclaimable(pgdat))
+		return NODE_RECLAIM_FULL;
 
 	/*
 	 * Do not scan if the allocation should not be delayed.
 	 */
 	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
-		return ZONE_RECLAIM_NOSCAN;
+		return NODE_RECLAIM_NOSCAN;
 
 	/*
-	 * Only run zone reclaim on the local zone or on zones that do not
+	 * Only run node reclaim on the local node or on nodes that do not
 	 * have associated processors. This will favor the local processor
 	 * over remote processors and spread off node memory allocations
 	 * as wide as possible.
 	 */
-	node_id = zone_to_nid(zone);
-	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-		return ZONE_RECLAIM_NOSCAN;
+	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+		return NODE_RECLAIM_NOSCAN;
 
-	if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
-		return ZONE_RECLAIM_NOSCAN;
+	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+		return NODE_RECLAIM_NOSCAN;
 
-	ret = __zone_reclaim(zone, gfp_mask, order);
-	clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+	ret = __node_reclaim(pgdat, gfp_mask, order);
+	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
 
 	if (!ret)
 		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
-- 
cgit v1.3-8-gc7d7


From 11db04864336f20e19e16b64ade781eeefc3f6d3 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 28 Jul 2016 15:48:11 -0700
Subject: mm: cleanup ifdef guards for vmem_altmap

Now that ZONE_DEVICE depends on SPARSEMEM_VMEMMAP we can simplify some
ifdef guards to just ZONE_DEVICE.

Link: http://lkml.kernel.org/r/146687646788.39261.8020536391978771940.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memremap.h | 2 +-
 kernel/memremap.c        | 8 --------
 2 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bcaa634139a9..93416196ba64 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -26,7 +26,7 @@ struct vmem_altmap {
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
 
-#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
+#ifdef CONFIG_ZONE_DEVICE
 struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
 #else
 static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..ddb3247a872a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -308,12 +308,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	if (is_ram == REGION_INTERSECTS)
 		return __va(res->start);
 
-	if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
-		dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
-				__func__);
-		return ERR_PTR(-ENXIO);
-	}
-
 	if (!ref)
 		return ERR_PTR(-EINVAL);
 
@@ -401,7 +395,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
 	altmap->alloc -= nr_pfns;
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
 struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 {
 	/*
@@ -427,5 +420,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 
 	return pgmap ? pgmap->altmap : NULL;
 }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 #endif /* CONFIG_ZONE_DEVICE */
-- 
cgit v1.3-8-gc7d7


From d30dd8be06a5ae640766b20ea9ae288832bd12ac Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 28 Jul 2016 15:48:14 -0700
Subject: mm: track NR_KERNEL_STACK in KiB instead of number of stacks

Currently, NR_KERNEL_STACK tracks the number of kernel stacks in a zone.
This only makes sense if each kernel stack exists entirely in one zone,
and allowing vmapped stacks could break this assumption.

Since frv has THREAD_SIZE < PAGE_SIZE, we need to track kernel stack
allocations in a unit that divides both THREAD_SIZE and PAGE_SIZE on all
architectures.  Keep it simple and use KiB.

Link: http://lkml.kernel.org/r/083c71e642c5fa5f1b6898902e1b2db7b48940d4.1468523549.git.luto@kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    | 3 +--
 fs/proc/meminfo.c      | 2 +-
 include/linux/mmzone.h | 2 +-
 kernel/fork.c          | 3 ++-
 mm/page_alloc.c        | 3 +--
 5 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 264cc214c4df..29cd96661b30 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -124,8 +124,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 		       nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
 		       nid, K(i.sharedram),
-		       nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK) *
-				THREAD_SIZE / 1024,
+		       nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
 		       nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
 		       nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
 		       nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c1fdcc1a907a..09e18fdf61e5 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -147,7 +147,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 				global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
 		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
+		global_page_state(NR_KERNEL_STACK_KB),
 		K(global_page_state(NR_PAGETABLE)),
 #ifdef CONFIG_QUICKLIST
 		K(quicklist_total_size()),
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ca0fbc483441..f2e4e90621ec 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -121,7 +121,7 @@ enum zone_stat_item {
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
-	NR_KERNEL_STACK,
+	NR_KERNEL_STACK_KB,	/* measured in KiB */
 	/* Second 128 byte cacheline */
 	NR_BOUNCE,
 #if IS_ENABLED(CONFIG_ZSMALLOC)
diff --git a/kernel/fork.c b/kernel/fork.c
index de21f25e0d2c..af3637e0ee52 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -225,7 +225,8 @@ static void account_kernel_stack(unsigned long *stack, int account)
 {
 	struct zone *zone = page_zone(virt_to_page(stack));
 
-	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+	mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
+			    THREAD_SIZE / 1024 * account);
 }
 
 void free_task(struct task_struct *tsk)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dfdb608f7b3d..c281125b2349 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4359,8 +4359,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
 			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
-			zone_page_state(zone, NR_KERNEL_STACK) *
-				THREAD_SIZE / 1024,
+			zone_page_state(zone, NR_KERNEL_STACK_KB),
 			K(zone_page_state(zone, NR_PAGETABLE)),
 			K(zone_page_state(zone, NR_BOUNCE)),
 			K(free_pcp),
-- 
cgit v1.3-8-gc7d7


From efdc94907977d2db84b4b00cb9bd98ca011f6819 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 28 Jul 2016 15:48:17 -0700
Subject: mm: fix memcg stack accounting for sub-page stacks

We should account for stacks regardless of stack size, and we need to
account in sub-page units if THREAD_SIZE < PAGE_SIZE.  Change the units
to kilobytes and Move it into account_kernel_stack().

Fixes: 12580e4b54ba8 ("mm: memcontrol: report kernel stack usage in cgroup2 memory.stat")
Link: http://lkml.kernel.org/r/9b5314e3ee5eda61b0317ec1563768602c1ef438.1468523549.git.luto@kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  2 +-
 kernel/fork.c              | 19 ++++++++-----------
 mm/memcontrol.c            |  2 +-
 3 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5147e650287a..5d8ca6e02e39 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -52,7 +52,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
 	/* default hierarchy stats */
-	MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS,
+	MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS,
 	MEMCG_SLAB_RECLAIMABLE,
 	MEMCG_SLAB_UNRECLAIMABLE,
 	MEMCG_SOCK,
diff --git a/kernel/fork.c b/kernel/fork.c
index af3637e0ee52..52e725d4a866 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 					     THREAD_SIZE_ORDER);
 
-	if (page)
-		memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-					    1 << THREAD_SIZE_ORDER);
-
 	return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_stack(unsigned long *stack)
 {
-	struct page *page = virt_to_page(stack);
-
-	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-				    -(1 << THREAD_SIZE_ORDER));
-	__free_pages(page, THREAD_SIZE_ORDER);
+	__free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -223,10 +215,15 @@ static struct kmem_cache *mm_cachep;
 
 static void account_kernel_stack(unsigned long *stack, int account)
 {
-	struct zone *zone = page_zone(virt_to_page(stack));
+	/* All stack pages are in the same zone and belong to the same memcg. */
+	struct page *first_page = virt_to_page(stack);
 
-	mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
+	mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 			    THREAD_SIZE / 1024 * account);
+
+	memcg_kmem_update_page_stat(
+		first_page, MEMCG_KERNEL_STACK_KB,
+		account * (THREAD_SIZE / 1024));
 }
 
 void free_task(struct task_struct *tsk)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13be30c3ea78..c265212bec8c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5171,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "file %llu\n",
 		   (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
 	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
 	seq_printf(m, "slab %llu\n",
 		   (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
 			 stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
-- 
cgit v1.3-8-gc7d7


From 8b70ca65616b3588ea1907e87f0df6d2530350df Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 28 Jul 2016 15:48:23 -0700
Subject: printk: when dumping regs, show the stack, not thread_info

We currently show:

  task: <current> ti: <current_thread_info()> task.ti: <task_thread_info(current)>"

"ti" and "task.ti" are redundant, and neither is actually what we want
to show, which the the base of the thread stack.  Change the display to
show the stack pointer explicitly.

Link: http://lkml.kernel.org/r/543ac5bd66ff94000a57a02e11af7239571a3055.1468523549.git.luto@kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 60cdf6386763..d4de33934dac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
 {
 	dump_stack_print_info(log_lvl);
 
-	printk("%stask: %p ti: %p task.ti: %p\n",
-	       log_lvl, current, current_thread_info(),
-	       task_thread_info(current));
+	printk("%stask: %p task.stack: %p\n",
+	       log_lvl, current, task_stack_page(current));
 }
 
 #endif
-- 
cgit v1.3-8-gc7d7


From 784bdf3bb694b256fcd6120b93e8947a84249a3a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Jul 2016 16:32:30 +0200
Subject: futex: Assume all mappings are private on !MMU systems

To quote Rick why there is no need for shared mapping on !MMU systems:

|With MMU, shared futex keys need to identify the physical backing for
|a memory address because it may be mapped at different addresses in
|different processes (or even multiple times in the same process).
|Without MMU this cannot happen. You only have physical addresses. So
|the "private futex" behavior of using the virtual address as the key
|is always correct (for both shared and private cases) on nommu
|systems.

This patch disables the FLAGS_SHARED in a way that allows the compiler to
remove that code.

[bigeasy: Added changelog ]
Reported-by: Rich Felker <dalias@libc.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20160729143230.GA21715@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 33664f70e2d2..46cb3a301bc1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled;
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
  */
-#define FLAGS_SHARED		0x01
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED		0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED		0x00
+#endif
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
 
@@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key)
 	if (!key->both.ptr)
 		return;
 
+	/*
+	 * On MMU less systems futexes are always "private" as there is no per
+	 * process address space. We need the smp wmb nevertheless - yes,
+	 * arch/blackfin has MMU less SMP ...
+	 */
+	if (!IS_ENABLED(CONFIG_MMU)) {
+		smp_mb(); /* explicit smp_mb(); (B) */
+		return;
+	}
+
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
 		ihold(key->shared.inode); /* implies smp_mb(); (B) */
@@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key)
 		return;
 	}
 
+	if (!IS_ENABLED(CONFIG_MMU))
+		return;
+
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
 		iput(key->shared.inode);
-- 
cgit v1.3-8-gc7d7


From e3f91083facb792dc8d8fd0a59639e4d6e7c0c8f Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Sat, 23 Jul 2016 14:42:37 +0530
Subject: jump_label: Make it possible for arches to invoke jump_label_init()
 earlier

Some arches (powerpc at least) would like to invoke jump_label_init()
much earlier in boot. So check static_key_initialized in order to make
sure this function runs only once.

LGTM-by: Ingo (http://marc.info/?l=linux-kernel&m=144049104329961&w=2)
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 kernel/jump_label.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0dbea887d625..2d693be967df 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -235,6 +235,9 @@ void __init jump_label_init(void)
 	struct static_key *key = NULL;
 	struct jump_entry *iter;
 
+	if (static_key_initialized)
+		return;
+
 	jump_label_lock();
 	jump_label_sort_entries(iter_start, iter_stop);
 
-- 
cgit v1.3-8-gc7d7


From 0d87d7ec22a0879d3926faa4f4f4412a5dee1fba Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 1 Aug 2016 13:49:29 -0700
Subject: perf/core: Change log level for duration warning to KERN_INFO

When the perf interrupt handler exceeds a threshold warning messages
are displayed on console:

  [12739.31793] perf interrupt took too long (2504 > 2500), lowering kernel.perf_event_max_sample_rate to 50000
  [71340.165065] perf interrupt took too long (5005 > 5000), lowering kernel.perf_event_max_sample_rate to 25000

Many customers and users are confused by the message wondering if
something is wrong or they need to take action to fix a problem.
Since a user can not do anything to fix the issue, the message is really
more informational than a warning. Adjust the log level accordingly.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1470084569-438-1-git-send-email-dsa@cumulusnetworks.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 356a6c7cb52a..a19550d80ab1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -448,7 +448,7 @@ static u64 __report_allowed;
 
 static void perf_duration_warn(struct irq_work *w)
 {
-	printk_ratelimited(KERN_WARNING
+	printk_ratelimited(KERN_INFO
 		"perf: interrupt took too long (%lld > %lld), lowering "
 		"kernel.perf_event_max_sample_rate to %d\n",
 		__report_avg, __report_allowed,
-- 
cgit v1.3-8-gc7d7


From 377ccbb483738f84400ddf5840c7dd8825716985 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 28 Jul 2016 22:30:43 -0400
Subject: Makefile: Mute warning for __builtin_return_address(>0) for tracing
 only

With the latest gcc compilers, they give a warning if
__builtin_return_address() parameter is greater than 0. That is because if
it is used by a function called by a top level function (or in the case of
the kernel, by assembly), it can try to access stack frames outside the
stack and crash the system.

The tracing system uses __builtin_return_address() of up to 2! But it is
well aware of the dangers that it may have, and has even added precautions
to protect against it (see the thunk code in arch/x86/entry/thunk*.S)

Linus originally added KBUILD_CFLAGS that would suppress the warning for the
entire kernel, as simply adding KBUILD_CFLAGS to the tracing directory
wouldn't work. The tracing directory plays a bit with the CFLAGS and
requires a little more logic.

This adds that special logic to only suppress the warning for the tracing
directory. If it is used anywhere else outside of tracing, the warning will
still be triggered.

Link: http://lkml.kernel.org/r/20160728223043.51996267@grimm.local.home

Tested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Makefile              | 1 -
 kernel/trace/Makefile | 4 ++++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Makefile b/Makefile
index 393b6159ae92..d384848478b9 100644
--- a/Makefile
+++ b/Makefile
@@ -620,7 +620,6 @@ include arch/$(SRCARCH)/Makefile
 
 KBUILD_CFLAGS	+= $(call cc-option,-fno-delete-null-pointer-checks,)
 KBUILD_CFLAGS	+= $(call cc-disable-warning,maybe-uninitialized,)
-KBUILD_CFLAGS	+= $(call cc-disable-warning,frame-address,)
 
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS	+= -Os
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979e7bfbde7a..d0a1617b52b4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,4 +1,8 @@
 
+# We are fully aware of the dangers of __builtin_return_address()
+FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
+KBUILD_CFLAGS += $(FRAME_CFLAGS)
+
 # Do not instrument the tracer itself:
 
 ifdef CONFIG_FUNCTION_TRACER
-- 
cgit v1.3-8-gc7d7


From 47c1856971dd05cac730f70d073518da021b2e5c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 29 Jun 2016 19:55:59 -0500
Subject: tracing: Fix use-after-free in hist_unreg_all/hist_enable_unreg_all

While running tools/testing/selftests test suite with KASAN, Dmitry
Vyukov hit the following use-after-free report:

  ==================================================================
  BUG: KASAN: use-after-free in hist_unreg_all+0x1a1/0x1d0 at addr
  ffff880031632cc0
  Read of size 8 by task ftracetest/7413
  ==================================================================
  BUG kmalloc-128 (Not tainted): kasan: bad access detected
  ------------------------------------------------------------------

This fixes the problem, along with the same problem in
hist_enable_unreg_all().

Link: http://lkml.kernel.org/r/c3d05b79e42555b6e36a3a99aae0e37315ee5304.1467247517.git.tom.zanussi@linux.intel.com

Cc: Dmitry Vyukov <dvyukov@google.com>
[Copied Steve's hist_enable_unreg_all() fix to hist_unreg_all()]
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0c05b8a99806..19ae135120a3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1500,9 +1500,9 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
 
 static void hist_unreg_all(struct trace_event_file *file)
 {
-	struct event_trigger_data *test;
+	struct event_trigger_data *test, *n;
 
-	list_for_each_entry_rcu(test, &file->triggers, list) {
+	list_for_each_entry_safe(test, n, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
 			list_del_rcu(&test->list);
 			trace_event_trigger_enable_disable(file, 0);
@@ -1699,9 +1699,9 @@ hist_enable_get_trigger_ops(char *cmd, char *param)
 
 static void hist_enable_unreg_all(struct trace_event_file *file)
 {
-	struct event_trigger_data *test;
+	struct event_trigger_data *test, *n;
 
-	list_for_each_entry_rcu(test, &file->triggers, list) {
+	list_for_each_entry_safe(test, n, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) {
 			list_del_rcu(&test->list);
 			update_cond_flag(file);
-- 
cgit v1.3-8-gc7d7


From 7522c03ae307e657114ff909aec650304371a134 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Wed, 29 Jun 2016 19:56:00 -0500
Subject: tracing: Fix use-after-free in hist_register_trigger()

This fixes a use-after-free case flagged by KASAN; make sure the test
happens before the potential free in this case.

Link: http://lkml.kernel.org/r/48fd74ab61bebd7dca9714386bb47d7c5ccd6a7b.1467247517.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 19ae135120a3..f3a960ed75a1 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1441,6 +1441,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 		goto out;
 	}
 
+	if (hist_data->attrs->pause)
+		data->paused = true;
+
 	if (named_data) {
 		destroy_hist_data(data->private_data);
 		data->private_data = named_data->private_data;
@@ -1448,9 +1451,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 		data->ops = &event_hist_trigger_named_ops;
 	}
 
-	if (hist_data->attrs->pause)
-		data->paused = true;
-
 	if (data->ops->init) {
 		ret = data->ops->init(data->ops, data);
 		if (ret < 0)
-- 
cgit v1.3-8-gc7d7


From 61e96496d3c949701a48b908f99f4ed891cd1101 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 2 Aug 2016 14:03:44 -0700
Subject: task_work: use READ_ONCE/lockless_dereference, avoid pi_lock if
 !task_works

Change task_work_cancel() to use lockless_dereference(), this is what
the code really wants but we didn't have this helper when it was
written.

Also add the fast-path task->task_works == NULL check, in the likely
case this task has no pending works and we can avoid
spin_lock(task->pi_lock).

While at it, change other users of ACCESS_ONCE() to use READ_ONCE().

Link: http://lkml.kernel.org/r/20160610150042.GA13868@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/task_work.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/task_work.c b/kernel/task_work.c
index 6ab4842b00e8..d513051fcca2 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
 	struct callback_head *head;
 
 	do {
-		head = ACCESS_ONCE(task->task_works);
+		head = READ_ONCE(task->task_works);
 		if (unlikely(head == &work_exited))
 			return -ESRCH;
 		work->next = head;
@@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
 	struct callback_head **pprev = &task->task_works;
 	struct callback_head *work;
 	unsigned long flags;
+
+	if (likely(!task->task_works))
+		return NULL;
 	/*
 	 * If cmpxchg() fails we continue without updating pprev.
 	 * Either we raced with task_work_add() which added the
@@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
 	 * we raced with task_work_run(), *pprev == NULL/exited.
 	 */
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
-	while ((work = ACCESS_ONCE(*pprev))) {
-		smp_read_barrier_depends();
+	while ((work = lockless_dereference(*pprev))) {
 		if (work->func != func)
 			pprev = &work->next;
 		else if (cmpxchg(pprev, work, work->next) == work)
@@ -95,7 +97,7 @@ void task_work_run(void)
 		 * work_exited unless the list is empty.
 		 */
 		do {
-			work = ACCESS_ONCE(task->task_works);
+			work = READ_ONCE(task->task_works);
 			head = !work && (task->flags & PF_EXITING) ?
 				&work_exited : NULL;
 		} while (cmpxchg(&task->task_works, work, head) != work);
-- 
cgit v1.3-8-gc7d7


From 9d5059c959ac739dbf837cec14586e58e7a67292 Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@osg.samsung.com>
Date: Tue, 2 Aug 2016 14:03:47 -0700
Subject: dynamic_debug: only add header when used

kernel.h header doesn't directly use dynamic debug, instead we can
include it in module.c (which used it via kernel.h).  printk.h only uses
it if CONFIG_DYNAMIC_DEBUG is on, changing the inclusion to only happen
in that case.

Link: http://lkml.kernel.org/r/1468429793-16917-1-git-send-email-luisbg@osg.samsung.com
[luisbg@osg.samsung.com: include dynamic_debug.h in drb_int.h]
  Link: http://lkml.kernel.org/r/1468447828-18558-2-git-send-email-luisbg@osg.samsung.com
Signed-off-by: Luis de Bethencourt <luisbg@osg.samsung.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/drbd/drbd_actlog.c | 1 -
 drivers/block/drbd/drbd_int.h    | 1 +
 include/linux/kernel.h           | 1 -
 include/linux/printk.h           | 3 ++-
 kernel/module.c                  | 1 +
 5 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 0a1aaf8c24c4..2d3d50ab74bf 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -27,7 +27,6 @@
 #include <linux/crc32c.h>
 #include <linux/drbd.h>
 #include <linux/drbd_limits.h>
-#include <linux/dynamic_debug.h>
 #include "drbd_int.h"
 
 
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 7b54354976a5..4cb8f21ff4ef 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -41,6 +41,7 @@
 #include <linux/backing-dev.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
+#include <linux/dynamic_debug.h>
 #include <net/tcp.h>
 #include <linux/lru_cache.h>
 #include <linux/prefetch.h>
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index c42082112ec8..d96a6118d26a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -11,7 +11,6 @@
 #include <linux/log2.h>
 #include <linux/typecheck.h>
 #include <linux/printk.h>
-#include <linux/dynamic_debug.h>
 #include <asm/byteorder.h>
 #include <uapi/linux/kernel.h>
 
diff --git a/include/linux/printk.h b/include/linux/printk.h
index f136b22c7772..987c65ed34e5 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -289,10 +289,11 @@ extern asmlinkage void dump_stack(void) __cold;
 	no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
 #endif
 
-#include <linux/dynamic_debug.h>
 
 /* If you are writing a driver, please use dev_dbg instead */
 #if defined(CONFIG_DYNAMIC_DEBUG)
+#include <linux/dynamic_debug.h>
+
 /* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */
 #define pr_debug(fmt, ...) \
 	dynamic_pr_debug(fmt, ##__VA_ARGS__)
diff --git a/kernel/module.c b/kernel/module.c
index 5f71aa63ed2a..a0f48b8b00da 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,6 +60,7 @@
 #include <linux/jump_label.h>
 #include <linux/pfn.h>
 #include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
 #include <uapi/linux/module.h>
 #include "module-internal.h"
 
-- 
cgit v1.3-8-gc7d7


From bebca05281d039e4144e1c51f402fd1d5f54b5e2 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 2 Aug 2016 14:03:50 -0700
Subject: printk: do not include interrupt.h

A trivial cosmetic change: interrupt.h header is redundant since commit
6b898c07cb1d ("console: use might_sleep in console_lock").

Link: http://lkml.kernel.org/r/20160620132847.21930-1-sergey.senozhatsky@gmail.com
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d4de33934dac..09af62e71fee 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -26,7 +26,6 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/interrupt.h>			/* For in_interrupt() */
 #include <linux/delay.h>
 #include <linux/smp.h>
 #include <linux/security.h>
-- 
cgit v1.3-8-gc7d7


From 874f9c7da9a4acbc1b9e12ca722579fb50e4d142 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 2 Aug 2016 14:03:53 -0700
Subject: printk: create pr_<level> functions

Using functions instead of macros can reduce overall code size by
eliminating unnecessary "KERN_SOH<digit>" prefixes from format strings.

  defconfig x86-64:

  $ size vmlinux*
     text    data     bss      dec     hex  filename
  10193570 4331464 1105920 15630954  ee826a vmlinux.new
  10192623 4335560 1105920 15634103  ee8eb7 vmlinux.old

As the return value are unimportant and unused in the kernel tree, these
new functions return void.

Miscellanea:

 - change pr_<level> macros to call new __pr_<level> functions
 - change vprintk_nmi and vprintk_default to add LOGLEVEL_<level> argument

[akpm@linux-foundation.org: fix LOGLEVEL_INFO, per Joe]
Link: http://lkml.kernel.org/r/e16cc34479dfefcae37c98b481e6646f0f69efc3.1466718827.git.joe@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h   | 48 +++++++++++++++++++++++++++++++++---------------
 kernel/printk/internal.h | 16 ++++++++++------
 kernel/printk/nmi.c      | 13 +++++++++++--
 kernel/printk/printk.c   | 27 ++++++++++++++++++++++++---
 4 files changed, 78 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 987c65ed34e5..c2158f0f1499 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -257,21 +257,39 @@ extern asmlinkage void dump_stack(void) __cold;
  * and other debug macros are compiled out unless either DEBUG is defined
  * or CONFIG_DYNAMIC_DEBUG is set.
  */
-#define pr_emerg(fmt, ...) \
-	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_alert(fmt, ...) \
-	printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_crit(fmt, ...) \
-	printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_err(fmt, ...) \
-	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_warning(fmt, ...) \
-	printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_warn pr_warning
-#define pr_notice(fmt, ...) \
-	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_info(fmt, ...) \
-	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+
+#ifdef CONFIG_PRINTK
+
+asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...);
+asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...);
+asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...);
+asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...);
+asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...);
+asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...);
+asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...);
+
+#define pr_emerg(fmt, ...)	__pr_emerg(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert(fmt, ...)	__pr_alert(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit(fmt, ...)	__pr_crit(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err(fmt, ...)	__pr_err(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn(fmt, ...)	__pr_warn(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_notice(fmt, ...)	__pr_notice(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info(fmt, ...)	__pr_info(pr_fmt(fmt), ##__VA_ARGS__)
+
+#else
+
+#define pr_emerg(fmt, ...)	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert(fmt, ...)	printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit(fmt, ...)	printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err(fmt, ...)	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn(fmt, ...)	printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_notice(fmt, ...)	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info(fmt, ...)	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+
+#endif
+
+#define pr_warning pr_warn
+
 /*
  * Like KERN_CONT, pr_cont() should only be used when continuing
  * a line with no newline ('\n') enclosed. Otherwise it defaults
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7fd2838fa417..5d4505f30083 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,9 +16,11 @@
  */
 #include <linux/percpu.h>
 
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt,
+					    va_list args);
 
-int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
+__printf(2, 0)
+int vprintk_default(int level, const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
@@ -31,9 +33,10 @@ extern raw_spinlock_t logbuf_lock;
  * via per-CPU variable.
  */
 DECLARE_PER_CPU(printk_func_t, printk_func);
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+__printf(2, 0)
+static inline int vprintk_func(int level, const char *fmt, va_list args)
 {
-	return this_cpu_read(printk_func)(fmt, args);
+	return this_cpu_read(printk_func)(level, fmt, args);
 }
 
 extern atomic_t nmi_message_lost;
@@ -44,9 +47,10 @@ static inline int get_nmi_message_lost(void)
 
 #else /* CONFIG_PRINTK_NMI */
 
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+__printf(2, 0)
+static inline int vprintk_func(int level, const char *fmt, va_list args)
 {
-	return vprintk_default(fmt, args);
+	return vprintk_default(level, fmt, args);
 }
 
 static inline int get_nmi_message_lost(void)
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index b69eb8a2876f..bc3eeb1ae6da 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
  * one writer running. But the buffer might get flushed from another
  * CPU, so we need to be careful.
  */
-static int vprintk_nmi(const char *fmt, va_list args)
+static int vprintk_nmi(int level, const char *fmt, va_list args)
 {
 	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
 	int add = 0;
@@ -79,7 +79,16 @@ again:
 	if (!len)
 		smp_rmb();
 
-	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+	if (level != LOGLEVEL_DEFAULT) {
+		add = snprintf(s->buffer + len, sizeof(s->buffer) - len,
+				KERN_SOH "%c", '0' + level);
+		add += vsnprintf(s->buffer + len + add,
+				 sizeof(s->buffer) - len - add,
+				 fmt, args);
+	} else {
+		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
+				fmt, args);
+	}
 
 	/*
 	 * Do it once again if the buffer has been flushed in the meantime.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 09af62e71fee..d2accf2f4448 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1801,7 +1801,28 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-int vprintk_default(const char *fmt, va_list args)
+#ifdef CONFIG_PRINTK
+#define define_pr_level(func, loglevel)				\
+asmlinkage __visible void func(const char *fmt, ...)		\
+{								\
+	va_list args;						\
+								\
+	va_start(args, fmt);					\
+	vprintk_default(loglevel, fmt, args);			\
+	va_end(args);						\
+}								\
+EXPORT_SYMBOL(func)
+
+define_pr_level(__pr_emerg, LOGLEVEL_EMERG);
+define_pr_level(__pr_alert, LOGLEVEL_ALERT);
+define_pr_level(__pr_crit, LOGLEVEL_CRIT);
+define_pr_level(__pr_err, LOGLEVEL_ERR);
+define_pr_level(__pr_warn, LOGLEVEL_WARNING);
+define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
+define_pr_level(__pr_info, LOGLEVEL_INFO);
+#endif
+
+int vprintk_default(int level, const char *fmt, va_list args)
 {
 	int r;
 
@@ -1811,7 +1832,7 @@ int vprintk_default(const char *fmt, va_list args)
 		return r;
 	}
 #endif
-	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+	r = vprintk_emit(0, level, NULL, 0, fmt, args);
 
 	return r;
 }
@@ -1844,7 +1865,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = vprintk_func(fmt, args);
+	r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args);
 	va_end(args);
 
 	return r;
-- 
cgit v1.3-8-gc7d7


From cf7754441c563230ed74096fcd4b8cca49910550 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 2 Aug 2016 14:03:56 -0700
Subject: printk: introduce suppress_message_printing()

Messages' levels and console log level are inspected when the actual
printing occurs, which may provoke console_unlock() and
console_cont_flush() to waste CPU cycles on every message that has
loglevel above the current console_loglevel.

Schematically, console_unlock() does the following:

console_unlock()
{
        ...
        for (;;) {
                ...
                raw_spin_lock_irqsave(&logbuf_lock, flags);
skip:
                msg = log_from_idx(console_idx);

                if (msg->flags & LOG_NOCONS) {
                        ...
                        goto skip;
                }

                level = msg->level;
                len += msg_print_text();                        >> sprintfs
                                                                   memcpy,
                                                                   etc.

                if (nr_ext_console_drivers) {
                        ext_len = msg_print_ext_header();       >> scnprintf
                        ext_len += msg_print_ext_body();        >> scnprintfs
                                                                   etc.
                }
                ...
                raw_spin_unlock(&logbuf_lock);

                call_console_drivers(level, ext_text, ext_len, text, len)
                {
                        if (level >= console_loglevel &&        >> drop the message
                                        !ignore_loglevel)
                                return;

                        console->write(...);
                }

                local_irq_restore(flags);
        }
        ...
}

The thing here is this deferred `level >= console_loglevel' check.  We
are wasting CPU cycles on sprintfs/memcpy/etc.  preparing the messages
that we will eventually drop.

This can be huge when we register a new CON_PRINTBUFFER console, for
instance.  For every such a console register_console() resets the

        console_seq, console_idx, console_prev

and sets a `exclusive console' pointer to replay the log buffer to that
just-registered console.  And there can be a lot of messages to replay,
in the worst case most of which can be dropped after console_loglevel
test.

We know messages' levels long before we call msg_print_text() and
friends, so we can just move console_loglevel check out of
call_console_drivers() and format a new message only if we are sure that
it won't be dropped.

The patch factors out loglevel check into suppress_message_printing()
function and tests message->level and console_loglevel before formatting
functions in console_unlock() and console_cont_flush() are getting
executed.  This improves things not only for exclusive CON_PRINTBUFFER
consoles, but for every console_unlock() that attempts to print a
message of level above the console_loglevel.

Link: http://lkml.kernel.org/r/20160627135012.8229-1-sergey.senozhatsky@gmail.com
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Calvin Owens <calvinowens@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d2accf2f4448..8bdce14254f4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -985,6 +985,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(ignore_loglevel,
 		 "ignore loglevel setting (prints all kernel messages to the console)");
 
+static bool suppress_message_printing(int level)
+{
+	return (level >= console_loglevel && !ignore_loglevel);
+}
+
 #ifdef CONFIG_BOOT_PRINTK_DELAY
 
 static int boot_delay; /* msecs delay after each printk during bootup */
@@ -1014,7 +1019,7 @@ static void boot_delay_msec(int level)
 	unsigned long timeout;
 
 	if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
-		|| (level >= console_loglevel && !ignore_loglevel)) {
+		|| suppress_message_printing(level)) {
 		return;
 	}
 
@@ -1438,8 +1443,6 @@ static void call_console_drivers(int level,
 
 	trace_console(text, len);
 
-	if (level >= console_loglevel && !ignore_loglevel)
-		return;
 	if (!console_drivers)
 		return;
 
@@ -1908,6 +1911,7 @@ static void call_console_drivers(int level,
 static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
 			     bool syslog, char *buf, size_t size) { return 0; }
 static size_t cont_print_text(char *text, size_t size) { return 0; }
+static bool suppress_message_printing(int level) { return false; }
 
 /* Still needs to be defined for users */
 DEFINE_PER_CPU(printk_func_t, printk_func);
@@ -2187,6 +2191,13 @@ static void console_cont_flush(char *text, size_t size)
 	if (!cont.len)
 		goto out;
 
+	if (suppress_message_printing(cont.level)) {
+		cont.cons = cont.len;
+		if (cont.flushed)
+			cont.len = 0;
+		goto out;
+	}
+
 	/*
 	 * We still queue earlier records, likely because the console was
 	 * busy. The earlier ones need to be printed before this one, we
@@ -2290,10 +2301,13 @@ skip:
 			break;
 
 		msg = log_from_idx(console_idx);
-		if (msg->flags & LOG_NOCONS) {
+		level = msg->level;
+		if ((msg->flags & LOG_NOCONS) ||
+				suppress_message_printing(level)) {
 			/*
 			 * Skip record we have buffered and already printed
-			 * directly to the console when we received it.
+			 * directly to the console when we received it, and
+			 * record that has level above the console loglevel.
 			 */
 			console_idx = log_next(console_idx);
 			console_seq++;
@@ -2307,7 +2321,6 @@ skip:
 			goto skip;
 		}
 
-		level = msg->level;
 		len += msg_print_text(msg, console_prev, false,
 				      text + len, sizeof(text) - len);
 		if (nr_ext_console_drivers) {
-- 
cgit v1.3-8-gc7d7


From 40a7d9f5f90681c6d7890b6a07f230bb4afe7e39 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Aug 2016 14:03:59 -0700
Subject: printk: include <asm/sections.h> instead of <asm-generic/sections.h>

asm-generic headers are generic implementations for architecture
specific code and should not be included by common code.  Thus use the
asm/ version of sections.h to get at the linker sections.

Link: http://lkml.kernel.org/r/1468285008-7331-1-git-send-email-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8bdce14254f4..70c66c5ba212 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -47,7 +47,7 @@
 #include <linux/uio.h>
 
 #include <asm/uaccess.h>
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/printk.h>
-- 
cgit v1.3-8-gc7d7


From 750afe7babd117daabebf4855da18e4418ea845e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 2 Aug 2016 14:04:07 -0700
Subject: printk: add kernel parameter to control writes to /dev/kmsg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a "printk.devkmsg" kernel command line parameter which controls how
userspace writes into /dev/kmsg.  It has three options:

 * ratelimit - ratelimit logging from userspace.
 * on  - unlimited logging from userspace
 * off - logging from userspace gets ignored

The default setting is to ratelimit the messages written to it.

This changes the kernel default setting of "on" to "ratelimit" and we do
that because we want to keep userspace spamming /dev/kmsg to sane
levels.  This is especially moot when a small kernel log buffer wraps
around and messages get lost.  So the ratelimiting setting should be a
sane setting where kernel messages should have a bit higher chance of
survival from all the spamming.

It additionally does not limit logging to /dev/kmsg while the system is
booting if we haven't disabled it on the command line.

Furthermore, we can control the logging from a lower priority sysctl
interface - kernel.printk_devkmsg.

That interface will succeed only if printk.devkmsg *hasn't* been
supplied on the command line.  If it has, then printk.devkmsg is a
one-time setting which remains for the duration of the system lifetime.
This "locking" of the setting is to prevent userspace from changing the
logging on us through sysctl(2).

This patch is based on previous patches from Linus and Steven.

[bp@suse.de: fixes]
  Link: http://lkml.kernel.org/r/20160719072344.GC25563@nazgul.tnic
Link: http://lkml.kernel.org/r/20160716061745.15795-3-bp@alien8.de
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Dave Young <dyoung@redhat.com>
Cc: Franck Bui <fbui@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kernel-parameters.txt |   7 ++
 Documentation/sysctl/kernel.txt     |  14 ++++
 include/linux/printk.h              |   9 +++
 kernel/printk/printk.c              | 142 ++++++++++++++++++++++++++++++++++--
 kernel/sysctl.c                     |   7 ++
 5 files changed, 171 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e24aa11e8f8a..b240540e49f2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3173,6 +3173,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Format: <bool>  (1/Y/y=enable, 0/N/n=disable)
 			default: disabled
 
+	printk.devkmsg={on,off,ratelimit}
+			Control writing to /dev/kmsg.
+			on - unlimited logging to /dev/kmsg from userspace
+			off - logging to /dev/kmsg disabled
+			ratelimit - ratelimit the logging
+			Default: ratelimit
+
 	printk.time=	Show timing data prefixed to each printk message line
 			Format: <bool>  (1/Y/y=enable, 0/N/n=disable)
 
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 33204604de6c..ffab8b5caa60 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -764,6 +764,20 @@ send before ratelimiting kicks in.
 
 ==============================================================
 
+printk_devkmsg:
+
+Control the logging to /dev/kmsg from userspace:
+
+ratelimit: default, ratelimited
+on: unlimited logging to /dev/kmsg from userspace
+off: logging to /dev/kmsg disabled
+
+The kernel command line parameter printk.devkmsg= overrides this and is
+a one-time setting until next reboot: once set, it cannot be changed by
+this sysctl interface anymore.
+
+==============================================================
+
 randomize_va_space:
 
 This option can be used to select the type of process address
diff --git a/include/linux/printk.h b/include/linux/printk.h
index c2158f0f1499..8dc155dab3ed 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -61,6 +61,11 @@ static inline void console_verbose(void)
 		console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
 }
 
+/* strlen("ratelimit") + 1 */
+#define DEVKMSG_STR_MAX_SIZE 10
+extern char devkmsg_log_str[];
+struct ctl_table;
+
 struct va_format {
 	const char *fmt;
 	va_list *va;
@@ -175,6 +180,10 @@ extern int printk_delay_msec;
 extern int dmesg_restrict;
 extern int kptr_restrict;
 
+extern int
+devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf,
+			  size_t *lenp, loff_t *ppos);
+
 extern void wake_up_klogd(void);
 
 char *log_buf_addr_get(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 70c66c5ba212..a5ef95ca18c9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -85,6 +85,111 @@ static struct lockdep_map console_lock_dep_map = {
 };
 #endif
 
+enum devkmsg_log_bits {
+	__DEVKMSG_LOG_BIT_ON = 0,
+	__DEVKMSG_LOG_BIT_OFF,
+	__DEVKMSG_LOG_BIT_LOCK,
+};
+
+enum devkmsg_log_masks {
+	DEVKMSG_LOG_MASK_ON             = BIT(__DEVKMSG_LOG_BIT_ON),
+	DEVKMSG_LOG_MASK_OFF            = BIT(__DEVKMSG_LOG_BIT_OFF),
+	DEVKMSG_LOG_MASK_LOCK           = BIT(__DEVKMSG_LOG_BIT_LOCK),
+};
+
+/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
+#define DEVKMSG_LOG_MASK_DEFAULT	0
+
+static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+
+static int __control_devkmsg(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!strncmp(str, "on", 2)) {
+		devkmsg_log = DEVKMSG_LOG_MASK_ON;
+		return 2;
+	} else if (!strncmp(str, "off", 3)) {
+		devkmsg_log = DEVKMSG_LOG_MASK_OFF;
+		return 3;
+	} else if (!strncmp(str, "ratelimit", 9)) {
+		devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+		return 9;
+	}
+	return -EINVAL;
+}
+
+static int __init control_devkmsg(char *str)
+{
+	if (__control_devkmsg(str) < 0)
+		return 1;
+
+	/*
+	 * Set sysctl string accordingly:
+	 */
+	if (devkmsg_log == DEVKMSG_LOG_MASK_ON) {
+		memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+		strncpy(devkmsg_log_str, "on", 2);
+	} else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) {
+		memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+		strncpy(devkmsg_log_str, "off", 3);
+	}
+	/* else "ratelimit" which is set by default. */
+
+	/*
+	 * Sysctl cannot change it anymore. The kernel command line setting of
+	 * this parameter is to force the setting to be permanent throughout the
+	 * runtime of the system. This is a precation measure against userspace
+	 * trying to be a smarta** and attempting to change it up on us.
+	 */
+	devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
+
+	return 0;
+}
+__setup("printk.devkmsg=", control_devkmsg);
+
+char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
+
+int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+			      void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char old_str[DEVKMSG_STR_MAX_SIZE];
+	unsigned int old;
+	int err;
+
+	if (write) {
+		if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
+			return -EINVAL;
+
+		old = devkmsg_log;
+		strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE);
+	}
+
+	err = proc_dostring(table, write, buffer, lenp, ppos);
+	if (err)
+		return err;
+
+	if (write) {
+		err = __control_devkmsg(devkmsg_log_str);
+
+		/*
+		 * Do not accept an unknown string OR a known string with
+		 * trailing crap...
+		 */
+		if (err < 0 || (err + 1 != *lenp)) {
+
+			/* ... and restore old setting. */
+			devkmsg_log = old;
+			strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE);
+
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Number of registered extended console drivers.
  *
@@ -613,6 +718,7 @@ struct devkmsg_user {
 	u64 seq;
 	u32 idx;
 	enum log_flags prev;
+	struct ratelimit_state rs;
 	struct mutex lock;
 	char buf[CONSOLE_EXT_LOG_MAX];
 };
@@ -622,11 +728,24 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
 	char *buf, *line;
 	int level = default_message_loglevel;
 	int facility = 1;	/* LOG_USER */
+	struct file *file = iocb->ki_filp;
+	struct devkmsg_user *user = file->private_data;
 	size_t len = iov_iter_count(from);
 	ssize_t ret = len;
 
-	if (len > LOG_LINE_MAX)
+	if (!user || len > LOG_LINE_MAX)
 		return -EINVAL;
+
+	/* Ignore when user logging is disabled. */
+	if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+		return len;
+
+	/* Ratelimit when not explicitly enabled. */
+	if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) {
+		if (!___ratelimit(&user->rs, current->comm))
+			return ret;
+	}
+
 	buf = kmalloc(len+1, GFP_KERNEL);
 	if (buf == NULL)
 		return -ENOMEM;
@@ -799,19 +918,24 @@ static int devkmsg_open(struct inode *inode, struct file *file)
 	struct devkmsg_user *user;
 	int err;
 
-	/* write-only does not need any file context */
-	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		return 0;
+	if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+		return -EPERM;
 
-	err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
-				       SYSLOG_FROM_READER);
-	if (err)
-		return err;
+	/* write-only does not need any file context */
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+					       SYSLOG_FROM_READER);
+		if (err)
+			return err;
+	}
 
 	user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
 	if (!user)
 		return -ENOMEM;
 
+	ratelimit_default_init(&user->rs);
+	ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);
+
 	mutex_init(&user->lock);
 
 	raw_spin_lock_irq(&logbuf_lock);
@@ -830,6 +954,8 @@ static int devkmsg_release(struct inode *inode, struct file *file)
 	if (!user)
 		return 0;
 
+	ratelimit_state_exit(&user->rs);
+
 	mutex_destroy(&user->lock);
 	kfree(user);
 	return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 53954631a4e1..b43d0b27c1fe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -813,6 +813,13 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &ten_thousand,
 	},
+	{
+		.procname	= "printk_devkmsg",
+		.data		= devkmsg_log_str,
+		.maxlen		= DEVKMSG_STR_MAX_SIZE,
+		.mode		= 0644,
+		.proc_handler	= devkmsg_sysctl_set_loglvl,
+	},
 	{
 		.procname	= "dmesg_restrict",
 		.data		= &dmesg_restrict,
-- 
cgit v1.3-8-gc7d7


From 627393d44860386e948bb63a8e5b53f2cc44d070 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 2 Aug 2016 14:05:40 -0700
Subject: kernel/exit.c: quieten greatest stack depth printk

Many targets enable CONFIG_DEBUG_STACK_USAGE, and while the information
is useful, it isn't worthy of pr_warn().  Reduce it to pr_info().

Link: http://lkml.kernel.org/r/1466982072-29836-1-git-send-email-anton@ozlabs.org
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 84ae830234f8..2f974ae042a6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -715,7 +715,7 @@ static void check_stack_usage(void)
 
 	spin_lock(&low_water_lock);
 	if (free < lowest_to_date) {
-		pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
 			current->comm, task_pid_nr(current), free);
 		lowest_to_date = free;
 	}
-- 
cgit v1.3-8-gc7d7


From 4caf9615247aceab56e91df6c0e11892ea55f4f0 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnghuan@gmail.com>
Date: Tue, 2 Aug 2016 14:05:45 -0700
Subject: kexec: return error number directly

This is a cleanup patch to make kexec more clear to return error number
directly.  The variable result is useless, because there is no other
function's return value assignes to it.  So remove it.

Link: http://lkml.kernel.org/r/1464179273-57668-1-git-send-email-mnghuan@gmail.com
Signed-off-by: Minfei Huang <mnghuan@gmail.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Xunlei Pang <xlpang@redhat.com>
Cc: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 56b3ed0927b0..23311c803b1b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -147,7 +147,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 
 int sanity_check_segment_list(struct kimage *image)
 {
-	int result, i;
+	int i;
 	unsigned long nr_segments = image->nr_segments;
 
 	/*
@@ -163,16 +163,15 @@ int sanity_check_segment_list(struct kimage *image)
 	 * simply because addresses are changed to page size
 	 * granularity.
 	 */
-	result = -EADDRNOTAVAIL;
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
 
 		mstart = image->segment[i].mem;
 		mend   = mstart + image->segment[i].memsz;
 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
-			return result;
+			return -EADDRNOTAVAIL;
 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-			return result;
+			return -EADDRNOTAVAIL;
 	}
 
 	/* Verify our destination addresses do not overlap.
@@ -180,7 +179,6 @@ int sanity_check_segment_list(struct kimage *image)
 	 * through very weird things can happen with no
 	 * easy explanation as one segment stops on another.
 	 */
-	result = -EINVAL;
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
 		unsigned long j;
@@ -194,7 +192,7 @@ int sanity_check_segment_list(struct kimage *image)
 			pend   = pstart + image->segment[j].memsz;
 			/* Do the segments overlap ? */
 			if ((mend > pstart) && (mstart < pend))
-				return result;
+				return -EINVAL;
 		}
 	}
 
@@ -203,10 +201,9 @@ int sanity_check_segment_list(struct kimage *image)
 	 * and it is easier to check up front than to be surprised
 	 * later on.
 	 */
-	result = -EINVAL;
 	for (i = 0; i < nr_segments; i++) {
 		if (image->segment[i].bufsz > image->segment[i].memsz)
-			return result;
+			return -EINVAL;
 	}
 
 	/*
@@ -220,7 +217,6 @@ int sanity_check_segment_list(struct kimage *image)
 	 */
 
 	if (image->type == KEXEC_TYPE_CRASH) {
-		result = -EADDRNOTAVAIL;
 		for (i = 0; i < nr_segments; i++) {
 			unsigned long mstart, mend;
 
@@ -229,7 +225,7 @@ int sanity_check_segment_list(struct kimage *image)
 			/* Ensure we are within the crash kernel limits */
 			if ((mstart < crashk_res.start) ||
 			    (mend > crashk_res.end))
-				return result;
+				return -EADDRNOTAVAIL;
 		}
 	}
 
-- 
cgit v1.3-8-gc7d7


From 465d377701dfe6a08a9f361a3fd926dea7f89c74 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Tue, 2 Aug 2016 14:05:57 -0700
Subject: kexec: ensure user memory sizes do not wrap

Ensure that user memory sizes do not wrap around when validating the
user input, which can lead to the following input validation working
incorrectly.

[akpm@linux-foundation.org: fix it for kexec-return-error-number-directly.patch]
Link: http://lkml.kernel.org/r/E1b8koF-0004HM-5x@rmk-PC.armlinux.org.uk
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Reviewed-by: Pratyush Anand <panand@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Keerthy <j-keerthy@ti.com>
Cc: Vitaly Andrianov <vitalya@ti.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23311c803b1b..5a83b2a9d584 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -168,6 +168,8 @@ int sanity_check_segment_list(struct kimage *image)
 
 		mstart = image->segment[i].mem;
 		mend   = mstart + image->segment[i].memsz;
+		if (mstart > mend)
+			return -EADDRNOTAVAIL;
 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 			return -EADDRNOTAVAIL;
 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-- 
cgit v1.3-8-gc7d7


From dae28018f56645b61f5beb84d5831346d3c5e457 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Tue, 2 Aug 2016 14:06:00 -0700
Subject: kdump: arrange for paddr_vmcoreinfo_note() to return phys_addr_t

On PAE systems (eg, ARM LPAE) the vmcore note may be located above 4GB
physical on 32-bit architectures, so we need a wider type than "unsigned
long" here.  Arrange for paddr_vmcoreinfo_note() to return a
phys_addr_t, thereby allowing it to be located above 4GB.

This makes no difference for kexec-tools, as they already assume a
64-bit type when reading from this file.

Link: http://lkml.kernel.org/r/E1b8koK-0004HS-K9@rmk-PC.armlinux.org.uk
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Reviewed-by: Pratyush Anand <panand@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Keerthy <j-keerthy@ti.com>
Cc: Vitaly Andrianov <vitalya@ti.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/machine_kexec.c | 2 +-
 include/linux/kexec.h            | 2 +-
 kernel/kexec_core.c              | 2 +-
 kernel/ksysfs.c                  | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index b72cd7a07222..599507bcec91 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -163,7 +163,7 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 }
 
-unsigned long paddr_vmcoreinfo_note(void)
+phys_addr_t paddr_vmcoreinfo_note(void)
 {
 	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
 }
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ce2fe197f583..555227f0029f 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -233,7 +233,7 @@ void crash_save_vmcoreinfo(void);
 void arch_crash_save_vmcoreinfo(void);
 __printf(1, 2)
 void vmcoreinfo_append_str(const char *fmt, ...);
-unsigned long paddr_vmcoreinfo_note(void);
+phys_addr_t paddr_vmcoreinfo_note(void);
 
 #define VMCOREINFO_OSRELEASE(value) \
 	vmcoreinfo_append_str("OSRELEASE=%s\n", value)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 5a83b2a9d584..dab03f17be25 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1372,7 +1372,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
 void __weak arch_crash_save_vmcoreinfo(void)
 {}
 
-unsigned long __weak paddr_vmcoreinfo_note(void)
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
 {
 	return __pa((unsigned long)(char *)&vmcoreinfo_note);
 }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 152da4a48867..9f1920d2d0c6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size);
 static ssize_t vmcoreinfo_show(struct kobject *kobj,
 			       struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%lx %x\n",
-		       paddr_vmcoreinfo_note(),
+	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
+	return sprintf(buf, "%pa %x\n", &vmcore_base,
 		       (unsigned int)sizeof(vmcoreinfo_note));
 }
 KERNEL_ATTR_RO(vmcoreinfo);
-- 
cgit v1.3-8-gc7d7


From 43546d8669d62d75fa69ca9a45d2f586665f56bd Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Tue, 2 Aug 2016 14:06:04 -0700
Subject: kexec: allow architectures to override boot mapping

kexec physical addresses are the boot-time view of the system.  For
certain ARM systems (such as Keystone 2), the boot view of the system
does not match the kernel's view of the system: the boot view uses a
special alias in the lower 4GB of the physical address space.

To cater for these kinds of setups, we need to translate between the
boot view physical addresses and the normal kernel view physical
addresses.  This patch extracts the current transation points into
linux/kexec.h, and allows an architecture to override the functions.

Due to the translations required, we unfortunately end up with six
translation functions, which are reduced down to four that the
architecture can override.

[akpm@linux-foundation.org: kexec.h needs asm/io.h for phys_to_virt()]
Link: http://lkml.kernel.org/r/E1b8koP-0004HZ-Vf@rmk-PC.armlinux.org.uk
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Keerthy <j-keerthy@ti.com>
Cc: Pratyush Anand <panand@redhat.com>
Cc: Vitaly Andrianov <vitalya@ti.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kexec.h | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/kexec.c        |  3 ++-
 kernel/kexec_core.c   | 26 +++++++++++++-------------
 3 files changed, 55 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 555227f0029f..23e14a460cfb 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -14,6 +14,8 @@
 
 #if !defined(__ASSEMBLY__)
 
+#include <asm/io.h>
+
 #include <uapi/linux/kexec.h>
 
 #ifdef CONFIG_KEXEC_CORE
@@ -318,6 +320,44 @@ int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 void arch_kexec_protect_crashkres(void);
 void arch_kexec_unprotect_crashkres(void);
 
+#ifndef page_to_boot_pfn
+static inline unsigned long page_to_boot_pfn(struct page *page)
+{
+	return page_to_pfn(page);
+}
+#endif
+
+#ifndef boot_pfn_to_page
+static inline struct page *boot_pfn_to_page(unsigned long boot_pfn)
+{
+	return pfn_to_page(boot_pfn);
+}
+#endif
+
+#ifndef phys_to_boot_phys
+static inline unsigned long phys_to_boot_phys(phys_addr_t phys)
+{
+	return phys;
+}
+#endif
+
+#ifndef boot_phys_to_phys
+static inline phys_addr_t boot_phys_to_phys(unsigned long boot_phys)
+{
+	return boot_phys;
+}
+#endif
+
+static inline unsigned long virt_to_boot_phys(void *addr)
+{
+	return phys_to_boot_phys(__pa((unsigned long)addr));
+}
+
+static inline void *boot_phys_to_virt(unsigned long entry)
+{
+	return phys_to_virt(boot_phys_to_phys(entry));
+}
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4384672d3245..980936a90ee6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 
 	if (kexec_on_panic) {
 		/* Verify we have a valid entry point */
-		if ((entry < crashk_res.start) || (entry > crashk_res.end))
+		if ((entry < phys_to_boot_phys(crashk_res.start)) ||
+		    (entry > phys_to_boot_phys(crashk_res.end)))
 			return -EADDRNOTAVAIL;
 	}
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index dab03f17be25..73d4c5f57dd8 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -225,8 +225,8 @@ int sanity_check_segment_list(struct kimage *image)
 			mstart = image->segment[i].mem;
 			mend = mstart + image->segment[i].memsz - 1;
 			/* Ensure we are within the crash kernel limits */
-			if ((mstart < crashk_res.start) ||
-			    (mend > crashk_res.end))
+			if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
+			    (mend > phys_to_boot_phys(crashk_res.end)))
 				return -EADDRNOTAVAIL;
 		}
 	}
@@ -350,7 +350,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
 		if (!pages)
 			break;
-		pfn   = page_to_pfn(pages);
+		pfn   = page_to_boot_pfn(pages);
 		epfn  = pfn + count;
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
@@ -476,7 +476,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 			return -ENOMEM;
 
 		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		*image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
 		image->last_entry = ind_page +
 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -531,13 +531,13 @@ void kimage_terminate(struct kimage *image)
 #define for_each_kimage_entry(image, ptr, entry) \
 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 		ptr = (entry & IND_INDIRECTION) ? \
-			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+			boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
 	struct page *page;
 
-	page = pfn_to_page(entry >> PAGE_SHIFT);
+	page = boot_pfn_to_page(entry >> PAGE_SHIFT);
 	kimage_free_pages(page);
 }
 
@@ -631,7 +631,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 	 * have a match.
 	 */
 	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
 		if (addr == destination) {
 			list_del(&page->lru);
 			return page;
@@ -646,12 +646,12 @@ static struct page *kimage_alloc_page(struct kimage *image,
 		if (!page)
 			return NULL;
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
+		if (page_to_boot_pfn(page) >
 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unusable_pages);
 			continue;
 		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
 
 		/* If it is the destination page we want use it */
 		if (addr == destination)
@@ -674,7 +674,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 			struct page *old_page;
 
 			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
 			copy_highpage(page, old_page);
 			*old = addr | (*old & ~PAGE_MASK);
 
@@ -730,7 +730,7 @@ static int kimage_load_normal_segment(struct kimage *image,
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page)
+		result = kimage_add_page(image, page_to_boot_pfn(page)
 								<< PAGE_SHIFT);
 		if (result < 0)
 			goto out;
@@ -791,7 +791,7 @@ static int kimage_load_crash_segment(struct kimage *image,
 		char *ptr;
 		size_t uchunk, mchunk;
 
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
 		if (!page) {
 			result  = -ENOMEM;
 			goto out;
@@ -919,7 +919,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
 	unsigned long addr;
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE)
-		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+		free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
 }
 
 int crash_shrink_memory(unsigned long new_size)
-- 
cgit v1.3-8-gc7d7


From b26e27ddfd2a986dc53e259aba572f3aac182eb8 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Tue, 2 Aug 2016 14:06:13 -0700
Subject: kexec: use core_param for crash_kexec_post_notifiers boot option

crash_kexec_post_notifiers ia a boot option which controls whether the
1st kernel calls panic notifiers or not before booting the 2nd kernel.
However, there is no need to limit it to being modifiable only at boot
time.  So, use core_param instead of early_param.

Link: http://lkml.kernel.org/r/20160705113327.5864.43139.stgit@softrs
Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 8aa74497cc5a..ca8cea1ef673 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -108,6 +108,7 @@ void panic(const char *fmt, ...)
 	long i, i_next = 0;
 	int state = 0;
 	int old_cpu, this_cpu;
+	bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
 
 	/*
 	 * Disable local interrupts. This will prevent panic_smp_self_stop
@@ -160,7 +161,7 @@ void panic(const char *fmt, ...)
 	 *
 	 * Bypass the panic_cpu check and call __crash_kexec directly.
 	 */
-	if (!crash_kexec_post_notifiers) {
+	if (!_crash_kexec_post_notifiers) {
 		printk_nmi_flush_on_panic();
 		__crash_kexec(NULL);
 	}
@@ -191,7 +192,7 @@ void panic(const char *fmt, ...)
 	 *
 	 * Bypass the panic_cpu check and call __crash_kexec directly.
 	 */
-	if (crash_kexec_post_notifiers)
+	if (_crash_kexec_post_notifiers)
 		__crash_kexec(NULL);
 
 	bust_spinlocks(0);
@@ -571,13 +572,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
 core_param(panic, panic_timeout, int, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
 core_param(panic_on_warn, panic_on_warn, int, 0644);
-
-static int __init setup_crash_kexec_post_notifiers(char *s)
-{
-	crash_kexec_post_notifiers = true;
-	return 0;
-}
-early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
 
 static int __init oops_setup(char *s)
 {
-- 
cgit v1.3-8-gc7d7


From 21db79e8bb054d0351a6b1b464f1c9c47a2e6e8d Mon Sep 17 00:00:00 2001
From: Petr Tesarik <ptesarik@suse.com>
Date: Tue, 2 Aug 2016 14:06:16 -0700
Subject: kexec: add a kexec_crash_loaded() function

Provide a wrapper function to be used by kernel code to check whether a
crash kernel is loaded.  It returns the same value that can be seen in
/sys/kernel/kexec_crash_loaded by userspace programs.

I'm exporting the function, because it will be used by Xen, and it is
possible to compile Xen modules separately to enable the use of PV
drivers with unmodified bare-metal kernels.

Link: http://lkml.kernel.org/r/20160713121955.14969.69080.stgit@hananiah.suse.cz
Signed-off-by: Petr Tesarik <ptesarik@suse.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kexec.h | 2 ++
 kernel/kexec_core.c   | 6 ++++++
 kernel/ksysfs.c       | 2 +-
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 23e14a460cfb..d7437777baaa 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -230,6 +230,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
 extern void __crash_kexec(struct pt_regs *);
 extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
+int kexec_crash_loaded(void);
 void crash_save_cpu(struct pt_regs *regs, int cpu);
 void crash_save_vmcoreinfo(void);
 void arch_crash_save_vmcoreinfo(void);
@@ -364,6 +365,7 @@ struct task_struct;
 static inline void __crash_kexec(struct pt_regs *regs) { }
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
+static inline int kexec_crash_loaded(void) { return 0; }
 #define kexec_in_progress false
 #endif /* CONFIG_KEXEC_CORE */
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 73d4c5f57dd8..704534029a00 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -95,6 +95,12 @@ int kexec_should_crash(struct task_struct *p)
 	return 0;
 }
 
+int kexec_crash_loaded(void)
+{
+	return !!kexec_crash_image;
+}
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+
 /*
  * When kexec transitions to the new kernel there is a one-to-one
  * mapping between physical and virtual addresses.  On processors
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 9f1920d2d0c6..ee1bc1bb8feb 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -101,7 +101,7 @@ KERNEL_ATTR_RO(kexec_loaded);
 static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
 				       struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%d\n", !!kexec_crash_image);
+	return sprintf(buf, "%d\n", kexec_crash_loaded());
 }
 KERNEL_ATTR_RO(kexec_crash_loaded);
 
-- 
cgit v1.3-8-gc7d7


From 1730f146604ea426e54938cdbcf87df1047ef0dc Mon Sep 17 00:00:00 2001
From: zhong jiang <zhongjiang@huawei.com>
Date: Tue, 2 Aug 2016 14:06:22 -0700
Subject: kexec: add restriction on kexec_load() segment sizes

I hit the following issue when run trinity in my system.  The kernel is
3.4 version, but mainline has the same issue.

The root cause is that the segment size is too large so the kerenl
spends too long trying to allocate a page.  Other cases will block until
the test case quits.  Also, OOM conditions will occur.

Call Trace:
  __alloc_pages_nodemask+0x14c/0x8f0
  alloc_pages_current+0xaf/0x120
  kimage_alloc_pages+0x10/0x60
  kimage_alloc_control_pages+0x5d/0x270
  machine_kexec_prepare+0xe5/0x6c0
  ? kimage_free_page_list+0x52/0x70
  sys_kexec_load+0x141/0x600
  ? vfs_write+0x100/0x180
  system_call_fastpath+0x16/0x1b

The patch changes sanity_check_segment_list() to verify that the usage by
all segments does not exceed half of memory.

[akpm@linux-foundation.org: fix for kexec-return-error-number-directly.patch, update comment]
Link: http://lkml.kernel.org/r/1469625474-53904-1-git-send-email-zhongjiang@huawei.com
Signed-off-by: zhong jiang <zhongjiang@huawei.com>
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 704534029a00..561675589511 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -146,6 +146,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded);
  * allocating pages whose destination address we do not care about.
  */
 #define KIMAGE_NO_DEST (-1UL)
+#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
 
 static struct page *kimage_alloc_page(struct kimage *image,
 				       gfp_t gfp_mask,
@@ -155,6 +156,7 @@ int sanity_check_segment_list(struct kimage *image)
 {
 	int i;
 	unsigned long nr_segments = image->nr_segments;
+	unsigned long total_pages = 0;
 
 	/*
 	 * Verify we have good destination addresses.  The caller is
@@ -214,6 +216,21 @@ int sanity_check_segment_list(struct kimage *image)
 			return -EINVAL;
 	}
 
+	/*
+	 * Verify that no more than half of memory will be consumed. If the
+	 * request from userspace is too large, a large amount of time will be
+	 * wasted allocating pages, which can cause a soft lockup.
+	 */
+	for (i = 0; i < nr_segments; i++) {
+		if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
+			return -EINVAL;
+
+		total_pages += PAGE_COUNT(image->segment[i].memsz);
+	}
+
+	if (total_pages > totalram_pages / 2)
+		return -EINVAL;
+
 	/*
 	 * Verify we have good destination addresses.  Normally
 	 * the caller is responsible for making certain we don't
-- 
cgit v1.3-8-gc7d7


From 59dbb2a06fc2bcb752b964e036884fe9bb9dbbe0 Mon Sep 17 00:00:00 2001
From: Akash Goel <akash.goel@intel.com>
Date: Tue, 2 Aug 2016 14:07:18 -0700
Subject: relay: add global mode support for buffer-only channels

Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early
logging") added support to use channels with no associated files.

This is useful when the exact location of relay file is not known or the
the parent directory of relay file is not available, while creating the
channel and the logging has to start right from the boot.

But there was no provision to use global mode with buffer-only channels,
which is added by this patch, without modifying the interface where
initially there will be a dummy invocation of create_buf_file callback
through which kernel client can convey the need of a global buffer.

For the use case where drivers/kernel clients want a simple interface
for the userspace, which enables them to capture data/logs from relay
file inorder & without any post processing, support of Global buffer
mode is warranted.

Modules, like i915, using relay_open() in early init would have to later
register their buffer-only relays, once debugfs is available, by calling
relay_late_setup_files().  Hence relay_late_setup_files() symbol also
needs to be exported.

Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com
Signed-off-by: Akash Goel <akash.goel@intel.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 04d7cf3ef8cf..d797502140b9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -451,6 +451,13 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
 		if (!dentry)
 			goto free_buf;
 		relay_set_buf_dentry(buf, dentry);
+	} else {
+		/* Only retrieve global info, nothing more, nothing less */
+		dentry = chan->cb->create_buf_file(NULL, NULL,
+						   S_IRUSR, buf,
+						   &chan->is_global);
+		if (WARN_ON(dentry))
+			goto free_buf;
 	}
 
  	buf->cpu = cpu;
@@ -562,6 +569,10 @@ static int relay_hotcpu_callback(struct notifier_block *nb,
  *	attributes specified.  The created channel buffer files
  *	will be named base_filename0...base_filenameN-1.  File
  *	permissions will be %S_IRUSR.
+ *
+ *	If opening a buffer (@parent = NULL) that you later wish to register
+ *	in a filesystem, call relay_late_setup_files() once the @parent dentry
+ *	is available.
  */
 struct rchan *relay_open(const char *base_filename,
 			 struct dentry *parent,
@@ -640,8 +651,12 @@ static void __relay_set_buf_dentry(void *info)
  *
  *	Returns 0 if successful, non-zero otherwise.
  *
- *	Use to setup files for a previously buffer-only channel.
- *	Useful to do early tracing in kernel, before VFS is up, for example.
+ *	Use to setup files for a previously buffer-only channel created
+ *	by relay_open() with a NULL parent dentry.
+ *
+ *	For example, this is useful for perfomring early tracing in kernel,
+ *	before VFS is up and then exposing the early results once the dentry
+ *	is available.
  */
 int relay_late_setup_files(struct rchan *chan,
 			   const char *base_filename,
@@ -666,6 +681,20 @@ int relay_late_setup_files(struct rchan *chan,
 	}
 	chan->has_base_filename = 1;
 	chan->parent = parent;
+
+	if (chan->is_global) {
+		err = -EINVAL;
+		if (!WARN_ON_ONCE(!chan->buf[0])) {
+			dentry = relay_create_buf_file(chan, chan->buf[0], 0);
+			if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
+				relay_set_buf_dentry(chan->buf[0], dentry);
+				err = 0;
+			}
+		}
+		mutex_unlock(&relay_channels_mutex);
+		return err;
+	}
+
 	curr_cpu = get_cpu();
 	/*
 	 * The CPU hotplug notifier ran before us and created buffers with
@@ -706,6 +735,7 @@ int relay_late_setup_files(struct rchan *chan,
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(relay_late_setup_files);
 
 /**
  *	relay_switch_subbuf - switch to a new sub-buffer
-- 
cgit v1.3-8-gc7d7


From 27eb6622ab67bad75814c9b7b08096cfb16be63a Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 2 Aug 2016 14:07:24 -0700
Subject: config: add android config fragments

Copy the config fragments from the AOSP common kernel android-4.4
branch.  It is becoming possible to run mainline kernels with Android,
but the kernel defconfigs don't work as-is and debugging missing config
options is a pain.  Adding the config fragments into the kernel tree,
makes configuring a mainline kernel as simple as:

  make ARCH=arm multi_v7_defconfig android-base.config android-recommended.config

The following non-upstream config options were removed:

  CONFIG_NETFILTER_XT_MATCH_QTAGUID
  CONFIG_NETFILTER_XT_MATCH_QUOTA2
  CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
  CONFIG_PPPOLAC
  CONFIG_PPPOPNS
  CONFIG_SECURITY_PERF_EVENTS_RESTRICT
  CONFIG_USB_CONFIGFS_F_MTP
  CONFIG_USB_CONFIGFS_F_PTP
  CONFIG_USB_CONFIGFS_F_ACC
  CONFIG_USB_CONFIGFS_F_AUDIO_SRC
  CONFIG_USB_CONFIGFS_UEVENT
  CONFIG_INPUT_KEYCHORD
  CONFIG_INPUT_KEYRESET

Link: http://lkml.kernel.org/r/1466708235-28593-1-git-send-email-robh@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
Cc: Amit Pundir <amit.pundir@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Dmitry Shmidt <dimitrysh@google.com>
Cc: Rom Lemarchand <romlem@android.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS                               |   5 +
 kernel/configs/android-base.config        | 152 ++++++++++++++++++++++++++++++
 kernel/configs/android-recommended.config | 121 ++++++++++++++++++++++++
 3 files changed, 278 insertions(+)
 create mode 100644 kernel/configs/android-base.config
 create mode 100644 kernel/configs/android-recommended.config

(limited to 'kernel')

diff --git a/MAINTAINERS b/MAINTAINERS
index e9eacacf0f08..ce38536009c7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -778,6 +778,11 @@ W:	http://ez.analog.com/community/linux-device-drivers
 S:	Supported
 F:	drivers/dma/dma-axi-dmac.c
 
+ANDROID CONFIG FRAGMENTS
+M:	Rob Herring <robh@kernel.org>
+S:	Supported
+F:	kernel/configs/android*
+
 ANDROID DRIVERS
 M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 M:	Arve Hjønnevåg <arve@android.com>
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
new file mode 100644
index 000000000000..9f748ed7bea8
--- /dev/null
+++ b/kernel/configs/android-base.config
@@ -0,0 +1,152 @@
+#  KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_MODULES is not set
+# CONFIG_OABI_COMPAT is not set
+# CONFIG_SYSVIPC is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_TPROXY=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_QUOTA=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_STAGING=y
+CONFIG_SWP_EMULATION=y
+CONFIG_SYNC=y
+CONFIG_TUN=y
+CONFIG_UNIX=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
new file mode 100644
index 000000000000..e3b953e966d2
--- /dev/null
+++ b/kernel/configs/android-recommended.config
@@ -0,0 +1,121 @@
+#  KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_COMPACTION=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_DM_UEVENT=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASK_XACCT=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
-- 
cgit v1.3-8-gc7d7


From 1f415a74b0ca64b5bfacbb12d71ed2ec050a8cfb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 2 Aug 2016 16:12:14 +0100
Subject: bpf: fix method of PTR_TO_PACKET reg id generation

Using per-register incrementing ID can lead to
find_good_pkt_pointers() confusing registers which
have completely different values.  Consider example:

0: (bf) r6 = r1
1: (61) r8 = *(u32 *)(r6 +76)
2: (61) r0 = *(u32 *)(r6 +80)
3: (bf) r7 = r8
4: (07) r8 += 32
5: (2d) if r8 > r0 goto pc+9
 R0=pkt_end R1=ctx R6=ctx R7=pkt(id=0,off=0,r=32) R8=pkt(id=0,off=32,r=32) R10=fp
6: (bf) r8 = r7
7: (bf) r9 = r7
8: (71) r1 = *(u8 *)(r7 +0)
9: (0f) r8 += r1
10: (71) r1 = *(u8 *)(r7 +1)
11: (0f) r9 += r1
12: (07) r8 += 32
13: (2d) if r8 > r0 goto pc+1
 R0=pkt_end R1=inv56 R6=ctx R7=pkt(id=0,off=0,r=32) R8=pkt(id=1,off=32,r=32) R9=pkt(id=1,off=0,r=32) R10=fp
14: (71) r1 = *(u8 *)(r9 +16)
15: (b7) r7 = 0
16: (bf) r0 = r7
17: (95) exit

We need to get a UNKNOWN_VALUE with imm to force id
generation so lines 0-5 make r7 a valid packet pointer.
We then read two different bytes from the packet and
add them to copies of the constructed packet pointer.
r8 (line 9) and r9 (line 11) will get the same id of 1,
independently.  When either of them is validated (line
13) - find_good_pkt_pointers() will also mark the other
as safe.  This leads to access on line 14 being mistakenly
considered safe.

Fixes: 969bf05eb3ce ("bpf: direct packet access")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b8fdab..7094c69ac199 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
 };
 
@@ -1301,7 +1302,7 @@ add_imm:
 		/* dst_reg stays as pkt_ptr type and since some positive
 		 * integer value was added to the pointer, increment its 'id'
 		 */
-		dst_reg->id++;
+		dst_reg->id = ++env->id_gen;
 
 		/* something was added to pkt_ptr, set range and off to zero */
 		dst_reg->off = 0;
-- 
cgit v1.3-8-gc7d7


From 9502514f2808d29f6f2afa1c410e7808898dede1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 19 Jul 2016 05:59:24 +0930
Subject: module: Do a WARN_ON_ONCE() for assert module mutex not held

When running with lockdep enabled, I triggered the WARN_ON() in the
module code that asserts when module_mutex or rcu_read_lock_sched are
not held. The issue I have is that this can also be called from the
dump_stack() code, causing us to enter an infinite loop...

 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e
 Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14
 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
  ffff880215e8fa70 ffff880215e8fa70 ffffffff812fc8e3 0000000000000000
  ffffffff81d3e55b ffff880215e8fac0 ffffffff8104fc88 ffffffff8104fcab
  0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001
 Call Trace:
  [<ffffffff812fc8e3>] dump_stack+0x67/0x90
  [<ffffffff8104fc88>] __warn+0xcb/0xe9
  [<ffffffff8104fcab>] ? warn_slowpath_null+0x5/0x1f
 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e
 Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14
 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
  ffff880215e8f7a0 ffff880215e8f7a0 ffffffff812fc8e3 0000000000000000
  ffffffff81d3e55b ffff880215e8f7f0 ffffffff8104fc88 ffffffff8104fcab
  0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001
 Call Trace:
  [<ffffffff812fc8e3>] dump_stack+0x67/0x90
  [<ffffffff8104fc88>] __warn+0xcb/0xe9
  [<ffffffff8104fcab>] ? warn_slowpath_null+0x5/0x1f
 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e
 Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14
 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
  ffff880215e8f4d0 ffff880215e8f4d0 ffffffff812fc8e3 0000000000000000
  ffffffff81d3e55b ffff880215e8f520 ffffffff8104fc88 ffffffff8104fcab
  0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001
 Call Trace:
  [<ffffffff812fc8e3>] dump_stack+0x67/0x90
  [<ffffffff8104fc88>] __warn+0xcb/0xe9
  [<ffffffff8104fcab>] ? warn_slowpath_null+0x5/0x1f
 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e
[...]

Which gives us rather useless information. Worse yet, there's some race
that causes this, and I seldom trigger it, so I have no idea what
happened.

This would not be an issue if that warning was a WARN_ON_ONCE().

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 7f21ab238aa7..beaebea627ff 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -264,7 +264,7 @@ static void module_assert_mutex_or_preempt(void)
 	if (unlikely(!debug_locks))
 		return;
 
-	WARN_ON(!rcu_read_lock_sched_held() &&
+	WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
 		!lockdep_is_held(&module_mutex));
 #endif
 }
-- 
cgit v1.3-8-gc7d7


From be7de5f91fdc3a63ee01910c43f20db213445ce4 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Thu, 21 Jul 2016 15:37:56 +0930
Subject: modules: Add kernel parameter to blacklist modules

Blacklisting a module in linux has long been a problem.  The current
procedure is to use rd.blacklist=module_name, however, that doesn't
cover the case after the initramfs and before a boot prompt (where one
is supposed to use /etc/modprobe.d/blacklist.conf to blacklist
runtime loading). Using rd.shell to get an early prompt is hit-or-miss,
and doesn't cover all situations AFAICT.

This patch adds this functionality of permanently blacklisting a module
by its name via the kernel parameter module_blacklist=module_name.

[v2]: Rusty, use core_param() instead of __setup() which simplifies
things.

[v3]: Rusty, undo wreckage from strsep()

[v4]: Rusty, simpler version of blacklisted()

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-doc@vger.kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/kernel-parameters.txt |  3 +++
 kernel/module.c                     | 24 ++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 724970a25666..c59ae1af02b3 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2301,6 +2301,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Note that if CONFIG_MODULE_SIG_FORCE is set, that
 			is always true, so this option does nothing.
 
+	module_blacklist=  [KNL] Do not load a comma-separated list of
+			modules.  Useful for debugging problem modules.
+
 	mousedev.tap_time=
 			[MOUSE] Maximum time between finger touching and
 			leaving touchpad surface for touch to be considered
diff --git a/kernel/module.c b/kernel/module.c
index beaebea627ff..c91c2fdca2e6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3168,6 +3168,27 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
 	return 0;
 }
 
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(char *module_name)
+{
+	const char *p;
+	size_t len;
+
+	if (!module_blacklist)
+		return false;
+
+	for (p = module_blacklist; *p; p += len) {
+		len = strcspn(p, ",");
+		if (strlen(module_name) == len && !memcmp(module_name, p, len))
+			return true;
+		if (p[len] == ',')
+			len++;
+	}
+	return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
 static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
 	/* Module within temporary copy. */
@@ -3178,6 +3199,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 	if (IS_ERR(mod))
 		return mod;
 
+	if (blacklisted(mod->name))
+		return ERR_PTR(-EPERM);
+
 	err = check_modinfo(mod, info, flags);
 	if (err)
 		return ERR_PTR(err);
-- 
cgit v1.3-8-gc7d7


From bdc9f373551dd3f1e6fae1618f2394ee9bc7db2e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 27 Jul 2016 12:17:35 +0930
Subject: jump_label: disable preemption around __module_text_address().

Steven reported a warning caused by not holding module_mutex or
rcu_read_lock_sched: his backtrace was corrupted but a quick audit
found this possible cause.  It's wrong anyway...

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/jump_label.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0dbea887d625..0eef93962a91 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -284,11 +284,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
 {
 	struct module *mod;
 
+	preempt_disable();
 	mod = __module_text_address((unsigned long)start);
+	WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+	preempt_enable();
+
 	if (!mod)
 		return 0;
 
-	WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
 
 	return __jump_label_text_reserved(mod->jump_entries,
 				mod->jump_entries + mod->num_jump_entries,
-- 
cgit v1.3-8-gc7d7


From 444d13ff10fb13bc3e64859c3cf9ce43dcfeb075 Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@redhat.com>
Date: Wed, 27 Jul 2016 12:06:21 +0930
Subject: modules: add ro_after_init support

Add ro_after_init support for modules by adding a new page-aligned section
in the module layout (after rodata) for ro_after_init data and enabling RO
protection for that section after module init runs.

Signed-off-by: Jessica Yu <jeyu@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h   |  6 +++--
 include/uapi/linux/elf.h |  1 +
 kernel/livepatch/core.c  |  2 +-
 kernel/module.c          | 66 +++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 60 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index f95ed243a4de..0c3207d26ac0 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -298,6 +298,8 @@ struct module_layout {
 	unsigned int text_size;
 	/* Size of RO section of the module (text+rodata) */
 	unsigned int ro_size;
+	/* Size of RO after init section */
+	unsigned int ro_after_init_size;
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
 	struct mod_tree_node mtn;
@@ -765,12 +767,12 @@ extern int module_sysfs_initialized;
 #ifdef CONFIG_DEBUG_SET_MODULE_RONX
 extern void set_all_modules_text_rw(void);
 extern void set_all_modules_text_ro(void);
-extern void module_enable_ro(const struct module *mod);
+extern void module_enable_ro(const struct module *mod, bool after_init);
 extern void module_disable_ro(const struct module *mod);
 #else
 static inline void set_all_modules_text_rw(void) { }
 static inline void set_all_modules_text_ro(void) { }
-static inline void module_enable_ro(const struct module *mod) { }
+static inline void module_enable_ro(const struct module *mod, bool after_init) { }
 static inline void module_disable_ro(const struct module *mod) { }
 #endif
 
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index cb4a72f888d5..70b172ba41ce 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -286,6 +286,7 @@ typedef struct elf64_phdr {
 #define SHF_ALLOC		0x2
 #define SHF_EXECINSTR		0x4
 #define SHF_RELA_LIVEPATCH	0x00100000
+#define SHF_RO_AFTER_INIT	0x00200000
 #define SHF_MASKPROC		0xf0000000
 
 /* special section indexes */
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 5c2bc1052691..8bbe50704621 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod,
 			break;
 	}
 
-	module_enable_ro(pmod);
+	module_enable_ro(pmod, true);
 	return ret;
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index c91c2fdca2e6..205a71a97852 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1857,10 +1857,11 @@ static void mod_sysfs_teardown(struct module *mod)
  * from modification and any data from execution.
  *
  * General layout of module is:
- *          [text] [read-only-data] [writable data]
- * text_size -----^                ^               ^
- * ro_size ------------------------|               |
- * size -------------------------------------------|
+ *          [text] [read-only-data] [ro-after-init] [writable data]
+ * text_size -----^                ^               ^               ^
+ * ro_size ------------------------|               |               |
+ * ro_after_init_size -----------------------------|               |
+ * size -----------------------------------------------------------|
  *
  * These values are always page-aligned (as is base)
  */
@@ -1883,14 +1884,24 @@ static void frob_rodata(const struct module_layout *layout,
 		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
 }
 
+static void frob_ro_after_init(const struct module_layout *layout,
+				int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+	set_memory((unsigned long)layout->base + layout->ro_size,
+		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+}
+
 static void frob_writable_data(const struct module_layout *layout,
 			       int (*set_memory)(unsigned long start, int num_pages))
 {
 	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
 	BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base + layout->ro_size,
-		   (layout->size - layout->ro_size) >> PAGE_SHIFT);
+	set_memory((unsigned long)layout->base + layout->ro_after_init_size,
+		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
 }
 
 /* livepatching wants to disable read-only so it can frob module. */
@@ -1898,21 +1909,26 @@ void module_disable_ro(const struct module *mod)
 {
 	frob_text(&mod->core_layout, set_memory_rw);
 	frob_rodata(&mod->core_layout, set_memory_rw);
+	frob_ro_after_init(&mod->core_layout, set_memory_rw);
 	frob_text(&mod->init_layout, set_memory_rw);
 	frob_rodata(&mod->init_layout, set_memory_rw);
 }
 
-void module_enable_ro(const struct module *mod)
+void module_enable_ro(const struct module *mod, bool after_init)
 {
 	frob_text(&mod->core_layout, set_memory_ro);
 	frob_rodata(&mod->core_layout, set_memory_ro);
 	frob_text(&mod->init_layout, set_memory_ro);
 	frob_rodata(&mod->init_layout, set_memory_ro);
+
+	if (after_init)
+		frob_ro_after_init(&mod->core_layout, set_memory_ro);
 }
 
 static void module_enable_nx(const struct module *mod)
 {
 	frob_rodata(&mod->core_layout, set_memory_nx);
+	frob_ro_after_init(&mod->core_layout, set_memory_nx);
 	frob_writable_data(&mod->core_layout, set_memory_nx);
 	frob_rodata(&mod->init_layout, set_memory_nx);
 	frob_writable_data(&mod->init_layout, set_memory_nx);
@@ -1921,6 +1937,7 @@ static void module_enable_nx(const struct module *mod)
 static void module_disable_nx(const struct module *mod)
 {
 	frob_rodata(&mod->core_layout, set_memory_x);
+	frob_ro_after_init(&mod->core_layout, set_memory_x);
 	frob_writable_data(&mod->core_layout, set_memory_x);
 	frob_rodata(&mod->init_layout, set_memory_x);
 	frob_writable_data(&mod->init_layout, set_memory_x);
@@ -1963,6 +1980,8 @@ static void disable_ro_nx(const struct module_layout *layout)
 	frob_text(layout, set_memory_rw);
 	frob_rodata(layout, set_memory_rw);
 	frob_rodata(layout, set_memory_x);
+	frob_ro_after_init(layout, set_memory_rw);
+	frob_ro_after_init(layout, set_memory_x);
 	frob_writable_data(layout, set_memory_x);
 }
 
@@ -2305,6 +2324,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 		 * finder in the two loops below */
 		{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
 		{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+		{ SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
 		{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
 		{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
 	};
@@ -2336,7 +2356,11 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			mod->core_layout.size = debug_align(mod->core_layout.size);
 			mod->core_layout.ro_size = mod->core_layout.size;
 			break;
-		case 3: /* whole core */
+		case 2: /* RO after init */
+			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.ro_after_init_size = mod->core_layout.size;
+			break;
+		case 4: /* whole core */
 			mod->core_layout.size = debug_align(mod->core_layout.size);
 			break;
 		}
@@ -2366,7 +2390,14 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			mod->init_layout.size = debug_align(mod->init_layout.size);
 			mod->init_layout.ro_size = mod->init_layout.size;
 			break;
-		case 3: /* whole init */
+		case 2:
+			/*
+			 * RO after init doesn't apply to init_layout (only
+			 * core_layout), so it just takes the value of ro_size.
+			 */
+			mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
+			break;
+		case 4: /* whole init */
 			mod->init_layout.size = debug_align(mod->init_layout.size);
 			break;
 		}
@@ -3193,6 +3224,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
 	/* Module within temporary copy. */
 	struct module *mod;
+	unsigned int ndx;
 	int err;
 
 	mod = setup_load_info(info, flags);
@@ -3215,6 +3247,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 	/* We will do a special allocation for per-cpu sections later. */
 	info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
+	/*
+	 * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+	 * layout_sections() can put it in the right place.
+	 * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+	 */
+	ndx = find_sec(info, ".data..ro_after_init");
+	if (ndx)
+		info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+
 	/* Determine total sizes, and put offsets in sh_entsize.  For now
 	   this is done generically; there doesn't appear to be any
 	   special cases for the architectures. */
@@ -3381,12 +3422,14 @@ static noinline int do_init_module(struct module *mod)
 	/* Switch to core kallsyms now init is done: kallsyms may be walking! */
 	rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
 #endif
+	module_enable_ro(mod, true);
 	mod_tree_remove_init(mod);
 	disable_ro_nx(&mod->init_layout);
 	module_arch_freeing_init(mod);
 	mod->init_layout.base = NULL;
 	mod->init_layout.size = 0;
 	mod->init_layout.ro_size = 0;
+	mod->init_layout.ro_after_init_size = 0;
 	mod->init_layout.text_size = 0;
 	/*
 	 * We want to free module_init, but be aware that kallsyms may be
@@ -3478,8 +3521,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
 	/* This relies on module_mutex for list integrity. */
 	module_bug_finalize(info->hdr, info->sechdrs, mod);
 
-	/* Set RO and NX regions */
-	module_enable_ro(mod);
+	module_enable_ro(mod, false);
 	module_enable_nx(mod);
 
 	/* Mark state as coming so strong_try_module_get() ignores us,
-- 
cgit v1.3-8-gc7d7


From 97f2645f358b411ba2afb22e5966753f0ad92916 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Wed, 3 Aug 2016 13:45:50 -0700
Subject: tree-wide: replace config_enabled() with IS_ENABLED()

The use of config_enabled() against config options is ambiguous.  In
practical terms, config_enabled() is equivalent to IS_BUILTIN(), but the
author might have used it for the meaning of IS_ENABLED().  Using
IS_ENABLED(), IS_BUILTIN(), IS_MODULE() etc.  makes the intention
clearer.

This commit replaces config_enabled() with IS_ENABLED() where possible.
This commit is only touching bool config options.

I noticed two cases where config_enabled() is used against a tristate
option:

 - config_enabled(CONFIG_HWMON)
  [ drivers/net/wireless/ath/ath10k/thermal.c ]

 - config_enabled(CONFIG_BACKLIGHT_CLASS_DEVICE)
  [ drivers/gpu/drm/gma500/opregion.c ]

I did not touch them because they should be converted to IS_BUILTIN()
in order to keep the logic, but I was not sure it was the authors'
intention.

Link: http://lkml.kernel.org/r/1465215656-20569-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Stas Sergeev <stsp@list.ru>
Cc: Matt Redfearn <matt.redfearn@imgtec.com>
Cc: Joshua Kinard <kumba@gentoo.org>
Cc: Jiri Slaby <jslaby@suse.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Markos Chandras <markos.chandras@imgtec.com>
Cc: "Dmitry V. Levin" <ldv@altlinux.org>
Cc: yu-cheng yu <yu-cheng.yu@intel.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Will Drewry <wad@chromium.org>
Cc: Nikolay Martynov <mar.kolya@gmail.com>
Cc: Huacai Chen <chenhc@lemote.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
Cc: Rafal Milecki <zajec5@gmail.com>
Cc: James Cowgill <James.Cowgill@imgtec.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Alex Smith <alex.smith@imgtec.com>
Cc: Adam Buchbinder <adam.buchbinder@gmail.com>
Cc: Qais Yousef <qais.yousef@imgtec.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: "Luis R. Rodriguez" <mcgrof@do-not-panic.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Kalle Valo <kvalo@qca.qualcomm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Tony Wu <tung7970@gmail.com>
Cc: Huaitong Han <huaitong.han@intel.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Gelmini <andrea.gelmini@gelma.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Rabin Vincent <rabin@rab.in>
Cc: "Maciej W. Rozycki" <macro@imgtec.com>
Cc: David Daney <david.daney@cavium.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/include/asm/mips-cm.h                  |  2 +-
 arch/mips/include/asm/pgtable.h                  | 16 +++++------
 arch/mips/include/asm/seccomp.h                  |  4 +--
 arch/mips/include/asm/signal.h                   |  4 +--
 arch/mips/include/asm/syscall.h                  |  2 +-
 arch/mips/include/asm/uaccess.h                  |  2 +-
 arch/mips/jz4740/setup.c                         |  2 +-
 arch/mips/kernel/cpu-bugs64.c                    |  6 ++---
 arch/mips/kernel/elf.c                           |  4 +--
 arch/mips/kernel/mips-cm.c                       |  2 +-
 arch/mips/kernel/mips-r2-to-r6-emul.c            | 34 ++++++++++++------------
 arch/mips/kernel/pm-cps.c                        |  4 +--
 arch/mips/kernel/signal.c                        | 10 +++----
 arch/mips/kernel/smp-cps.c                       |  4 +--
 arch/mips/kernel/unaligned.c                     | 10 +++----
 arch/mips/math-emu/cp1emu.c                      |  6 ++---
 arch/mips/mm/tlbex.c                             | 10 +++----
 arch/mips/mti-malta/malta-dtshim.c               |  4 +--
 arch/mips/mti-malta/malta-memory.c               |  2 +-
 arch/mips/mti-malta/malta-setup.c                |  2 +-
 arch/mips/net/bpf_jit.c                          |  4 +--
 arch/x86/include/asm/elf.h                       |  4 +--
 arch/x86/include/asm/fpu/internal.h              | 16 +++++------
 arch/x86/include/asm/mmu_context.h               |  2 +-
 arch/x86/kernel/apic/apic.c                      |  2 +-
 arch/x86/kernel/apic/vector.c                    |  2 +-
 arch/x86/kernel/fpu/signal.c                     | 12 ++++-----
 arch/x86/kernel/signal.c                         | 14 +++++-----
 drivers/firmware/broadcom/bcm47xx_sprom.c        |  2 +-
 drivers/irqchip/irq-mips-gic.c                   |  2 +-
 drivers/mtd/bcm47xxpart.c                        |  2 +-
 drivers/net/wireless/ath/ath10k/debug.c          | 12 ++++-----
 drivers/net/wireless/ath/ath10k/mac.c            | 10 +++----
 drivers/net/wireless/ath/ath10k/wmi.c            |  2 +-
 drivers/net/wireless/ath/ath6kl/cfg80211.c       |  2 +-
 drivers/net/wireless/ath/ath9k/common-spectral.c |  6 ++---
 drivers/net/wireless/ath/ath9k/init.c            |  2 +-
 drivers/net/wireless/ath/ath9k/main.c            | 12 ++++-----
 drivers/net/wireless/ath/ath9k/recv.c            |  2 +-
 drivers/net/wireless/ath/dfs_pattern_detector.c  |  2 +-
 drivers/net/wireless/ath/regd.c                  |  4 +--
 drivers/pci/ecam.c                               |  2 +-
 drivers/tty/serial/ar933x_uart.c                 |  4 +--
 include/linux/fence.h                            |  2 +-
 include/linux/ww_mutex.h                         |  4 +--
 kernel/ptrace.c                                  |  4 +--
 kernel/seccomp.c                                 |  6 ++---
 net/wireless/chan.c                              |  2 +-
 48 files changed, 135 insertions(+), 135 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/include/asm/mips-cm.h b/arch/mips/include/asm/mips-cm.h
index 9411a4c0bdad..58e7874e9347 100644
--- a/arch/mips/include/asm/mips-cm.h
+++ b/arch/mips/include/asm/mips-cm.h
@@ -462,7 +462,7 @@ static inline unsigned int mips_cm_max_vp_width(void)
 	if (mips_cm_revision() >= CM_REV_CM3)
 		return read_gcr_sys_config2() & CM_GCR_SYS_CONFIG2_MAXVPW_MSK;
 
-	if (config_enabled(CONFIG_SMP))
+	if (IS_ENABLED(CONFIG_SMP))
 		return smp_num_siblings;
 
 	return 1;
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 7d44e888134f..70128d3f770a 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -159,7 +159,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
 		 * it better already be global)
 		 */
 		if (pte_none(*buddy)) {
-			if (!config_enabled(CONFIG_XPA))
+			if (!IS_ENABLED(CONFIG_XPA))
 				buddy->pte_low |= _PAGE_GLOBAL;
 			buddy->pte_high |= _PAGE_GLOBAL;
 		}
@@ -172,7 +172,7 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt
 
 	htw_stop();
 	/* Preserve global status for the pair */
-	if (config_enabled(CONFIG_XPA)) {
+	if (IS_ENABLED(CONFIG_XPA)) {
 		if (ptep_buddy(ptep)->pte_high & _PAGE_GLOBAL)
 			null.pte_high = _PAGE_GLOBAL;
 	} else {
@@ -319,7 +319,7 @@ static inline int pte_young(pte_t pte)	{ return pte.pte_low & _PAGE_ACCESSED; }
 static inline pte_t pte_wrprotect(pte_t pte)
 {
 	pte.pte_low  &= ~_PAGE_WRITE;
-	if (!config_enabled(CONFIG_XPA))
+	if (!IS_ENABLED(CONFIG_XPA))
 		pte.pte_low &= ~_PAGE_SILENT_WRITE;
 	pte.pte_high &= ~_PAGE_SILENT_WRITE;
 	return pte;
@@ -328,7 +328,7 @@ static inline pte_t pte_wrprotect(pte_t pte)
 static inline pte_t pte_mkclean(pte_t pte)
 {
 	pte.pte_low  &= ~_PAGE_MODIFIED;
-	if (!config_enabled(CONFIG_XPA))
+	if (!IS_ENABLED(CONFIG_XPA))
 		pte.pte_low &= ~_PAGE_SILENT_WRITE;
 	pte.pte_high &= ~_PAGE_SILENT_WRITE;
 	return pte;
@@ -337,7 +337,7 @@ static inline pte_t pte_mkclean(pte_t pte)
 static inline pte_t pte_mkold(pte_t pte)
 {
 	pte.pte_low  &= ~_PAGE_ACCESSED;
-	if (!config_enabled(CONFIG_XPA))
+	if (!IS_ENABLED(CONFIG_XPA))
 		pte.pte_low &= ~_PAGE_SILENT_READ;
 	pte.pte_high &= ~_PAGE_SILENT_READ;
 	return pte;
@@ -347,7 +347,7 @@ static inline pte_t pte_mkwrite(pte_t pte)
 {
 	pte.pte_low |= _PAGE_WRITE;
 	if (pte.pte_low & _PAGE_MODIFIED) {
-		if (!config_enabled(CONFIG_XPA))
+		if (!IS_ENABLED(CONFIG_XPA))
 			pte.pte_low |= _PAGE_SILENT_WRITE;
 		pte.pte_high |= _PAGE_SILENT_WRITE;
 	}
@@ -358,7 +358,7 @@ static inline pte_t pte_mkdirty(pte_t pte)
 {
 	pte.pte_low |= _PAGE_MODIFIED;
 	if (pte.pte_low & _PAGE_WRITE) {
-		if (!config_enabled(CONFIG_XPA))
+		if (!IS_ENABLED(CONFIG_XPA))
 			pte.pte_low |= _PAGE_SILENT_WRITE;
 		pte.pte_high |= _PAGE_SILENT_WRITE;
 	}
@@ -369,7 +369,7 @@ static inline pte_t pte_mkyoung(pte_t pte)
 {
 	pte.pte_low |= _PAGE_ACCESSED;
 	if (!(pte.pte_low & _PAGE_NO_READ)) {
-		if (!config_enabled(CONFIG_XPA))
+		if (!IS_ENABLED(CONFIG_XPA))
 			pte.pte_low |= _PAGE_SILENT_READ;
 		pte.pte_high |= _PAGE_SILENT_READ;
 	}
diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h
index 684fb3a12ed3..d886d6f7687a 100644
--- a/arch/mips/include/asm/seccomp.h
+++ b/arch/mips/include/asm/seccomp.h
@@ -16,10 +16,10 @@ static inline const int *get_compat_mode1_syscalls(void)
 		0, /* null terminated */
 	};
 
-	if (config_enabled(CONFIG_MIPS32_O32) && test_thread_flag(TIF_32BIT_REGS))
+	if (IS_ENABLED(CONFIG_MIPS32_O32) && test_thread_flag(TIF_32BIT_REGS))
 		return syscalls_O32;
 
-	if (config_enabled(CONFIG_MIPS32_N32))
+	if (IS_ENABLED(CONFIG_MIPS32_N32))
 		return syscalls_N32;
 
 	BUG();
diff --git a/arch/mips/include/asm/signal.h b/arch/mips/include/asm/signal.h
index 2292373ff11a..82eae1583bcf 100644
--- a/arch/mips/include/asm/signal.h
+++ b/arch/mips/include/asm/signal.h
@@ -19,8 +19,8 @@ extern struct mips_abi mips_abi_32;
 		((ka)->sa.sa_flags & SA_SIGINFO))
 #else
 #define sig_uses_siginfo(ka, abi)                               \
-	(config_enabled(CONFIG_64BIT) ? 1 :                     \
-		(config_enabled(CONFIG_TRAD_SIGNALS) ?          \
+	(IS_ENABLED(CONFIG_64BIT) ? 1 :                     \
+		(IS_ENABLED(CONFIG_TRAD_SIGNALS) ?          \
 			((ka)->sa.sa_flags & SA_SIGINFO) : 1) )
 #endif
 
diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h
index 47bc45a67e9b..d87882513ee3 100644
--- a/arch/mips/include/asm/syscall.h
+++ b/arch/mips/include/asm/syscall.h
@@ -99,7 +99,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
 {
 	int ret;
 	/* O32 ABI syscall() - Either 64-bit with O32 or 32-bit */
-	if ((config_enabled(CONFIG_32BIT) ||
+	if ((IS_ENABLED(CONFIG_32BIT) ||
 	    test_tsk_thread_flag(task, TIF_32BIT_REGS)) &&
 	    (regs->regs[2] == __NR_syscall))
 		i++;
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index 7f109d4f64a4..11b965f98d95 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -88,7 +88,7 @@ extern u64 __ua_limit;
  */
 static inline bool eva_kernel_access(void)
 {
-	if (!config_enabled(CONFIG_EVA))
+	if (!IS_ENABLED(CONFIG_EVA))
 		return false;
 
 	return segment_eq(get_fs(), get_ds());
diff --git a/arch/mips/jz4740/setup.c b/arch/mips/jz4740/setup.c
index 0914ef775b5f..6d0152321819 100644
--- a/arch/mips/jz4740/setup.c
+++ b/arch/mips/jz4740/setup.c
@@ -75,7 +75,7 @@ void __init device_tree_init(void)
 
 const char *get_system_type(void)
 {
-	if (config_enabled(CONFIG_MACH_JZ4780))
+	if (IS_ENABLED(CONFIG_MACH_JZ4780))
 		return "JZ4780";
 
 	return "JZ4740";
diff --git a/arch/mips/kernel/cpu-bugs64.c b/arch/mips/kernel/cpu-bugs64.c
index 6392dbe504fb..a378e44688f5 100644
--- a/arch/mips/kernel/cpu-bugs64.c
+++ b/arch/mips/kernel/cpu-bugs64.c
@@ -244,7 +244,7 @@ static inline void check_daddi(void)
 	panic(bug64hit, !DADDI_WAR ? daddiwar : nowar);
 }
 
-int daddiu_bug	= config_enabled(CONFIG_CPU_MIPSR6) ? 0 : -1;
+int daddiu_bug	= IS_ENABLED(CONFIG_CPU_MIPSR6) ? 0 : -1;
 
 static inline void check_daddiu(void)
 {
@@ -314,7 +314,7 @@ static inline void check_daddiu(void)
 
 void __init check_bugs64_early(void)
 {
-	if (!config_enabled(CONFIG_CPU_MIPSR6)) {
+	if (!IS_ENABLED(CONFIG_CPU_MIPSR6)) {
 		check_mult_sh();
 		check_daddiu();
 	}
@@ -322,6 +322,6 @@ void __init check_bugs64_early(void)
 
 void __init check_bugs64(void)
 {
-	if (!config_enabled(CONFIG_CPU_MIPSR6))
+	if (!IS_ENABLED(CONFIG_CPU_MIPSR6))
 		check_daddi();
 }
diff --git a/arch/mips/kernel/elf.c b/arch/mips/kernel/elf.c
index 891f5ee63983..e6eb7f1f7723 100644
--- a/arch/mips/kernel/elf.c
+++ b/arch/mips/kernel/elf.c
@@ -179,7 +179,7 @@ int arch_check_elf(void *_ehdr, bool has_interpreter, void *_interp_ehdr,
 			return -ELIBBAD;
 	}
 
-	if (!config_enabled(CONFIG_MIPS_O32_FP64_SUPPORT))
+	if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT))
 		return 0;
 
 	fp_abi = state->fp_abi;
@@ -285,7 +285,7 @@ void mips_set_personality_fp(struct arch_elf_state *state)
 	 * not be worried about N32/N64 binaries.
 	 */
 
-	if (!config_enabled(CONFIG_MIPS_O32_FP64_SUPPORT))
+	if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT))
 		return;
 
 	switch (state->overall_fp_mode) {
diff --git a/arch/mips/kernel/mips-cm.c b/arch/mips/kernel/mips-cm.c
index 760217bbb2fa..659e6d3ae335 100644
--- a/arch/mips/kernel/mips-cm.c
+++ b/arch/mips/kernel/mips-cm.c
@@ -251,7 +251,7 @@ int mips_cm_probe(void)
 	mips_cm_probe_l2sync();
 
 	/* determine register width for this CM */
-	mips_cm_is64 = config_enabled(CONFIG_64BIT) && (mips_cm_revision() >= CM_REV_CM3);
+	mips_cm_is64 = IS_ENABLED(CONFIG_64BIT) && (mips_cm_revision() >= CM_REV_CM3);
 
 	for_each_possible_cpu(cpu)
 		spin_lock_init(&per_cpu(cm_core_lock, cpu));
diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c
index 7ff2a557f4aa..43fbadc78d0a 100644
--- a/arch/mips/kernel/mips-r2-to-r6-emul.c
+++ b/arch/mips/kernel/mips-r2-to-r6-emul.c
@@ -84,7 +84,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir)
 				(s32)MIPSInst_SIMM(ir);
 		return 0;
 	case daddiu_op:
-		if (config_enabled(CONFIG_32BIT))
+		if (IS_ENABLED(CONFIG_32BIT))
 			break;
 
 		if (MIPSInst_RT(ir))
@@ -143,7 +143,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir)
 					      (u32)regs->regs[MIPSInst_RT(ir)]);
 			return 0;
 		case dsll_op:
-			if (config_enabled(CONFIG_32BIT) || MIPSInst_RS(ir))
+			if (IS_ENABLED(CONFIG_32BIT) || MIPSInst_RS(ir))
 				break;
 
 			if (MIPSInst_RD(ir))
@@ -152,7 +152,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir)
 						MIPSInst_FD(ir));
 			return 0;
 		case dsrl_op:
-			if (config_enabled(CONFIG_32BIT) || MIPSInst_RS(ir))
+			if (IS_ENABLED(CONFIG_32BIT) || MIPSInst_RS(ir))
 				break;
 
 			if (MIPSInst_RD(ir))
@@ -161,7 +161,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir)
 						MIPSInst_FD(ir));
 			return 0;
 		case daddu_op:
-			if (config_enabled(CONFIG_32BIT) || MIPSInst_FD(ir))
+			if (IS_ENABLED(CONFIG_32BIT) || MIPSInst_FD(ir))
 				break;
 
 			if (MIPSInst_RD(ir))
@@ -170,7 +170,7 @@ static inline int mipsr6_emul(struct pt_regs *regs, u32 ir)
 					(u64)regs->regs[MIPSInst_RT(ir)];
 			return 0;
 		case dsubu_op:
-			if (config_enabled(CONFIG_32BIT) || MIPSInst_FD(ir))
+			if (IS_ENABLED(CONFIG_32BIT) || MIPSInst_FD(ir))
 				break;
 
 			if (MIPSInst_RD(ir))
@@ -498,7 +498,7 @@ static int dmult_func(struct pt_regs *regs, u32 ir)
 	s64 res;
 	s64 rt, rs;
 
-	if (config_enabled(CONFIG_32BIT))
+	if (IS_ENABLED(CONFIG_32BIT))
 		return SIGILL;
 
 	rt = regs->regs[MIPSInst_RT(ir)];
@@ -530,7 +530,7 @@ static int dmultu_func(struct pt_regs *regs, u32 ir)
 	u64 res;
 	u64 rt, rs;
 
-	if (config_enabled(CONFIG_32BIT))
+	if (IS_ENABLED(CONFIG_32BIT))
 		return SIGILL;
 
 	rt = regs->regs[MIPSInst_RT(ir)];
@@ -561,7 +561,7 @@ static int ddiv_func(struct pt_regs *regs, u32 ir)
 {
 	s64 rt, rs;
 
-	if (config_enabled(CONFIG_32BIT))
+	if (IS_ENABLED(CONFIG_32BIT))
 		return SIGILL;
 
 	rt = regs->regs[MIPSInst_RT(ir)];
@@ -586,7 +586,7 @@ static int ddivu_func(struct pt_regs *regs, u32 ir)
 {
 	u64 rt, rs;
 
-	if (config_enabled(CONFIG_32BIT))
+	if (IS_ENABLED(CONFIG_32BIT))
 		return SIGILL;
 
 	rt = regs->regs[MIPSInst_RT(ir)];
@@ -825,7 +825,7 @@ static int dclz_func(struct pt_regs *regs, u32 ir)
 	u64 res;
 	u64 rs;
 
-	if (config_enabled(CONFIG_32BIT))
+	if (IS_ENABLED(CONFIG_32BIT))
 		return SIGILL;
 
 	if (!MIPSInst_RD(ir))
@@ -852,7 +852,7 @@ static int dclo_func(struct pt_regs *regs, u32 ir)
 	u64 res;
 	u64 rs;
 
-	if (config_enabled(CONFIG_32BIT))
+	if (IS_ENABLED(CONFIG_32BIT))
 		return SIGILL;
 
 	if (!MIPSInst_RD(ir))
@@ -1484,7 +1484,7 @@ fpu_emul:
 		break;
 
 	case ldl_op:
-		if (config_enabled(CONFIG_32BIT)) {
+		if (IS_ENABLED(CONFIG_32BIT)) {
 		    err = SIGILL;
 		    break;
 		}
@@ -1603,7 +1603,7 @@ fpu_emul:
 		break;
 
 	case ldr_op:
-		if (config_enabled(CONFIG_32BIT)) {
+		if (IS_ENABLED(CONFIG_32BIT)) {
 		    err = SIGILL;
 		    break;
 		}
@@ -1722,7 +1722,7 @@ fpu_emul:
 		break;
 
 	case sdl_op:
-		if (config_enabled(CONFIG_32BIT)) {
+		if (IS_ENABLED(CONFIG_32BIT)) {
 		    err = SIGILL;
 		    break;
 		}
@@ -1840,7 +1840,7 @@ fpu_emul:
 		break;
 
 	case sdr_op:
-		if (config_enabled(CONFIG_32BIT)) {
+		if (IS_ENABLED(CONFIG_32BIT)) {
 		    err = SIGILL;
 		    break;
 		}
@@ -2072,7 +2072,7 @@ fpu_emul:
 		break;
 
 	case lld_op:
-		if (config_enabled(CONFIG_32BIT)) {
+		if (IS_ENABLED(CONFIG_32BIT)) {
 		    err = SIGILL;
 		    break;
 		}
@@ -2133,7 +2133,7 @@ fpu_emul:
 		break;
 
 	case scd_op:
-		if (config_enabled(CONFIG_32BIT)) {
+		if (IS_ENABLED(CONFIG_32BIT)) {
 		    err = SIGILL;
 		    break;
 		}
diff --git a/arch/mips/kernel/pm-cps.c b/arch/mips/kernel/pm-cps.c
index adda3ffb9b78..5b31a9405ebc 100644
--- a/arch/mips/kernel/pm-cps.c
+++ b/arch/mips/kernel/pm-cps.c
@@ -148,7 +148,7 @@ int cps_pm_enter_state(enum cps_pm_state state)
 	}
 
 	/* Setup the VPE to run mips_cps_pm_restore when started again */
-	if (config_enabled(CONFIG_CPU_PM) && state == CPS_PM_POWER_GATED) {
+	if (IS_ENABLED(CONFIG_CPU_PM) && state == CPS_PM_POWER_GATED) {
 		/* Power gating relies upon CPS SMP */
 		if (!mips_cps_smp_in_use())
 			return -EINVAL;
@@ -387,7 +387,7 @@ static void * __init cps_gen_entry_code(unsigned cpu, enum cps_pm_state state)
 	memset(labels, 0, sizeof(labels));
 	memset(relocs, 0, sizeof(relocs));
 
-	if (config_enabled(CONFIG_CPU_PM) && state == CPS_PM_POWER_GATED) {
+	if (IS_ENABLED(CONFIG_CPU_PM) && state == CPS_PM_POWER_GATED) {
 		/* Power gating relies upon CPS SMP */
 		if (!mips_cps_smp_in_use())
 			goto out_err;
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index ae4231452115..1975cd2f7de6 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -165,7 +165,7 @@ static int save_msa_extcontext(void __user *buf)
 		 * should already have been done when handling scalar FP
 		 * context.
 		 */
-		BUG_ON(config_enabled(CONFIG_EVA));
+		BUG_ON(IS_ENABLED(CONFIG_EVA));
 
 		err = __put_user(read_msa_csr(), &msa->csr);
 		err |= _save_msa_all_upper(&msa->wr);
@@ -195,7 +195,7 @@ static int restore_msa_extcontext(void __user *buf, unsigned int size)
 	unsigned int csr;
 	int i, err;
 
-	if (!config_enabled(CONFIG_CPU_HAS_MSA))
+	if (!IS_ENABLED(CONFIG_CPU_HAS_MSA))
 		return SIGSYS;
 
 	if (size != sizeof(*msa))
@@ -215,7 +215,7 @@ static int restore_msa_extcontext(void __user *buf, unsigned int size)
 		 * scalar FP context, so FPU & MSA should have already been
 		 * disabled whilst handling scalar FP context.
 		 */
-		BUG_ON(config_enabled(CONFIG_EVA));
+		BUG_ON(IS_ENABLED(CONFIG_EVA));
 
 		write_msa_csr(csr);
 		err |= _restore_msa_all_upper(&msa->wr);
@@ -315,7 +315,7 @@ int protected_save_fp_context(void __user *sc)
 	 * EVA does not have userland equivalents of ldc1 or sdc1, so
 	 * save to the kernel FP context & copy that to userland below.
 	 */
-	if (config_enabled(CONFIG_EVA))
+	if (IS_ENABLED(CONFIG_EVA))
 		lose_fpu(1);
 
 	while (1) {
@@ -378,7 +378,7 @@ int protected_restore_fp_context(void __user *sc)
 	 * disable the FPU here such that the code below simply copies to
 	 * the kernel FP context.
 	 */
-	if (config_enabled(CONFIG_EVA))
+	if (IS_ENABLED(CONFIG_EVA))
 		lose_fpu(0);
 
 	while (1) {
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index 4ed36f288d64..05b3201271b4 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -46,8 +46,8 @@ static unsigned core_vpe_count(unsigned core)
 	if (threads_disabled)
 		return 1;
 
-	if ((!config_enabled(CONFIG_MIPS_MT_SMP) || !cpu_has_mipsmt)
-		&& (!config_enabled(CONFIG_CPU_MIPSR6) || !cpu_has_vp))
+	if ((!IS_ENABLED(CONFIG_MIPS_MT_SMP) || !cpu_has_mipsmt)
+		&& (!IS_ENABLED(CONFIG_CPU_MIPSR6) || !cpu_has_vp))
 		return 1;
 
 	mips_cm_lock_other(core, 0);
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
index 28b3af73a17b..f1c308dbbc4a 100644
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -1025,7 +1025,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		if (!access_ok(VERIFY_READ, addr, 2))
 			goto sigbus;
 
-		if (config_enabled(CONFIG_EVA)) {
+		if (IS_ENABLED(CONFIG_EVA)) {
 			if (segment_eq(get_fs(), get_ds()))
 				LoadHW(addr, value, res);
 			else
@@ -1044,7 +1044,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		if (!access_ok(VERIFY_READ, addr, 4))
 			goto sigbus;
 
-		if (config_enabled(CONFIG_EVA)) {
+		if (IS_ENABLED(CONFIG_EVA)) {
 			if (segment_eq(get_fs(), get_ds()))
 				LoadW(addr, value, res);
 			else
@@ -1063,7 +1063,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		if (!access_ok(VERIFY_READ, addr, 2))
 			goto sigbus;
 
-		if (config_enabled(CONFIG_EVA)) {
+		if (IS_ENABLED(CONFIG_EVA)) {
 			if (segment_eq(get_fs(), get_ds()))
 				LoadHWU(addr, value, res);
 			else
@@ -1131,7 +1131,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		compute_return_epc(regs);
 		value = regs->regs[insn.i_format.rt];
 
-		if (config_enabled(CONFIG_EVA)) {
+		if (IS_ENABLED(CONFIG_EVA)) {
 			if (segment_eq(get_fs(), get_ds()))
 				StoreHW(addr, value, res);
 			else
@@ -1151,7 +1151,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		compute_return_epc(regs);
 		value = regs->regs[insn.i_format.rt];
 
-		if (config_enabled(CONFIG_EVA)) {
+		if (IS_ENABLED(CONFIG_EVA)) {
 			if (segment_eq(get_fs(), get_ds()))
 				StoreW(addr, value, res);
 			else
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index 6dc07fba187f..92d15e68abb6 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -784,10 +784,10 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
  */
 static inline int cop1_64bit(struct pt_regs *xcp)
 {
-	if (config_enabled(CONFIG_64BIT) && !config_enabled(CONFIG_MIPS32_O32))
+	if (IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_MIPS32_O32))
 		return 1;
-	else if (config_enabled(CONFIG_32BIT) &&
-		 !config_enabled(CONFIG_MIPS_O32_FP64_SUPPORT))
+	else if (IS_ENABLED(CONFIG_32BIT) &&
+		 !IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT))
 		return 0;
 
 	return !test_thread_flag(TIF_32BIT_FPREGS);
diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 4004b659ce50..ff49b29c2d16 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -1025,7 +1025,7 @@ static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep)
 	pte_off_odd += offsetof(pte_t, pte_high);
 #endif
 
-	if (config_enabled(CONFIG_XPA)) {
+	if (IS_ENABLED(CONFIG_XPA)) {
 		uasm_i_lw(p, tmp, pte_off_even, ptep); /* even pte */
 		UASM_i_ROTR(p, tmp, tmp, ilog2(_PAGE_GLOBAL));
 		UASM_i_MTC0(p, tmp, C0_ENTRYLO0);
@@ -1643,7 +1643,7 @@ iPTE_SW(u32 **p, struct uasm_reloc **r, unsigned int pte, unsigned int ptr,
 	unsigned int hwmode = mode & (_PAGE_VALID | _PAGE_DIRTY);
 	unsigned int swmode = mode & ~hwmode;
 
-	if (config_enabled(CONFIG_XPA) && !cpu_has_64bits) {
+	if (IS_ENABLED(CONFIG_XPA) && !cpu_has_64bits) {
 		uasm_i_lui(p, scratch, swmode >> 16);
 		uasm_i_or(p, pte, pte, scratch);
 		BUG_ON(swmode & 0xffff);
@@ -2432,7 +2432,7 @@ static void config_htw_params(void)
 		pwsize |= ilog2(PTRS_PER_PMD) << MIPS_PWSIZE_MDW_SHIFT;
 
 	/* Set pointer size to size of directory pointers */
-	if (config_enabled(CONFIG_64BIT))
+	if (IS_ENABLED(CONFIG_64BIT))
 		pwsize |= MIPS_PWSIZE_PS_MASK;
 	/* PTEs may be multiple pointers long (e.g. with XPA) */
 	pwsize |= ((PTE_T_LOG2 - PGD_T_LOG2) << MIPS_PWSIZE_PTEW_SHIFT)
@@ -2448,7 +2448,7 @@ static void config_htw_params(void)
 	 * the pwctl fields.
 	 */
 	config = 1 << MIPS_PWCTL_PWEN_SHIFT;
-	if (config_enabled(CONFIG_64BIT))
+	if (IS_ENABLED(CONFIG_64BIT))
 		config |= MIPS_PWCTL_XU_MASK;
 	write_c0_pwctl(config);
 	pr_info("Hardware Page Table Walker enabled\n");
@@ -2522,7 +2522,7 @@ void build_tlb_refill_handler(void)
 	 */
 	static int run_once = 0;
 
-	if (config_enabled(CONFIG_XPA) && !cpu_has_rixi)
+	if (IS_ENABLED(CONFIG_XPA) && !cpu_has_rixi)
 		panic("Kernels supporting XPA currently require CPUs with RIXI");
 
 	output_pgtable_bits_defines();
diff --git a/arch/mips/mti-malta/malta-dtshim.c b/arch/mips/mti-malta/malta-dtshim.c
index f7133efc5843..151f4882ec8a 100644
--- a/arch/mips/mti-malta/malta-dtshim.c
+++ b/arch/mips/mti-malta/malta-dtshim.c
@@ -31,7 +31,7 @@ static unsigned __init gen_fdt_mem_array(__be32 *mem_array, unsigned long size)
 
 	entries = 1;
 	mem_array[0] = cpu_to_be32(PHYS_OFFSET);
-	if (config_enabled(CONFIG_EVA)) {
+	if (IS_ENABLED(CONFIG_EVA)) {
 		/*
 		 * The current Malta EVA configuration is "special" in that it
 		 * always makes use of addresses in the upper half of the 32 bit
@@ -82,7 +82,7 @@ static void __init append_memory(void *fdt, int root_off)
 		physical_memsize = 32 << 20;
 	}
 
-	if (config_enabled(CONFIG_CPU_BIG_ENDIAN)) {
+	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) {
 		/*
 		 * SOC-it swaps, or perhaps doesn't swap, when DMA'ing
 		 * the last word of physical memory.
diff --git a/arch/mips/mti-malta/malta-memory.c b/arch/mips/mti-malta/malta-memory.c
index d5f8dae6a797..a47556723b85 100644
--- a/arch/mips/mti-malta/malta-memory.c
+++ b/arch/mips/mti-malta/malta-memory.c
@@ -32,7 +32,7 @@ static void free_init_pages_eva_malta(void *begin, void *end)
 
 void __init fw_meminit(void)
 {
-	bool eva = config_enabled(CONFIG_EVA);
+	bool eva = IS_ENABLED(CONFIG_EVA);
 
 	free_init_pages_eva = eva ? free_init_pages_eva_malta : NULL;
 }
diff --git a/arch/mips/mti-malta/malta-setup.c b/arch/mips/mti-malta/malta-setup.c
index 33d5ff5069e5..ec5b21678fad 100644
--- a/arch/mips/mti-malta/malta-setup.c
+++ b/arch/mips/mti-malta/malta-setup.c
@@ -261,7 +261,7 @@ void __init plat_mem_setup(void)
 	fdt = malta_dt_shim(fdt);
 	__dt_setup_arch(fdt);
 
-	if (config_enabled(CONFIG_EVA))
+	if (IS_ENABLED(CONFIG_EVA))
 		/* EVA has already been configured in mach-malta/kernel-init.h */
 		pr_info("Enhanced Virtual Addressing (EVA) activated\n");
 
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
index 1a8c96035716..d1b7bd09253a 100644
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -426,7 +426,7 @@ static inline void emit_load_ptr(unsigned int dst, unsigned int src,
 static inline void emit_load_func(unsigned int reg, ptr imm,
 				  struct jit_ctx *ctx)
 {
-	if (config_enabled(CONFIG_64BIT)) {
+	if (IS_ENABLED(CONFIG_64BIT)) {
 		/* At this point imm is always 64-bit */
 		emit_load_imm(r_tmp, (u64)imm >> 32, ctx);
 		emit_dsll(r_tmp_imm, r_tmp, 16, ctx); /* left shift by 16 */
@@ -516,7 +516,7 @@ static inline void emit_jr(unsigned int reg, struct jit_ctx *ctx)
 static inline u16 align_sp(unsigned int num)
 {
 	/* Double word alignment for 32-bit, quadword for 64-bit */
-	unsigned int align = config_enabled(CONFIG_64BIT) ? 16 : 8;
+	unsigned int align = IS_ENABLED(CONFIG_64BIT) ? 16 : 8;
 	num = (num + (align - 1)) & -align;
 	return num;
 }
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index fea7724141a0..e7f155c3045e 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -344,8 +344,8 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
  */
 static inline int mmap_is_ia32(void)
 {
-	return config_enabled(CONFIG_X86_32) ||
-	       (config_enabled(CONFIG_COMPAT) &&
+	return IS_ENABLED(CONFIG_X86_32) ||
+	       (IS_ENABLED(CONFIG_COMPAT) &&
 		test_thread_flag(TIF_ADDR32));
 }
 
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 116b58347501..2737366ea583 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -137,9 +137,9 @@ static inline int copy_fregs_to_user(struct fregs_state __user *fx)
 
 static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
 {
-	if (config_enabled(CONFIG_X86_32))
+	if (IS_ENABLED(CONFIG_X86_32))
 		return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+	else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
 		return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
 
 	/* See comment in copy_fxregs_to_kernel() below. */
@@ -150,10 +150,10 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
 {
 	int err;
 
-	if (config_enabled(CONFIG_X86_32)) {
+	if (IS_ENABLED(CONFIG_X86_32)) {
 		err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
 	} else {
-		if (config_enabled(CONFIG_AS_FXSAVEQ)) {
+		if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) {
 			err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
 		} else {
 			/* See comment in copy_fxregs_to_kernel() below. */
@@ -166,9 +166,9 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
 
 static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
 {
-	if (config_enabled(CONFIG_X86_32))
+	if (IS_ENABLED(CONFIG_X86_32))
 		return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+	else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
 		return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
 
 	/* See comment in copy_fxregs_to_kernel() below. */
@@ -190,9 +190,9 @@ static inline int copy_user_to_fregs(struct fregs_state __user *fx)
 
 static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 {
-	if (config_enabled(CONFIG_X86_32))
+	if (IS_ENABLED(CONFIG_X86_32))
 		asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+	else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
 		asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
 	else {
 		/* Using "rex64; fxsave %0" is broken because, if the memory
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 396348196aa7..d8abfcf524d1 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -155,7 +155,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm)
 #ifdef CONFIG_X86_64
 static inline bool is_64bit_mm(struct mm_struct *mm)
 {
-	return	!config_enabled(CONFIG_IA32_EMULATION) ||
+	return	!IS_ENABLED(CONFIG_IA32_EMULATION) ||
 		!(mm->context.ia32_compat == TIF_IA32);
 }
 #else
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7943d38c57ca..20abd912f0e4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -147,7 +147,7 @@ static int force_enable_local_apic __initdata;
  */
 static int __init parse_lapic(char *arg)
 {
-	if (config_enabled(CONFIG_X86_32) && !arg)
+	if (IS_ENABLED(CONFIG_X86_32) && !arg)
 		force_enable_local_apic = 1;
 	else if (arg && !strncmp(arg, "notscdeadline", 13))
 		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index a5e400afc563..6066d945c40e 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -523,7 +523,7 @@ static int apic_set_affinity(struct irq_data *irq_data,
 	struct apic_chip_data *data = irq_data->chip_data;
 	int err, irq = irq_data->irq;
 
-	if (!config_enabled(CONFIG_SMP))
+	if (!IS_ENABLED(CONFIG_SMP))
 		return -EPERM;
 
 	if (!cpumask_intersects(dest, cpu_online_mask))
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 9e231d88bb33..a184c210efba 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -159,8 +159,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 	struct task_struct *tsk = current;
 	int ia32_fxstate = (buf != buf_fx);
 
-	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
-			 config_enabled(CONFIG_IA32_EMULATION));
+	ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+			 IS_ENABLED(CONFIG_IA32_EMULATION));
 
 	if (!access_ok(VERIFY_WRITE, buf, size))
 		return -EACCES;
@@ -268,8 +268,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 	u64 xfeatures = 0;
 	int fx_only = 0;
 
-	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
-			 config_enabled(CONFIG_IA32_EMULATION));
+	ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+			 IS_ENABLED(CONFIG_IA32_EMULATION));
 
 	if (!buf) {
 		fpu__clear(fpu);
@@ -416,8 +416,8 @@ void fpu__init_prepare_fx_sw_frame(void)
 	fx_sw_reserved.xfeatures = xfeatures_mask;
 	fx_sw_reserved.xstate_size = fpu_user_xstate_size;
 
-	if (config_enabled(CONFIG_IA32_EMULATION) ||
-	    config_enabled(CONFIG_X86_32)) {
+	if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
+	    IS_ENABLED(CONFIG_X86_32)) {
 		int fsave_header_size = sizeof(struct fregs_state);
 
 		fx_sw_reserved_ia32 = fx_sw_reserved;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 22cc2f9f8aec..99f285b512db 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -146,7 +146,7 @@ static int restore_sigcontext(struct pt_regs *regs,
 		buf = (void __user *)buf_val;
 	} get_user_catch(err);
 
-	err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32));
+	err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
 
 	force_iret();
 
@@ -245,14 +245,14 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 	struct fpu *fpu = &current->thread.fpu;
 
 	/* redzone */
-	if (config_enabled(CONFIG_X86_64))
+	if (IS_ENABLED(CONFIG_X86_64))
 		sp -= 128;
 
 	/* This is the X/Open sanctioned signal stack switching.  */
 	if (ka->sa.sa_flags & SA_ONSTACK) {
 		if (sas_ss_flags(sp) == 0)
 			sp = current->sas_ss_sp + current->sas_ss_size;
-	} else if (config_enabled(CONFIG_X86_32) &&
+	} else if (IS_ENABLED(CONFIG_X86_32) &&
 		   !onsigstack &&
 		   (regs->ss & 0xffff) != __USER_DS &&
 		   !(ka->sa.sa_flags & SA_RESTORER) &&
@@ -262,7 +262,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 	}
 
 	if (fpu->fpstate_active) {
-		sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
+		sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
 					  &buf_fx, &math_size);
 		*fpstate = (void __user *)sp;
 	}
@@ -662,18 +662,18 @@ badframe:
 
 static inline int is_ia32_compat_frame(void)
 {
-	return config_enabled(CONFIG_IA32_EMULATION) &&
+	return IS_ENABLED(CONFIG_IA32_EMULATION) &&
 	       test_thread_flag(TIF_IA32);
 }
 
 static inline int is_ia32_frame(void)
 {
-	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
+	return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame();
 }
 
 static inline int is_x32_frame(void)
 {
-	return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+	return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
 }
 
 static int
diff --git a/drivers/firmware/broadcom/bcm47xx_sprom.c b/drivers/firmware/broadcom/bcm47xx_sprom.c
index b6eb875d4af3..62aa3cf09b4d 100644
--- a/drivers/firmware/broadcom/bcm47xx_sprom.c
+++ b/drivers/firmware/broadcom/bcm47xx_sprom.c
@@ -669,7 +669,7 @@ static int bcm47xx_get_sprom_bcma(struct bcma_bus *bus, struct ssb_sprom *out)
 	case BCMA_HOSTTYPE_PCI:
 		memset(out, 0, sizeof(struct ssb_sprom));
 		/* On BCM47XX all PCI buses share the same domain */
-		if (config_enabled(CONFIG_BCM47XX))
+		if (IS_ENABLED(CONFIG_BCM47XX))
 			snprintf(buf, sizeof(buf), "pci/%u/%u/",
 				 bus->host_pci->bus->number + 1,
 				 PCI_SLOT(bus->host_pci->devfn));
diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index 3786d0f21972..c5f33c3bd228 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -359,7 +359,7 @@ static void gic_handle_shared_int(bool chained)
 		pending_reg += gic_reg_step;
 		intrmask_reg += gic_reg_step;
 
-		if (!config_enabled(CONFIG_64BIT) || mips_cm_is64)
+		if (!IS_ENABLED(CONFIG_64BIT) || mips_cm_is64)
 			continue;
 
 		pending[i] |= (u64)gic_read(pending_reg) << 32;
diff --git a/drivers/mtd/bcm47xxpart.c b/drivers/mtd/bcm47xxpart.c
index 845dd27d9f41..377947580203 100644
--- a/drivers/mtd/bcm47xxpart.c
+++ b/drivers/mtd/bcm47xxpart.c
@@ -122,7 +122,7 @@ static int bcm47xxpart_parse(struct mtd_info *master,
 	for (offset = 0; offset <= master->size - blocksize;
 	     offset += blocksize) {
 		/* Nothing more in higher memory on BCM47XX (MIPS) */
-		if (config_enabled(CONFIG_BCM47XX) && offset >= 0x2000000)
+		if (IS_ENABLED(CONFIG_BCM47XX) && offset >= 0x2000000)
 			break;
 
 		if (curr_part >= BCM47XXPART_MAX_PARTS) {
diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c
index 355e1ae665f9..8f0fd41dfd4b 100644
--- a/drivers/net/wireless/ath/ath10k/debug.c
+++ b/drivers/net/wireless/ath/ath10k/debug.c
@@ -139,11 +139,11 @@ void ath10k_debug_print_hwfw_info(struct ath10k *ar)
 		    ar->id.subsystem_vendor, ar->id.subsystem_device);
 
 	ath10k_info(ar, "kconfig debug %d debugfs %d tracing %d dfs %d testmode %d\n",
-		    config_enabled(CONFIG_ATH10K_DEBUG),
-		    config_enabled(CONFIG_ATH10K_DEBUGFS),
-		    config_enabled(CONFIG_ATH10K_TRACING),
-		    config_enabled(CONFIG_ATH10K_DFS_CERTIFIED),
-		    config_enabled(CONFIG_NL80211_TESTMODE));
+		    IS_ENABLED(CONFIG_ATH10K_DEBUG),
+		    IS_ENABLED(CONFIG_ATH10K_DEBUGFS),
+		    IS_ENABLED(CONFIG_ATH10K_TRACING),
+		    IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED),
+		    IS_ENABLED(CONFIG_NL80211_TESTMODE));
 
 	firmware = ar->normal_mode_fw.fw_file.firmware;
 	if (firmware)
@@ -2424,7 +2424,7 @@ int ath10k_debug_register(struct ath10k *ar)
 	debugfs_create_file("nf_cal_period", S_IRUSR | S_IWUSR,
 			    ar->debug.debugfs_phy, ar, &fops_nf_cal_period);
 
-	if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED)) {
+	if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED)) {
 		debugfs_create_file("dfs_simulate_radar", S_IWUSR,
 				    ar->debug.debugfs_phy, ar,
 				    &fops_simulate_radar);
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index fb8e38df9446..0bbd0a00edcc 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -3039,7 +3039,7 @@ static void ath10k_regd_update(struct ath10k *ar)
 
 	regpair = ar->ath_common.regulatory.regpair;
 
-	if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) {
+	if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) {
 		nl_dfs_reg = ar->dfs_detector->region;
 		wmi_dfs_reg = ath10k_mac_get_dfs_region(nl_dfs_reg);
 	} else {
@@ -3068,7 +3068,7 @@ static void ath10k_reg_notifier(struct wiphy *wiphy,
 
 	ath_reg_notifier_apply(wiphy, request, &ar->ath_common.regulatory);
 
-	if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) {
+	if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector) {
 		ath10k_dbg(ar, ATH10K_DBG_REGULATORY, "dfs region 0x%x\n",
 			   request->dfs_region);
 		result = ar->dfs_detector->set_dfs_domain(ar->dfs_detector,
@@ -7955,7 +7955,7 @@ int ath10k_mac_register(struct ath10k *ar)
 	if (!test_bit(ATH10K_FLAG_RAW_MODE, &ar->dev_flags))
 		ar->hw->netdev_features = NETIF_F_HW_CSUM;
 
-	if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED)) {
+	if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED)) {
 		/* Init ath dfs pattern detector */
 		ar->ath_common.debug_mask = ATH_DBG_DFS;
 		ar->dfs_detector = dfs_pattern_detector_init(&ar->ath_common,
@@ -8003,7 +8003,7 @@ err_unregister:
 	ieee80211_unregister_hw(ar->hw);
 
 err_dfs_detector_exit:
-	if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector)
+	if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector)
 		ar->dfs_detector->exit(ar->dfs_detector);
 
 err_free:
@@ -8018,7 +8018,7 @@ void ath10k_mac_unregister(struct ath10k *ar)
 {
 	ieee80211_unregister_hw(ar->hw);
 
-	if (config_enabled(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector)
+	if (IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED) && ar->dfs_detector)
 		ar->dfs_detector->exit(ar->dfs_detector);
 
 	kfree(ar->mac.sbands[NL80211_BAND_2GHZ].channels);
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index 169cd2e783eb..d2462886b75c 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -3704,7 +3704,7 @@ void ath10k_wmi_event_dfs(struct ath10k *ar,
 		   phyerr->tsf_timestamp, tsf, buf_len);
 
 	/* Skip event if DFS disabled */
-	if (!config_enabled(CONFIG_ATH10K_DFS_CERTIFIED))
+	if (!IS_ENABLED(CONFIG_ATH10K_DFS_CERTIFIED))
 		return;
 
 	ATH10K_DFS_STAT_INC(ar, pulses_total);
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 4ad6284fc37d..72e2ec67768d 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -3881,7 +3881,7 @@ int ath6kl_cfg80211_init(struct ath6kl *ar)
 					  BIT(NL80211_IFTYPE_P2P_CLIENT);
 	}
 
-	if (config_enabled(CONFIG_ATH6KL_REGDOMAIN) &&
+	if (IS_ENABLED(CONFIG_ATH6KL_REGDOMAIN) &&
 	    test_bit(ATH6KL_FW_CAPABILITY_REGDOMAIN, ar->fw_capabilities)) {
 		wiphy->reg_notifier = ath6kl_cfg80211_reg_notify;
 		ar->wiphy->features |= NL80211_FEATURE_CELL_BASE_REG_HINTS;
diff --git a/drivers/net/wireless/ath/ath9k/common-spectral.c b/drivers/net/wireless/ath/ath9k/common-spectral.c
index a8762711ad74..e2512d5bc0e1 100644
--- a/drivers/net/wireless/ath/ath9k/common-spectral.c
+++ b/drivers/net/wireless/ath/ath9k/common-spectral.c
@@ -731,7 +731,7 @@ void ath9k_cmn_spectral_scan_trigger(struct ath_common *common,
 	struct ath_hw *ah = spec_priv->ah;
 	u32 rxfilter;
 
-	if (config_enabled(CONFIG_ATH9K_TX99))
+	if (IS_ENABLED(CONFIG_ATH9K_TX99))
 		return;
 
 	if (!ath9k_hw_ops(ah)->spectral_scan_trigger) {
@@ -806,7 +806,7 @@ static ssize_t write_file_spec_scan_ctl(struct file *file,
 	char buf[32];
 	ssize_t len;
 
-	if (config_enabled(CONFIG_ATH9K_TX99))
+	if (IS_ENABLED(CONFIG_ATH9K_TX99))
 		return -EOPNOTSUPP;
 
 	len = min(count, sizeof(buf) - 1);
@@ -1072,7 +1072,7 @@ static struct rchan_callbacks rfs_spec_scan_cb = {
 
 void ath9k_cmn_spectral_deinit_debug(struct ath_spec_scan_priv *spec_priv)
 {
-	if (config_enabled(CONFIG_ATH9K_DEBUGFS)) {
+	if (IS_ENABLED(CONFIG_ATH9K_DEBUGFS)) {
 		relay_close(spec_priv->rfs_chan_spec_scan);
 		spec_priv->rfs_chan_spec_scan = NULL;
 	}
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index edc74fca60aa..cfa3fe82ade3 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -843,7 +843,7 @@ static void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw)
 			       NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE |
 			       NL80211_FEATURE_P2P_GO_CTWIN;
 
-	if (!config_enabled(CONFIG_ATH9K_TX99)) {
+	if (!IS_ENABLED(CONFIG_ATH9K_TX99)) {
 		hw->wiphy->interface_modes =
 			BIT(NL80211_IFTYPE_P2P_GO) |
 			BIT(NL80211_IFTYPE_P2P_CLIENT) |
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 7594650f214f..a394622c9022 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1250,7 +1250,7 @@ static int ath9k_add_interface(struct ieee80211_hw *hw,
 
 	mutex_lock(&sc->mutex);
 
-	if (config_enabled(CONFIG_ATH9K_TX99)) {
+	if (IS_ENABLED(CONFIG_ATH9K_TX99)) {
 		if (sc->cur_chan->nvifs >= 1) {
 			mutex_unlock(&sc->mutex);
 			return -EOPNOTSUPP;
@@ -1300,7 +1300,7 @@ static int ath9k_change_interface(struct ieee80211_hw *hw,
 
 	mutex_lock(&sc->mutex);
 
-	if (config_enabled(CONFIG_ATH9K_TX99)) {
+	if (IS_ENABLED(CONFIG_ATH9K_TX99)) {
 		mutex_unlock(&sc->mutex);
 		return -EOPNOTSUPP;
 	}
@@ -1360,7 +1360,7 @@ static void ath9k_enable_ps(struct ath_softc *sc)
 	struct ath_hw *ah = sc->sc_ah;
 	struct ath_common *common = ath9k_hw_common(ah);
 
-	if (config_enabled(CONFIG_ATH9K_TX99))
+	if (IS_ENABLED(CONFIG_ATH9K_TX99))
 		return;
 
 	sc->ps_enabled = true;
@@ -1379,7 +1379,7 @@ static void ath9k_disable_ps(struct ath_softc *sc)
 	struct ath_hw *ah = sc->sc_ah;
 	struct ath_common *common = ath9k_hw_common(ah);
 
-	if (config_enabled(CONFIG_ATH9K_TX99))
+	if (IS_ENABLED(CONFIG_ATH9K_TX99))
 		return;
 
 	sc->ps_enabled = false;
@@ -1953,7 +1953,7 @@ static int ath9k_get_survey(struct ieee80211_hw *hw, int idx,
 	struct ieee80211_channel *chan;
 	int pos;
 
-	if (config_enabled(CONFIG_ATH9K_TX99))
+	if (IS_ENABLED(CONFIG_ATH9K_TX99))
 		return -EOPNOTSUPP;
 
 	spin_lock_bh(&common->cc_lock);
@@ -2003,7 +2003,7 @@ static void ath9k_set_coverage_class(struct ieee80211_hw *hw,
 	struct ath_softc *sc = hw->priv;
 	struct ath_hw *ah = sc->sc_ah;
 
-	if (config_enabled(CONFIG_ATH9K_TX99))
+	if (IS_ENABLED(CONFIG_ATH9K_TX99))
 		return;
 
 	mutex_lock(&sc->mutex);
diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c
index 32160fca876a..669734252664 100644
--- a/drivers/net/wireless/ath/ath9k/recv.c
+++ b/drivers/net/wireless/ath/ath9k/recv.c
@@ -377,7 +377,7 @@ u32 ath_calcrxfilter(struct ath_softc *sc)
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
 	u32 rfilt;
 
-	if (config_enabled(CONFIG_ATH9K_TX99))
+	if (IS_ENABLED(CONFIG_ATH9K_TX99))
 		return 0;
 
 	rfilt = ATH9K_RX_FILTER_UCAST | ATH9K_RX_FILTER_BCAST
diff --git a/drivers/net/wireless/ath/dfs_pattern_detector.c b/drivers/net/wireless/ath/dfs_pattern_detector.c
index 2303ef96299d..2f8136d50f78 100644
--- a/drivers/net/wireless/ath/dfs_pattern_detector.c
+++ b/drivers/net/wireless/ath/dfs_pattern_detector.c
@@ -352,7 +352,7 @@ dfs_pattern_detector_init(struct ath_common *common,
 {
 	struct dfs_pattern_detector *dpd;
 
-	if (!config_enabled(CONFIG_CFG80211_CERTIFICATION_ONUS))
+	if (!IS_ENABLED(CONFIG_CFG80211_CERTIFICATION_ONUS))
 		return NULL;
 
 	dpd = kmalloc(sizeof(*dpd), GFP_KERNEL);
diff --git a/drivers/net/wireless/ath/regd.c b/drivers/net/wireless/ath/regd.c
index 7e15ed9ed31f..f8506037736f 100644
--- a/drivers/net/wireless/ath/regd.c
+++ b/drivers/net/wireless/ath/regd.c
@@ -116,7 +116,7 @@ static const struct ieee80211_regdomain ath_world_regdom_67_68_6A_6C = {
 
 static bool dynamic_country_user_possible(struct ath_regulatory *reg)
 {
-	if (config_enabled(CONFIG_ATH_REG_DYNAMIC_USER_CERT_TESTING))
+	if (IS_ENABLED(CONFIG_ATH_REG_DYNAMIC_USER_CERT_TESTING))
 		return true;
 
 	switch (reg->country_code) {
@@ -188,7 +188,7 @@ static bool dynamic_country_user_possible(struct ath_regulatory *reg)
 
 static bool ath_reg_dyn_country_user_allow(struct ath_regulatory *reg)
 {
-	if (!config_enabled(CONFIG_ATH_REG_DYNAMIC_USER_REG_HINTS))
+	if (!IS_ENABLED(CONFIG_ATH_REG_DYNAMIC_USER_REG_HINTS))
 		return false;
 	if (!dynamic_country_user_possible(reg))
 		return false;
diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c
index 66e0d718472f..43ed08dd8b01 100644
--- a/drivers/pci/ecam.c
+++ b/drivers/pci/ecam.c
@@ -27,7 +27,7 @@
  * since we have enough virtual address range available.  On 32-bit, we
  * ioremap the config space for each bus individually.
  */
-static const bool per_bus_mapping = !config_enabled(CONFIG_64BIT);
+static const bool per_bus_mapping = !IS_ENABLED(CONFIG_64BIT);
 
 /*
  * Create a PCI config space window
diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c
index 1519d2ca7705..73137f4aac20 100644
--- a/drivers/tty/serial/ar933x_uart.c
+++ b/drivers/tty/serial/ar933x_uart.c
@@ -54,7 +54,7 @@ struct ar933x_uart_port {
 
 static inline bool ar933x_uart_console_enabled(void)
 {
-	return config_enabled(CONFIG_SERIAL_AR933X_CONSOLE);
+	return IS_ENABLED(CONFIG_SERIAL_AR933X_CONSOLE);
 }
 
 static inline unsigned int ar933x_uart_read(struct ar933x_uart_port *up,
@@ -636,7 +636,7 @@ static int ar933x_uart_probe(struct platform_device *pdev)
 	int ret;
 
 	np = pdev->dev.of_node;
-	if (config_enabled(CONFIG_OF) && np) {
+	if (IS_ENABLED(CONFIG_OF) && np) {
 		id = of_alias_get_id(np, "serial");
 		if (id < 0) {
 			dev_err(&pdev->dev, "unable to get alias id, err=%d\n",
diff --git a/include/linux/fence.h b/include/linux/fence.h
index 523ea3fbbddd..8cc719a63728 100644
--- a/include/linux/fence.h
+++ b/include/linux/fence.h
@@ -358,7 +358,7 @@ u64 fence_context_alloc(unsigned num);
 #define FENCE_TRACE(f, fmt, args...) \
 	do {								\
 		struct fence *__ff = (f);				\
-		if (config_enabled(CONFIG_FENCE_TRACE))			\
+		if (IS_ENABLED(CONFIG_FENCE_TRACE))			\
 			pr_info("f %llu#%u: " fmt,			\
 				__ff->context, __ff->seqno, ##args);	\
 	} while (0)
diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 760399a470bd..2bb5deb0012e 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -173,14 +173,14 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 	mutex_release(&ctx->dep_map, 0, _THIS_IP_);
 
 	DEBUG_LOCKS_WARN_ON(ctx->acquired);
-	if (!config_enabled(CONFIG_PROVE_LOCKING))
+	if (!IS_ENABLED(CONFIG_PROVE_LOCKING))
 		/*
 		 * lockdep will normally handle this,
 		 * but fail without anyway
 		 */
 		ctx->done_acquire = 1;
 
-	if (!config_enabled(CONFIG_DEBUG_LOCK_ALLOC))
+	if (!IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC))
 		/* ensure ww_acquire_fini will still fail if called twice */
 		ctx->acquired = ~0U;
 #endif
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d49bfa1e53e6..1d3b7665d0be 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 		return -EINVAL;
 
 	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
-		if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
-		    !config_enabled(CONFIG_SECCOMP))
+		if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+		    !IS_ENABLED(CONFIG_SECCOMP))
 			return -EINVAL;
 
 		if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 54d15eb2b701..ef6c6c3f9d8a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
 	struct seccomp_filter *sfilter;
 	int ret;
-	const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
+	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
@@ -542,7 +542,7 @@ void secure_computing_strict(int this_syscall)
 {
 	int mode = current->seccomp.mode;
 
-	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
 		return;
 
@@ -655,7 +655,7 @@ int __secure_computing(const struct seccomp_data *sd)
 	int mode = current->seccomp.mode;
 	int this_syscall;
 
-	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
 		return 0;
 
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index da49c0b1fd32..b0e11b6dc994 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -715,7 +715,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
 
 	ASSERT_RTNL();
 
-	if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
+	if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
 	    !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR))
 		return false;
 
-- 
cgit v1.3-8-gc7d7


From 1f69bf9c6137602cd028c96b4f8329121ec89231 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Wed, 3 Aug 2016 13:46:36 -0700
Subject: jump_label: remove bug.h, atomic.h dependencies for HAVE_JUMP_LABEL

The current jump_label.h includes bug.h for things such as WARN_ON().
This makes the header problematic for inclusion by kernel.h or any
headers that kernel.h includes, since bug.h includes kernel.h (circular
dependency).  The inclusion of atomic.h is similarly problematic.  Thus,
this should make jump_label.h 'includable' from most places.

Link: http://lkml.kernel.org/r/7060ce35ddd0d20b33bf170685e6b0fab816bdf2.1467837322.git.jbaron@akamai.com
Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Joe Perches <joe@perches.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/jump_label.h | 46 ++++++++++++++++++++--------------------
 kernel/jump_label.c        | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 68904469fba1..661af564fae8 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -76,7 +76,6 @@
 
 #include <linux/types.h>
 #include <linux/compiler.h>
-#include <linux/bug.h>
 
 extern bool static_key_initialized;
 
@@ -115,20 +114,8 @@ enum jump_label_type {
 
 struct module;
 
-#include <linux/atomic.h>
-
 #ifdef HAVE_JUMP_LABEL
 
-static inline int static_key_count(struct static_key *key)
-{
-	/*
-	 * -1 means the first static_key_slow_inc() is in progress.
-	 *  static_key_enabled() must return true, so return 1 here.
-	 */
-	int n = atomic_read(&key->enabled);
-	return n >= 0 ? n : 1;
-}
-
 #define JUMP_TYPE_FALSE	0UL
 #define JUMP_TYPE_TRUE	1UL
 #define JUMP_TYPE_MASK	1UL
@@ -157,16 +144,29 @@ extern int jump_label_text_reserved(void *start, void *end);
 extern void static_key_slow_inc(struct static_key *key);
 extern void static_key_slow_dec(struct static_key *key);
 extern void jump_label_apply_nops(struct module *mod);
+extern int static_key_count(struct static_key *key);
+extern void static_key_enable(struct static_key *key);
+extern void static_key_disable(struct static_key *key);
 
+/*
+ * We should be using ATOMIC_INIT() for initializing .enabled, but
+ * the inclusion of atomic.h is problematic for inclusion of jump_label.h
+ * in 'low-level' headers. Thus, we are initializing .enabled with a
+ * raw value, but have added a BUILD_BUG_ON() to catch any issues in
+ * jump_label_init() see: kernel/jump_label.c.
+ */
 #define STATIC_KEY_INIT_TRUE					\
-	{ .enabled = ATOMIC_INIT(1),				\
+	{ .enabled = { 1 },					\
 	  .entries = (void *)JUMP_TYPE_TRUE }
 #define STATIC_KEY_INIT_FALSE					\
-	{ .enabled = ATOMIC_INIT(0),				\
+	{ .enabled = { 0 },					\
 	  .entries = (void *)JUMP_TYPE_FALSE }
 
 #else  /* !HAVE_JUMP_LABEL */
 
+#include <linux/atomic.h>
+#include <linux/bug.h>
+
 static inline int static_key_count(struct static_key *key)
 {
 	return atomic_read(&key->enabled);
@@ -216,14 +216,6 @@ static inline int jump_label_apply_nops(struct module *mod)
 	return 0;
 }
 
-#define STATIC_KEY_INIT_TRUE	{ .enabled = ATOMIC_INIT(1) }
-#define STATIC_KEY_INIT_FALSE	{ .enabled = ATOMIC_INIT(0) }
-
-#endif	/* HAVE_JUMP_LABEL */
-
-#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
-#define jump_label_enabled static_key_enabled
-
 static inline void static_key_enable(struct static_key *key)
 {
 	int count = static_key_count(key);
@@ -244,6 +236,14 @@ static inline void static_key_disable(struct static_key *key)
 		static_key_slow_dec(key);
 }
 
+#define STATIC_KEY_INIT_TRUE	{ .enabled = ATOMIC_INIT(1) }
+#define STATIC_KEY_INIT_FALSE	{ .enabled = ATOMIC_INIT(0) }
+
+#endif	/* HAVE_JUMP_LABEL */
+
+#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
+#define jump_label_enabled static_key_enabled
+
 /* -------------------------------------------------------------------------- */
 
 /*
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0dbea887d625..f19aa02a8f48 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/static_key.h>
 #include <linux/jump_label_ratelimit.h>
+#include <linux/bug.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -56,6 +57,49 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
 
 static void jump_label_update(struct static_key *key);
 
+/*
+ * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
+ * The use of 'atomic_read()' requires atomic.h and its problematic for some
+ * kernel headers such as kernel.h and others. Since static_key_count() is not
+ * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok
+ * to have it be a function here. Similarly, for 'static_key_enable()' and
+ * 'static_key_disable()', which require bug.h. This should allow jump_label.h
+ * to be included from most/all places for HAVE_JUMP_LABEL.
+ */
+int static_key_count(struct static_key *key)
+{
+	/*
+	 * -1 means the first static_key_slow_inc() is in progress.
+	 *  static_key_enabled() must return true, so return 1 here.
+	 */
+	int n = atomic_read(&key->enabled);
+
+	return n >= 0 ? n : 1;
+}
+EXPORT_SYMBOL_GPL(static_key_count);
+
+void static_key_enable(struct static_key *key)
+{
+	int count = static_key_count(key);
+
+	WARN_ON_ONCE(count < 0 || count > 1);
+
+	if (!count)
+		static_key_slow_inc(key);
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable(struct static_key *key)
+{
+	int count = static_key_count(key);
+
+	WARN_ON_ONCE(count < 0 || count > 1);
+
+	if (count)
+		static_key_slow_dec(key);
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
 void static_key_slow_inc(struct static_key *key)
 {
 	int v, v1;
@@ -235,6 +279,15 @@ void __init jump_label_init(void)
 	struct static_key *key = NULL;
 	struct jump_entry *iter;
 
+	/*
+	 * Since we are initializing the static_key.enabled field with
+	 * with the 'raw' int values (to avoid pulling in atomic.h) in
+	 * jump_label.h, let's make sure that is safe. There are only two
+	 * cases to check since we initialize to 0 or 1.
+	 */
+	BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
+	BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
+
 	jump_label_lock();
 	jump_label_sort_entries(iter_start, iter_stop);
 
-- 
cgit v1.3-8-gc7d7


From a6ed3ea65d9868fdf9eff84e6fe4f666b8d14b02 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Fri, 5 Aug 2016 14:01:27 -0700
Subject: bpf: restore behavior of bpf_map_update_elem

The introduction of pre-allocated hash elements inadvertently broke
the behavior of bpf hash maps where users expected to call
bpf_map_update_elem() without considering that the map can be full.
Some programs do:
old_value = bpf_map_lookup_elem(map, key);
if (old_value) {
  ... prepare new_value on stack ...
  bpf_map_update_elem(map, key, new_value);
}
Before pre-alloc the update() for existing element would work even
in 'map full' condition. Restore this behavior.

The above program could have updated old_value in place instead of
update() which would be faster and most programs use that approach,
but sometimes the values are large and the programs use update()
helper to do atomic replacement of the element.
Note we cannot simply update element's value in-place like percpu
hash map does and have to allocate extra num_possible_cpu elements
and use this extra reserve when the map is full.

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/hashtab.c | 84 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 73 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
 	struct bucket *buckets;
 	void *elems;
 	struct pcpu_freelist freelist;
+	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
+enum extra_elem_state {
+	HTAB_NOT_AN_EXTRA_ELEM = 0,
+	HTAB_EXTRA_ELEM_FREE,
+	HTAB_EXTRA_ELEM_USED
+};
+
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -38,7 +45,10 @@ struct htab_elem {
 		struct bpf_htab *htab;
 		struct pcpu_freelist_node fnode;
 	};
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		enum extra_elem_state state;
+	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
@@ -113,6 +123,23 @@ free_elems:
 	return err;
 }
 
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	if (!pptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+			HTAB_EXTRA_ELEM_FREE;
+	}
+	htab->extra_elems = pptr;
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		cost += (u64) round_up(htab->map.value_size, 8) *
 			num_possible_cpus() * htab->map.max_entries;
+	else
+	       cost += (u64) htab->elem_size * num_possible_cpus();
 
 	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
+	if (!percpu) {
+		err = alloc_extra_elems(htab);
+		if (err)
+			goto free_buckets;
+	}
+
 	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
 		err = prealloc_elems_and_freelist(htab);
 		if (err)
-			goto free_buckets;
+			goto free_extra_elems;
 	}
 
 	return &htab->map;
 
+free_extra_elems:
+	free_percpu(htab->extra_elems);
 free_buckets:
 	kvfree(htab->buckets);
 free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
-
 }
 
 static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
+	if (l->state == HTAB_EXTRA_ELEM_USED) {
+		l->state = HTAB_EXTRA_ELEM_FREE;
+		return;
+	}
+
 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu, bool onallcpus)
+					 bool percpu, bool onallcpus,
+					 bool old_elem_exists)
 {
 	u32 size = htab->map.value_size;
 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
+	int err = 0;
 
 	if (prealloc) {
 		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
 		if (!l_new)
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
 	} else {
 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
 			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
+		} else {
+			l_new = kmalloc(htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN);
+			if (!l_new)
+				return ERR_PTR(-ENOMEM);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+	}
+
+	if (err) {
+		if (!old_elem_exists)
+			return ERR_PTR(err);
+
+		/* if we're updating the existing element and the hash table
+		 * is full, use per-cpu extra elems
+		 */
+		l_new = this_cpu_ptr(htab->extra_elems);
+		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+			return ERR_PTR(-E2BIG);
+		l_new->state = HTAB_EXTRA_ELEM_USED;
+	} else {
+		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+				!!l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus);
+					hash, true, onallcpus, false);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
 		htab_free_elems(htab);
 		pcpu_freelist_destroy(&htab->freelist);
 	}
+	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
-- 
cgit v1.3-8-gc7d7


From 1eff9d322a444245c67515edb52bc0eb68374aa8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 5 Aug 2016 15:35:16 -0600
Subject: block: rename bio bi_rw to bi_opf

Since commit 63a4cc24867d, bio->bi_rw contains flags in the lower
portion and the op code in the higher portions. This means that
old code that relies on manually setting bi_rw is most likely
going to be broken. Instead of letting that brokeness linger,
rename the member, to force old and out-of-tree code to break
at compile time instead of at runtime.

No intended functional changes in this commit.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/biodoc.txt            |  4 ++--
 Documentation/device-mapper/dm-flakey.txt |  2 +-
 block/bio-integrity.c                     |  2 +-
 block/bio.c                               |  6 +++---
 block/blk-core.c                          | 26 +++++++++++++-------------
 block/blk-merge.c                         |  8 ++++----
 block/blk-mq.c                            | 10 +++++-----
 block/blk-throttle.c                      |  8 ++++----
 block/cfq-iosched.c                       |  4 ++--
 drivers/block/drbd/drbd_main.c            |  8 ++++----
 drivers/block/drbd/drbd_receiver.c        |  2 +-
 drivers/block/drbd/drbd_req.c             |  6 +++---
 drivers/block/drbd/drbd_worker.c          |  2 +-
 drivers/block/pktcdvd.c                   |  2 +-
 drivers/block/umem.c                      |  2 +-
 drivers/md/bcache/request.c               | 12 ++++++------
 drivers/md/bcache/super.c                 |  2 +-
 drivers/md/bcache/writeback.h             |  2 +-
 drivers/md/dm-cache-target.c              |  8 ++++----
 drivers/md/dm-crypt.c                     |  4 ++--
 drivers/md/dm-era-target.c                |  2 +-
 drivers/md/dm-flakey.c                    |  6 +++---
 drivers/md/dm-io.c                        |  6 +++---
 drivers/md/dm-log-writes.c                |  4 ++--
 drivers/md/dm-mpath.c                     |  2 +-
 drivers/md/dm-raid1.c                     | 10 +++++-----
 drivers/md/dm-region-hash.c               |  4 ++--
 drivers/md/dm-snap.c                      |  6 +++---
 drivers/md/dm-stripe.c                    |  4 ++--
 drivers/md/dm-thin.c                      |  8 ++++----
 drivers/md/dm-zero.c                      |  2 +-
 drivers/md/dm.c                           | 10 +++++-----
 drivers/md/linear.c                       |  2 +-
 drivers/md/md.c                           |  4 ++--
 drivers/md/multipath.c                    |  8 ++++----
 drivers/md/raid0.c                        |  2 +-
 drivers/md/raid1.c                        |  6 +++---
 drivers/md/raid10.c                       |  8 ++++----
 drivers/md/raid5-cache.c                  |  2 +-
 drivers/md/raid5.c                        | 20 ++++++++++----------
 drivers/nvdimm/pmem.c                     |  4 ++--
 fs/btrfs/check-integrity.c                | 10 +++++-----
 fs/btrfs/disk-io.c                        |  2 +-
 fs/btrfs/inode.c                          |  6 +++---
 fs/btrfs/volumes.c                        |  6 +++---
 include/linux/bio.h                       |  4 ++--
 include/linux/blk-cgroup.h                |  4 ++--
 include/linux/blk_types.h                 | 15 ++++++++-------
 include/trace/events/bcache.h             |  8 ++++----
 include/trace/events/block.h              | 14 +++++++-------
 kernel/trace/blktrace.c                   |  6 +++---
 51 files changed, 158 insertions(+), 157 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 026d13362aca..bcdb2b4c1f12 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -269,7 +269,7 @@ Arjan's proposed request priority scheme allows higher levels some broad
   requests which haven't aged too much on the queue. Potentially this priority
   could even be exposed to applications in some manner, providing higher level
   tunability. Time based aging avoids starvation of lower priority
-  requests. Some bits in the bi_rw flags field in the bio structure are
+  requests. Some bits in the bi_opf flags field in the bio structure are
   intended to be used for this priority information.
 
 
@@ -432,7 +432,7 @@ struct bio {
        struct bio          *bi_next;    /* request queue link */
        struct block_device *bi_bdev;	/* target device */
        unsigned long       bi_flags;    /* status, command, etc */
-       unsigned long       bi_rw;       /* low bits: r/w, high: priority */
+       unsigned long       bi_opf;       /* low bits: r/w, high: priority */
 
        unsigned int	bi_vcnt;     /* how may bio_vec's */
        struct bvec_iter	bi_iter;	/* current index into bio_vec array */
diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt
index 6ff5c2327227..c43030718cef 100644
--- a/Documentation/device-mapper/dm-flakey.txt
+++ b/Documentation/device-mapper/dm-flakey.txt
@@ -42,7 +42,7 @@ Optional feature parameters:
     <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes.
 		 'w' is incompatible with drop_writes.
     <value>: The value (from 0-255) to write.
-    <flags>: Perform the replacement only if bio->bi_rw has all the
+    <flags>: Perform the replacement only if bio->bi_opf has all the
 	     selected flags set.
 
 Examples:
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index f70cc3bdfd01..63f72f00c72e 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -86,7 +86,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 
 	bip->bip_bio = bio;
 	bio->bi_integrity = bip;
-	bio->bi_rw |= REQ_INTEGRITY;
+	bio->bi_opf |= REQ_INTEGRITY;
 
 	return bip;
 err:
diff --git a/block/bio.c b/block/bio.c
index 3f76a38a5e2d..f39477538fef 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -580,7 +580,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	 */
 	bio->bi_bdev = bio_src->bi_bdev;
 	bio_set_flag(bio, BIO_CLONED);
-	bio->bi_rw = bio_src->bi_rw;
+	bio->bi_opf = bio_src->bi_opf;
 	bio->bi_iter = bio_src->bi_iter;
 	bio->bi_io_vec = bio_src->bi_io_vec;
 
@@ -663,7 +663,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	if (!bio)
 		return NULL;
 	bio->bi_bdev		= bio_src->bi_bdev;
-	bio->bi_rw		= bio_src->bi_rw;
+	bio->bi_opf		= bio_src->bi_opf;
 	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
 	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
 
@@ -873,7 +873,7 @@ int submit_bio_wait(struct bio *bio)
 	init_completion(&ret.event);
 	bio->bi_private = &ret;
 	bio->bi_end_io = submit_bio_wait_endio;
-	bio->bi_rw |= REQ_SYNC;
+	bio->bi_opf |= REQ_SYNC;
 	submit_bio(bio);
 	wait_for_completion_io(&ret.event);
 
diff --git a/block/blk-core.c b/block/blk-core.c
index a687e9cc16c2..999442ec4601 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1029,7 +1029,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 	 * Flush requests do not use the elevator so skip initialization.
 	 * This allows a request to share the flush and elevator data.
 	 */
-	if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA))
+	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA))
 		return false;
 
 	return true;
@@ -1504,7 +1504,7 @@ EXPORT_SYMBOL_GPL(blk_add_request_payload);
 bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 			    struct bio *bio)
 {
-	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
 
 	if (!ll_back_merge_fn(q, req, bio))
 		return false;
@@ -1526,7 +1526,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 			     struct bio *bio)
 {
-	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
 
 	if (!ll_front_merge_fn(q, req, bio))
 		return false;
@@ -1648,8 +1648,8 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->cmd_type = REQ_TYPE_FS;
 
-	req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
-	if (bio->bi_rw & REQ_RAHEAD)
+	req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK;
+	if (bio->bi_opf & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 
 	req->errors = 0;
@@ -1660,7 +1660,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 
 static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
-	const bool sync = !!(bio->bi_rw & REQ_SYNC);
+	const bool sync = !!(bio->bi_opf & REQ_SYNC);
 	struct blk_plug *plug;
 	int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT;
 	struct request *req;
@@ -1681,7 +1681,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) {
+	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) {
 		spin_lock_irq(q->queue_lock);
 		where = ELEVATOR_INSERT_FLUSH;
 		goto get_rq;
@@ -1728,7 +1728,7 @@ get_rq:
 	/*
 	 * Add in META/PRIO flags, if set, before we get to the IO scheduler
 	 */
-	rw_flags |= (bio->bi_rw & (REQ_META | REQ_PRIO));
+	rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO));
 
 	/*
 	 * Grab a free request. This is might sleep but can not fail.
@@ -1805,7 +1805,7 @@ static void handle_bad_sector(struct bio *bio)
 	printk(KERN_INFO "attempt to access beyond end of device\n");
 	printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
 			bdevname(bio->bi_bdev, b),
-			bio->bi_rw,
+			bio->bi_opf,
 			(unsigned long long)bio_end_sector(bio),
 			(long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
 }
@@ -1918,9 +1918,9 @@ generic_make_request_checks(struct bio *bio)
 	 * drivers without flush support don't have to worry
 	 * about them.
 	 */
-	if ((bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) &&
+	if ((bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
 	    !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
-		bio->bi_rw &= ~(REQ_PREFLUSH | REQ_FUA);
+		bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
 		if (!nr_sectors) {
 			err = 0;
 			goto end_io;
@@ -2219,7 +2219,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
 	 * one.
 	 */
 	for (bio = rq->bio; bio; bio = bio->bi_next) {
-		if ((bio->bi_rw & ff) != ff)
+		if ((bio->bi_opf & ff) != ff)
 			break;
 		bytes += bio->bi_iter.bi_size;
 	}
@@ -2630,7 +2630,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	/* mixed attributes always follow the first bio */
 	if (req->cmd_flags & REQ_MIXED_MERGE) {
 		req->cmd_flags &= ~REQ_FAILFAST_MASK;
-		req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
+		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
 	}
 
 	/*
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 41cbd4878958..3eec75a9e91d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -186,7 +186,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 
 	if (split) {
 		/* there isn't chance to merge the splitted bio */
-		split->bi_rw |= REQ_NOMERGE;
+		split->bi_opf |= REQ_NOMERGE;
 
 		bio_chain(split, *bio);
 		trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
@@ -616,9 +616,9 @@ void blk_rq_set_mixed_merge(struct request *rq)
 	 * Distributes the attributs to each bio.
 	 */
 	for (bio = rq->bio; bio; bio = bio->bi_next) {
-		WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
-			     (bio->bi_rw & REQ_FAILFAST_MASK) != ff);
-		bio->bi_rw |= ff;
+		WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) &&
+			     (bio->bi_opf & REQ_FAILFAST_MASK) != ff);
+		bio->bi_opf |= ff;
 	}
 	rq->cmd_flags |= REQ_MIXED_MERGE;
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6a63da101bc4..e931a0e8e73d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1234,7 +1234,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	ctx = blk_mq_get_ctx(q);
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
-	if (rw_is_sync(bio_op(bio), bio->bi_rw))
+	if (rw_is_sync(bio_op(bio), bio->bi_opf))
 		op_flags |= REQ_SYNC;
 
 	trace_block_getrq(q, bio, op);
@@ -1302,8 +1302,8 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
  */
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
-	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_rw);
-	const int is_flush_fua = bio->bi_rw & (REQ_PREFLUSH | REQ_FUA);
+	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
 	unsigned int request_count = 0;
@@ -1396,8 +1396,8 @@ done:
  */
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
-	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_rw);
-	const int is_flush_fua = bio->bi_rw & (REQ_PREFLUSH | REQ_FUA);
+	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_plug *plug;
 	unsigned int request_count = 0;
 	struct blk_map_ctx data;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c5494e403239..f1aba26f4719 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -821,8 +821,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	 * second time when it eventually gets issued.  Set it when a bio
 	 * is being charged to a tg.
 	 */
-	if (!(bio->bi_rw & REQ_THROTTLED))
-		bio->bi_rw |= REQ_THROTTLED;
+	if (!(bio->bi_opf & REQ_THROTTLED))
+		bio->bi_opf |= REQ_THROTTLED;
 }
 
 /**
@@ -1399,7 +1399,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* see throtl_charge_bio() */
-	if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
+	if ((bio->bi_opf & REQ_THROTTLED) || !tg->has_rules[rw])
 		goto out;
 
 	spin_lock_irq(q->queue_lock);
@@ -1478,7 +1478,7 @@ out:
 	 * being issued.
 	 */
 	if (!throttled)
-		bio->bi_rw &= ~REQ_THROTTLED;
+		bio->bi_opf &= ~REQ_THROTTLED;
 	return throttled;
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index acabba198de9..cc2f6dbd4303 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -918,7 +918,7 @@ static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
  */
 static inline bool cfq_bio_sync(struct bio *bio)
 {
-	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
+	return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC);
 }
 
 /*
@@ -2565,7 +2565,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
 static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
-	cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_rw);
+	cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf);
 }
 
 static void
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0501ae0c517b..100be556e613 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1663,13 +1663,13 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
 			     struct bio *bio)
 {
 	if (connection->agreed_pro_version >= 95)
-		return  (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
-			(bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
-			(bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
+		return  (bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0) |
+			(bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
+			(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
 			(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
 			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
 	else
-		return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
+		return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
 }
 
 /* Used to send write or TRIM aka REQ_DISCARD requests
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index df45713dfbe8..942384f34e22 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1564,7 +1564,7 @@ static void drbd_issue_peer_wsame(struct drbd_device *device,
  * drbd_submit_peer_request()
  * @device:	DRBD device.
  * @peer_req:	peer request
- * @rw:		flag field, see bio->bi_rw
+ * @rw:		flag field, see bio->bi_opf
  *
  * May spread the pages to multiple bios,
  * depending on bio_add_page restrictions.
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 66b8e4bb74d8..de279fe4e4fd 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -288,7 +288,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
 	 */
 	if (!ok &&
 	    bio_op(req->master_bio) == REQ_OP_READ &&
-	    !(req->master_bio->bi_rw & REQ_RAHEAD) &&
+	    !(req->master_bio->bi_opf & REQ_RAHEAD) &&
 	    !list_empty(&req->tl_requests))
 		req->rq_state |= RQ_POSTPONED;
 
@@ -1137,7 +1137,7 @@ static int drbd_process_write_request(struct drbd_request *req)
 	 * replicating, in which case there is no point. */
 	if (unlikely(req->i.size == 0)) {
 		/* The only size==0 bios we expect are empty flushes. */
-		D_ASSERT(device, req->master_bio->bi_rw & REQ_PREFLUSH);
+		D_ASSERT(device, req->master_bio->bi_opf & REQ_PREFLUSH);
 		if (remote)
 			_req_mod(req, QUEUE_AS_DRBD_BARRIER);
 		return remote;
@@ -1176,7 +1176,7 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 
 	if (bio_op(bio) != REQ_OP_READ)
 		type = DRBD_FAULT_DT_WR;
-	else if (bio->bi_rw & REQ_RAHEAD)
+	else if (bio->bi_opf & REQ_RAHEAD)
 		type = DRBD_FAULT_DT_RA;
 	else
 		type = DRBD_FAULT_DT_RD;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 35dbb3dca47e..c6755c9a0aea 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -256,7 +256,7 @@ void drbd_request_endio(struct bio *bio)
 				what = DISCARD_COMPLETED_WITH_ERROR;
 			break;
 		case REQ_OP_READ:
-			if (bio->bi_rw & REQ_RAHEAD)
+			if (bio->bi_opf & REQ_RAHEAD)
 				what = READ_AHEAD_COMPLETED_WITH_ERROR;
 			else
 				what = READ_COMPLETED_WITH_ERROR;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 9393bc730acf..90fa4ac149db 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1157,7 +1157,7 @@ static int pkt_start_recovery(struct packet_data *pkt)
 
 	bio_reset(pkt->bio);
 	pkt->bio->bi_bdev = pd->bdev;
-	pkt->bio->bi_rw = REQ_WRITE;
+	bio_set_op_attrs(pkt->bio, REQ_OP_WRITE, 0);
 	pkt->bio->bi_iter.bi_sector = new_sector;
 	pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE;
 	pkt->bio->bi_vcnt = pkt->frames;
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index d0a3e6d4515f..be90e15854ed 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -535,7 +535,7 @@ static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio)
 	*card->biotail = bio;
 	bio->bi_next = NULL;
 	card->biotail = &bio->bi_next;
-	if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card))
+	if (bio->bi_opf & REQ_SYNC || !mm_check_plugged(card))
 		activate(card);
 	spin_unlock_irq(&card->lock);
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 69f16f43f8ab..4b177fe11ebb 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -208,7 +208,7 @@ static void bch_data_insert_start(struct closure *cl)
 	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.
 	 */
-	bio->bi_rw &= ~(REQ_PREFLUSH|REQ_FUA);
+	bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
 
 	do {
 		unsigned i;
@@ -405,7 +405,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	if (!congested &&
 	    mode == CACHE_MODE_WRITEBACK &&
 	    op_is_write(bio_op(bio)) &&
-	    (bio->bi_rw & REQ_SYNC))
+	    (bio->bi_opf & REQ_SYNC))
 		goto rescale;
 
 	spin_lock(&dc->io_lock);
@@ -668,7 +668,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->iop.write_prio	= 0;
 	s->iop.error		= 0;
 	s->iop.flags		= 0;
-	s->iop.flush_journal	= (bio->bi_rw & (REQ_PREFLUSH|REQ_FUA)) != 0;
+	s->iop.flush_journal	= (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0;
 	s->iop.wq		= bcache_wq;
 
 	return s;
@@ -796,8 +796,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 		goto out_submit;
 	}
 
-	if (!(bio->bi_rw & REQ_RAHEAD) &&
-	    !(bio->bi_rw & REQ_META) &&
+	if (!(bio->bi_opf & REQ_RAHEAD) &&
+	    !(bio->bi_opf & REQ_META) &&
 	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
 		reada = min_t(sector_t, dc->readahead >> 9,
 			      bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
@@ -920,7 +920,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		bch_writeback_add(dc);
 		s->iop.bio = bio;
 
-		if (bio->bi_rw & REQ_PREFLUSH) {
+		if (bio->bi_opf & REQ_PREFLUSH) {
 			/* Also need to send a flush to the backing device */
 			struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
 							     dc->disk.bio_split);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 88ef6d14cce3..95a4ca6ce6ff 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -347,7 +347,7 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
 	for (i = 0; i < KEY_PTRS(k); i++) {
 		struct bio *bio = bch_bbio_alloc(c);
 
-		bio->bi_rw	= REQ_SYNC|REQ_META|op_flags;
+		bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
 		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
 
 		bio->bi_end_io	= uuid_endio;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 073a042aed24..301eaf565167 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -57,7 +57,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 	if (would_skip)
 		return false;
 
-	return bio->bi_rw & REQ_SYNC ||
+	return bio->bi_opf & REQ_SYNC ||
 		in_use <= CUTOFF_WRITEBACK;
 }
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 718744db62df..59b2c50562e4 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -788,7 +788,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 
 	spin_lock_irqsave(&cache->lock, flags);
 	if (cache->need_tick_bio &&
-	    !(bio->bi_rw & (REQ_FUA | REQ_PREFLUSH)) &&
+	    !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) &&
 	    bio_op(bio) != REQ_OP_DISCARD) {
 		pb->tick = true;
 		cache->need_tick_bio = false;
@@ -830,7 +830,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 
 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
 {
-	return bio->bi_rw & (REQ_PREFLUSH | REQ_FUA);
+	return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 }
 
 /*
@@ -1069,7 +1069,7 @@ static void dec_io_migrations(struct cache *cache)
 static bool discard_or_flush(struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
-	       bio->bi_rw & (REQ_PREFLUSH | REQ_FUA);
+	       bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 }
 
 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
@@ -1980,7 +1980,7 @@ static void process_deferred_bios(struct cache *cache)
 
 		bio = bio_list_pop(&bios);
 
-		if (bio->bi_rw & REQ_PREFLUSH)
+		if (bio->bi_opf & REQ_PREFLUSH)
 			process_flush_bio(cache, bio);
 		else if (bio_op(bio) == REQ_OP_DISCARD)
 			process_discard_bio(cache, &structs, bio);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8f2e3e2ffd26..4e9784b4e0ac 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1136,7 +1136,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_private = io;
 	clone->bi_end_io  = crypt_endio;
 	clone->bi_bdev    = cc->dev->bdev;
-	bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_rw);
+	bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf);
 }
 
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
@@ -1915,7 +1915,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 	 * - for REQ_PREFLUSH device-mapper core ensures that no IO is in-flight
 	 * - for REQ_OP_DISCARD caller must use flush if IO ordering matters
 	 */
-	if (unlikely(bio->bi_rw & REQ_PREFLUSH ||
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
 	    bio_op(bio) == REQ_OP_DISCARD)) {
 		bio->bi_bdev = cc->dev->bdev;
 		if (bio_sectors(bio))
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 2faf49d8f4d7..bf2b2676cb8a 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1542,7 +1542,7 @@ static int era_map(struct dm_target *ti, struct bio *bio)
 	/*
 	 * REQ_PREFLUSH bios carry no data, so we're not interested in them.
 	 */
-	if (!(bio->bi_rw & REQ_PREFLUSH) &&
+	if (!(bio->bi_opf & REQ_PREFLUSH) &&
 	    (bio_data_dir(bio) == WRITE) &&
 	    !metadata_current_marked(era->md, block)) {
 		defer_bio(era, bio);
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 19db13e99466..97e446d54a15 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -16,7 +16,7 @@
 #define DM_MSG_PREFIX "flakey"
 
 #define all_corrupt_bio_flags_match(bio, fc)	\
-	(((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
+	(((bio)->bi_opf & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
 
 /*
  * Flakey: Used for testing only, simulates intermittent,
@@ -266,9 +266,9 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
 		data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
 
 		DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
-			"(rw=%c bi_rw=%u bi_sector=%llu cur_bytes=%u)\n",
+			"(rw=%c bi_opf=%u bi_sector=%llu cur_bytes=%u)\n",
 			bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
-			(bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw,
+			(bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
 			(unsigned long long)bio->bi_iter.bi_sector, bio_bytes);
 	}
 }
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index daa03e41654a..0bf1a12e35fe 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -505,9 +505,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
  * New collapsed (a)synchronous interface.
  *
  * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
- * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw.
- * If you fail to do one of these, the IO will be submitted to the disk after
- * q->unplug_delay, which defaults to 3ms in blk-settings.c.
+ * the queue with blk_unplug() some time later or set REQ_SYNC in
+ * io_req->bi_opf. If you fail to do one of these, the IO will be submitted to
+ * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
  */
 int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 	  struct dm_io_region *where, unsigned long *sync_error_bits)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index b5dbf7a0515e..4ab68033f9d1 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -555,8 +555,8 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
 	struct bio_vec bv;
 	size_t alloc_size;
 	int i = 0;
-	bool flush_bio = (bio->bi_rw & REQ_PREFLUSH);
-	bool fua_bio = (bio->bi_rw & REQ_FUA);
+	bool flush_bio = (bio->bi_opf & REQ_PREFLUSH);
+	bool fua_bio = (bio->bi_opf & REQ_FUA);
 	bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);
 
 	pb->block = NULL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d7107d23b897..ac734e5bbe48 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -661,7 +661,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
 
 	bio->bi_error = 0;
 	bio->bi_bdev = pgpath->path.dev->bdev;
-	bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
+	bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
 
 	if (pgpath->pg->ps.type->start_io)
 		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index dac55b254a09..bdf1606f67bc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -657,7 +657,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = bio->bi_rw & WRITE_FLUSH_FUA,
+		.bi_op_flags = bio->bi_opf & WRITE_FLUSH_FUA,
 		.mem.type = DM_IO_BIO,
 		.mem.ptr.bio = bio,
 		.notify.fn = write_callback,
@@ -704,7 +704,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
-		if ((bio->bi_rw & REQ_PREFLUSH) ||
+		if ((bio->bi_opf & REQ_PREFLUSH) ||
 		    (bio_op(bio) == REQ_OP_DISCARD)) {
 			bio_list_add(&sync, bio);
 			continue;
@@ -1217,7 +1217,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
 	 * If region is not in-sync queue the bio.
 	 */
 	if (!r || (r == -EWOULDBLOCK)) {
-		if (bio->bi_rw & REQ_RAHEAD)
+		if (bio->bi_opf & REQ_RAHEAD)
 			return -EWOULDBLOCK;
 
 		queue_bio(ms, bio, rw);
@@ -1253,7 +1253,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		if (!(bio->bi_rw & REQ_PREFLUSH) &&
+		if (!(bio->bi_opf & REQ_PREFLUSH) &&
 		    bio_op(bio) != REQ_OP_DISCARD)
 			dm_rh_dec(ms->rh, bio_record->write_region);
 		return error;
@@ -1262,7 +1262,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 	if (error == -EOPNOTSUPP)
 		goto out;
 
-	if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD))
+	if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD))
 		goto out;
 
 	if (unlikely(error)) {
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index b11813431f31..85c32b22a420 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -398,7 +398,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 	region_t region = dm_rh_bio_to_region(rh, bio);
 	int recovering = 0;
 
-	if (bio->bi_rw & REQ_PREFLUSH) {
+	if (bio->bi_opf & REQ_PREFLUSH) {
 		rh->flush_failure = 1;
 		return;
 	}
@@ -526,7 +526,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 	struct bio *bio;
 
 	for (bio = bios->head; bio; bio = bio->bi_next) {
-		if (bio->bi_rw & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)
+		if (bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)
 			continue;
 		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
 	}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index ce2a910709f7..c65feeada864 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1680,7 +1680,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 
 	init_tracked_chunk(bio);
 
-	if (bio->bi_rw & REQ_PREFLUSH) {
+	if (bio->bi_opf & REQ_PREFLUSH) {
 		bio->bi_bdev = s->cow->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
@@ -1800,7 +1800,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 
 	init_tracked_chunk(bio);
 
-	if (bio->bi_rw & REQ_PREFLUSH) {
+	if (bio->bi_opf & REQ_PREFLUSH) {
 		if (!dm_bio_get_target_bio_nr(bio))
 			bio->bi_bdev = s->origin->bdev;
 		else
@@ -2286,7 +2286,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
 
 	bio->bi_bdev = o->dev->bdev;
 
-	if (unlikely(bio->bi_rw & REQ_PREFLUSH))
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH))
 		return DM_MAPIO_REMAPPED;
 
 	if (bio_data_dir(bio) != WRITE)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 83f1d4667195..28193a57bf47 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -286,7 +286,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 	uint32_t stripe;
 	unsigned target_bio_nr;
 
-	if (bio->bi_rw & REQ_PREFLUSH) {
+	if (bio->bi_opf & REQ_PREFLUSH) {
 		target_bio_nr = dm_bio_get_target_bio_nr(bio);
 		BUG_ON(target_bio_nr >= sc->stripes);
 		bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
@@ -383,7 +383,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
 	if (!error)
 		return 0; /* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD))
+	if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD))
 		return error;
 
 	if (error == -EOPNOTSUPP)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 197ea2003400..d1c05c12a9db 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 
 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 {
-	return (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) &&
+	return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
 		dm_thin_changed_this_transaction(tc->td);
 }
 
@@ -870,7 +870,7 @@ static void __inc_remap_and_issue_cell(void *context,
 	struct bio *bio;
 
 	while ((bio = bio_list_pop(&cell->bios))) {
-		if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) ||
+		if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
 		    bio_op(bio) == REQ_OP_DISCARD)
 			bio_list_add(&info->defer_bios, bio);
 		else {
@@ -1717,7 +1717,7 @@ static void __remap_and_issue_shared_cell(void *context,
 
 	while ((bio = bio_list_pop(&cell->bios))) {
 		if ((bio_data_dir(bio) == WRITE) ||
-		    (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) ||
+		    (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
 		     bio_op(bio) == REQ_OP_DISCARD))
 			bio_list_add(&info->defer_bios, bio);
 		else {
@@ -2635,7 +2635,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) ||
+	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
 	    bio_op(bio) == REQ_OP_DISCARD) {
 		thin_defer_bio_with_throttle(tc, bio);
 		return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 618b8752dcf1..b616f11d8473 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -37,7 +37,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio)
 {
 	switch (bio_op(bio)) {
 	case REQ_OP_READ:
-		if (bio->bi_rw & REQ_RAHEAD) {
+		if (bio->bi_opf & REQ_RAHEAD) {
 			/* readahead of null bytes only wastes buffer cache */
 			return -EIO;
 		}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfa09e14e847..fa9b1cb4438a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -798,12 +798,12 @@ static void dec_pending(struct dm_io *io, int error)
 		if (io_error == DM_ENDIO_REQUEUE)
 			return;
 
-		if ((bio->bi_rw & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
+		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 			/*
 			 * Preflush done for flush with data, reissue
 			 * without REQ_PREFLUSH.
 			 */
-			bio->bi_rw &= ~REQ_PREFLUSH;
+			bio->bi_opf &= ~REQ_PREFLUSH;
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
@@ -964,7 +964,7 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 {
 	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
-	BUG_ON(bio->bi_rw & REQ_PREFLUSH);
+	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
 	BUG_ON(bi_size > *tio->len_ptr);
 	BUG_ON(n_sectors > bi_size);
 	*tio->len_ptr -= bi_size - n_sectors;
@@ -1252,7 +1252,7 @@ static void __split_and_process_bio(struct mapped_device *md,
 
 	start_io_acct(ci.io);
 
-	if (bio->bi_rw & REQ_PREFLUSH) {
+	if (bio->bi_opf & REQ_PREFLUSH) {
 		ci.bio = &ci.md->flush_bio;
 		ci.sector_count = 0;
 		error = __send_empty_flush(&ci);
@@ -1290,7 +1290,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
 	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
 		dm_put_live_table(md, srcu_idx);
 
-		if (!(bio->bi_rw & REQ_RAHEAD))
+		if (!(bio->bi_opf & REQ_RAHEAD))
 			queue_io(md, bio);
 		else
 			bio_io_error(bio);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 70ff888d25d0..86f5d435901d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -221,7 +221,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 	struct bio *split;
 	sector_t start_sector, end_sector, data_offset;
 
-	if (unlikely(bio->bi_rw & REQ_PREFLUSH)) {
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2c3ab6f5e6be..d646f6e444f0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -285,7 +285,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	 */
 	sectors = bio_sectors(bio);
 	/* bio could be mergeable after passing to underlayer */
-	bio->bi_rw &= ~REQ_NOMERGE;
+	bio->bi_opf &= ~REQ_NOMERGE;
 	mddev->pers->make_request(mddev, bio);
 
 	cpu = part_stat_lock();
@@ -414,7 +414,7 @@ static void md_submit_flush_data(struct work_struct *ws)
 		/* an empty barrier - all done */
 		bio_endio(bio);
 	else {
-		bio->bi_rw &= ~REQ_PREFLUSH;
+		bio->bi_opf &= ~REQ_PREFLUSH;
 		mddev->pers->make_request(mddev, bio);
 	}
 
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 4974682842ae..673efbd6fc47 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -91,7 +91,7 @@ static void multipath_end_request(struct bio *bio)
 
 	if (!bio->bi_error)
 		multipath_end_bh_io(mp_bh, 0);
-	else if (!(bio->bi_rw & REQ_RAHEAD)) {
+	else if (!(bio->bi_opf & REQ_RAHEAD)) {
 		/*
 		 * oops, IO error:
 		 */
@@ -112,7 +112,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
-	if (unlikely(bio->bi_rw & REQ_PREFLUSH)) {
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
@@ -135,7 +135,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 
 	mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
 	mp_bh->bio.bi_bdev = multipath->rdev->bdev;
-	mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT;
+	mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
 	mp_bh->bio.bi_end_io = multipath_end_request;
 	mp_bh->bio.bi_private = mp_bh;
 	generic_make_request(&mp_bh->bio);
@@ -360,7 +360,7 @@ static void multipathd(struct md_thread *thread)
 			bio->bi_iter.bi_sector +=
 				conf->multipaths[mp_bh->path].rdev->data_offset;
 			bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
-			bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
+			bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
 			bio->bi_end_io = multipath_end_request;
 			bio->bi_private = mp_bh;
 			generic_make_request(bio);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c3d439083212..258986a2699d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -458,7 +458,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	struct md_rdev *tmp_dev;
 	struct bio *split;
 
-	if (unlikely(bio->bi_rw & REQ_PREFLUSH)) {
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 46168ef2e279..21dc00eb1989 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1043,8 +1043,8 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 	unsigned long flags;
 	const int op = bio_op(bio);
 	const int rw = bio_data_dir(bio);
-	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	const unsigned long do_flush_fua = (bio->bi_rw &
+	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+	const unsigned long do_flush_fua = (bio->bi_opf &
 						(REQ_PREFLUSH | REQ_FUA));
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -2318,7 +2318,7 @@ read_more:
 		raid_end_bio_io(r1_bio);
 	} else {
 		const unsigned long do_sync
-			= r1_bio->master_bio->bi_rw & REQ_SYNC;
+			= r1_bio->master_bio->bi_opf & REQ_SYNC;
 		if (bio) {
 			r1_bio->bios[r1_bio->read_disk] =
 				mddev->ro ? IO_BLOCKED : NULL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ed29fc899f06..0e4efcd10795 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1054,8 +1054,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	int i;
 	const int op = bio_op(bio);
 	const int rw = bio_data_dir(bio);
-	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
+	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+	const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -1440,7 +1440,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 
 	struct bio *split;
 
-	if (unlikely(bio->bi_rw & REQ_PREFLUSH)) {
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
@@ -2533,7 +2533,7 @@ read_more:
 		return;
 	}
 
-	do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
+	do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
 	slot = r10_bio->read_slot;
 	printk_ratelimited(
 		KERN_ERR
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 5504ce2bac06..51f76ddbe265 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -536,7 +536,7 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 		bio_endio(bio);
 		return 0;
 	}
-	bio->bi_rw &= ~REQ_PREFLUSH;
+	bio->bi_opf &= ~REQ_PREFLUSH;
 	return -EAGAIN;
 }
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d189e894b921..8912407a4dd0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -806,7 +806,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
 	dd_idx = 0;
 	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
 		dd_idx++;
-	if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw ||
+	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
 	    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
 		goto unlock_out;
 
@@ -1003,7 +1003,7 @@ again:
 
 			pr_debug("%s: for %llu schedule op %d on disc %d\n",
 				__func__, (unsigned long long)sh->sector,
-				bi->bi_rw, i);
+				bi->bi_opf, i);
 			atomic_inc(&sh->count);
 			if (sh != head_sh)
 				atomic_inc(&head_sh->count);
@@ -1014,7 +1014,7 @@ again:
 				bi->bi_iter.bi_sector = (sh->sector
 						 + rdev->data_offset);
 			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
-				bi->bi_rw |= REQ_NOMERGE;
+				bi->bi_opf |= REQ_NOMERGE;
 
 			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
 				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
@@ -1055,7 +1055,7 @@ again:
 			pr_debug("%s: for %llu schedule op %d on "
 				 "replacement disc %d\n",
 				__func__, (unsigned long long)sh->sector,
-				rbi->bi_rw, i);
+				rbi->bi_opf, i);
 			atomic_inc(&sh->count);
 			if (sh != head_sh)
 				atomic_inc(&head_sh->count);
@@ -1088,7 +1088,7 @@ again:
 			if (op_is_write(op))
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			pr_debug("skip op %d on disc %d for sector %llu\n",
-				bi->bi_rw, i, (unsigned long long)sh->sector);
+				bi->bi_opf, i, (unsigned long long)sh->sector);
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
@@ -1619,9 +1619,9 @@ again:
 
 			while (wbi && wbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
-				if (wbi->bi_rw & REQ_FUA)
+				if (wbi->bi_opf & REQ_FUA)
 					set_bit(R5_WantFUA, &dev->flags);
-				if (wbi->bi_rw & REQ_SYNC)
+				if (wbi->bi_opf & REQ_SYNC)
 					set_bit(R5_SyncIO, &dev->flags);
 				if (bio_op(wbi) == REQ_OP_DISCARD)
 					set_bit(R5_Discard, &dev->flags);
@@ -5154,7 +5154,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	DEFINE_WAIT(w);
 	bool do_prepare;
 
-	if (unlikely(bi->bi_rw & REQ_PREFLUSH)) {
+	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = r5l_handle_flush_request(conf->log, bi);
 
 		if (ret == 0)
@@ -5237,7 +5237,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			(unsigned long long)logical_sector);
 
 		sh = raid5_get_active_stripe(conf, new_sector, previous,
-				       (bi->bi_rw & REQ_RAHEAD), 0);
+				       (bi->bi_opf & REQ_RAHEAD), 0);
 		if (sh) {
 			if (unlikely(previous)) {
 				/* expansion might have moved on while waiting for a
@@ -5305,7 +5305,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if ((!sh->batch_head || sh == sh->batch_head) &&
-			    (bi->bi_rw & REQ_SYNC) &&
+			    (bi->bi_opf & REQ_SYNC) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			release_stripe_plug(mddev, sh);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 20bae50c231d..571a6c7ee2fc 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -128,7 +128,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	struct pmem_device *pmem = q->queuedata;
 	struct nd_region *nd_region = to_region(pmem);
 
-	if (bio->bi_rw & REQ_FLUSH)
+	if (bio->bi_opf & REQ_FLUSH)
 		nvdimm_flush(nd_region);
 
 	do_acct = nd_iostat_start(bio, &start);
@@ -144,7 +144,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	if (do_acct)
 		nd_iostat_end(bio, start);
 
-	if (bio->bi_rw & REQ_FUA)
+	if (bio->bi_opf & REQ_FUA)
 		nvdimm_flush(nd_region);
 
 	bio_endio(bio);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5d5cae05818d..66789471b49d 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2945,7 +2945,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
 			printk(KERN_INFO
 			       "submit_bio(rw=%d,0x%x, bi_vcnt=%u,"
 			       " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
-			       bio_op(bio), bio->bi_rw, bio->bi_vcnt,
+			       bio_op(bio), bio->bi_opf, bio->bi_vcnt,
 			       (unsigned long long)bio->bi_iter.bi_sector,
 			       dev_bytenr, bio->bi_bdev);
 
@@ -2976,18 +2976,18 @@ static void __btrfsic_submit_bio(struct bio *bio)
 		btrfsic_process_written_block(dev_state, dev_bytenr,
 					      mapped_datav, bio->bi_vcnt,
 					      bio, &bio_is_patched,
-					      NULL, bio->bi_rw);
+					      NULL, bio->bi_opf);
 		while (i > 0) {
 			i--;
 			kunmap(bio->bi_io_vec[i].bv_page);
 		}
 		kfree(mapped_datav);
-	} else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) {
+	} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
 			       "submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
-			       bio_op(bio), bio->bi_rw, bio->bi_bdev);
+			       bio_op(bio), bio->bi_opf, bio->bi_bdev);
 		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
 			if ((dev_state->state->print_mask &
 			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
@@ -3005,7 +3005,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
 			block->never_written = 0;
 			block->iodone_w_error = 0;
 			block->flush_gen = dev_state->last_flush_gen + 1;
-			block->submit_bio_bh_rw = bio->bi_rw;
+			block->submit_bio_bh_rw = bio->bi_opf;
 			block->orig_bio_bh_private = bio->bi_private;
 			block->orig_bio_bh_end_io.bio = bio->bi_end_io;
 			block->next_in_same_bio = NULL;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 87dad552e39a..59febfb8d04a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -870,7 +870,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	atomic_inc(&fs_info->nr_async_submits);
 
-	if (bio->bi_rw & REQ_SYNC)
+	if (bio->bi_opf & REQ_SYNC)
 		btrfs_set_work_high_priority(&async->work);
 
 	btrfs_queue_work(fs_info->workers, &async->work);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b0f421f332ae..2f5975954ccf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8209,7 +8209,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
 	if (err)
 		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
 			   "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
-			   btrfs_ino(dip->inode), bio_op(bio), bio->bi_rw,
+			   btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf,
 			   (unsigned long long)bio->bi_iter.bi_sector,
 			   bio->bi_iter.bi_size, err);
 
@@ -8373,7 +8373,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 	if (!bio)
 		return -ENOMEM;
 
-	bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw);
+	bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
 	bio->bi_private = dip;
 	bio->bi_end_io = btrfs_end_dio_bio;
 	btrfs_io_bio(bio)->logical = file_offset;
@@ -8411,7 +8411,7 @@ next_block:
 						  start_sector, GFP_NOFS);
 			if (!bio)
 				goto out_err;
-			bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw);
+			bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
 			bio->bi_private = dip;
 			bio->bi_end_io = btrfs_end_dio_bio;
 			btrfs_io_bio(bio)->logical = file_offset;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bb0addce7558..51f125508771 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6012,7 +6012,7 @@ static void btrfs_end_bio(struct bio *bio)
 				else
 					btrfs_dev_stat_inc(dev,
 						BTRFS_DEV_STAT_READ_ERRS);
-				if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+				if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH)
 					btrfs_dev_stat_inc(dev,
 						BTRFS_DEV_STAT_FLUSH_ERRS);
 				btrfs_dev_stat_print_on_error(dev);
@@ -6089,7 +6089,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 	bio->bi_next = NULL;
 
 	spin_lock(&device->io_lock);
-	if (bio->bi_rw & REQ_SYNC)
+	if (bio->bi_opf & REQ_SYNC)
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
@@ -6127,7 +6127,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 		rcu_read_lock();
 		name = rcu_dereference(dev->name);
 		pr_debug("btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu "
-			 "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_rw,
+			 "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_opf,
 			 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
 			 name->str, dev->devid, bio->bi_iter.bi_size);
 		rcu_read_unlock();
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e09a8895fc31..59ffaa68b11b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -95,7 +95,7 @@ static inline bool bio_is_rw(struct bio *bio)
 
 static inline bool bio_mergeable(struct bio *bio)
 {
-	if (bio->bi_rw & REQ_NOMERGE_FLAGS)
+	if (bio->bi_opf & REQ_NOMERGE_FLAGS)
 		return false;
 
 	return true;
@@ -318,7 +318,7 @@ struct bio_integrity_payload {
 
 static inline struct bio_integrity_payload *bio_integrity(struct bio *bio)
 {
-	if (bio->bi_rw & REQ_INTEGRITY)
+	if (bio->bi_opf & REQ_INTEGRITY)
 		return bio->bi_integrity;
 
 	return NULL;
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f77150a4a96a..10648e300c93 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -714,9 +714,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	if (!throtl) {
 		blkg = blkg ?: q->root_blkg;
-		blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_rw,
+		blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_opf,
 				bio->bi_iter.bi_size);
-		blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_rw, 1);
+		blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_opf, 1);
 	}
 
 	rcu_read_unlock();
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f254eb264924..436f43f87da9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -27,8 +27,9 @@ struct bio {
 	struct bio		*bi_next;	/* request queue link */
 	struct block_device	*bi_bdev;
 	int			bi_error;
-	unsigned int		bi_rw;		/* bottom bits req flags,
-						 * top bits REQ_OP
+	unsigned int		bi_opf;		/* bottom bits req flags,
+						 * top bits REQ_OP. Use
+						 * accessors.
 						 */
 	unsigned short		bi_flags;	/* status, command, etc */
 	unsigned short		bi_ioprio;
@@ -89,13 +90,13 @@ struct bio {
 };
 
 #define BIO_OP_SHIFT	(8 * sizeof(unsigned int) - REQ_OP_BITS)
-#define bio_op(bio)	((bio)->bi_rw >> BIO_OP_SHIFT)
+#define bio_op(bio)	((bio)->bi_opf >> BIO_OP_SHIFT)
 
 #define bio_set_op_attrs(bio, op, op_flags) do {		\
 	WARN_ON(op >= (1 << REQ_OP_BITS));			\
-	(bio)->bi_rw &= ((1 << BIO_OP_SHIFT) - 1);		\
-	(bio)->bi_rw |= ((unsigned int) (op) << BIO_OP_SHIFT);	\
-	(bio)->bi_rw |= op_flags;				\
+	(bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1);		\
+	(bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT);	\
+	(bio)->bi_opf |= op_flags;				\
 } while (0)
 
 #define BIO_RESET_BYTES		offsetof(struct bio, bi_max_vecs)
@@ -138,7 +139,7 @@ struct bio {
 
 /*
  * Request flags.  For use in the cmd_flags field of struct request, and in
- * bi_rw of struct bio.  Note that some flags are only valid in either one.
+ * bi_opf of struct bio.  Note that some flags are only valid in either one.
  */
 enum rq_flag_bits {
 	/* common flags */
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 65673d8b81ac..d336b890e31f 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(bcache_request,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->orig_sector	= bio->bi_iter.bi_sector - 16;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 	),
 
@@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(bcache_bio,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 	),
 
@@ -138,7 +138,7 @@ TRACE_EVENT(bcache_read,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 		__entry->cache_hit = hit;
 		__entry->bypass = bypass;
@@ -170,7 +170,7 @@ TRACE_EVENT(bcache_write,
 		__entry->inode		= inode;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 		__entry->writeback = writeback;
 		__entry->bypass = bypass;
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 5a2a7592068f..8f3a163b8166 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -274,7 +274,7 @@ TRACE_EVENT(block_bio_bounce,
 					  bio->bi_bdev->bd_dev : 0;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -313,7 +313,7 @@ TRACE_EVENT(block_bio_complete,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->error		= error;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 	),
 
@@ -341,7 +341,7 @@ DECLARE_EVENT_CLASS(block_bio_merge,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -409,7 +409,7 @@ TRACE_EVENT(block_bio_queue,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -439,7 +439,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
 		__entry->sector		= bio ? bio->bi_iter.bi_sector : 0;
 		__entry->nr_sector	= bio ? bio_sectors(bio) : 0;
 		blk_fill_rwbs(__entry->rwbs, bio ? bio_op(bio) : 0,
-			      bio ? bio->bi_rw : 0, __entry->nr_sector);
+			      bio ? bio->bi_opf : 0, __entry->nr_sector);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
         ),
 
@@ -573,7 +573,7 @@ TRACE_EVENT(block_split,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->new_sector	= new_sector;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -617,7 +617,7 @@ TRACE_EVENT(block_bio_remap,
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_rw,
+		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
 			      bio->bi_iter.bi_size);
 	),
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb345cd11883..7598e6ca817a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -776,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 		return;
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_rw, what, error, 0, NULL);
+			bio_op(bio), bio->bi_opf, what, error, 0, NULL);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
@@ -881,7 +881,7 @@ static void blk_add_trace_split(void *ignore,
 		__be64 rpdu = cpu_to_be64(pdu);
 
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
-				bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
+				bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
 				BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
 				&rpdu);
 	}
@@ -915,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+			bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error,
 			sizeof(r), &r);
 }
 
-- 
cgit v1.3-8-gc7d7


From 574673c231a5fad1560249cc3a598907acb36cf9 Mon Sep 17 00:00:00 2001
From: Andreas Ziegler <andreas.ziegler@fau.de>
Date: Thu, 4 Aug 2016 09:52:09 +0200
Subject: printk: Remove unnecessary #ifdef CONFIG_PRINTK

In commit 874f9c7da9a4 ("printk: create pr_<level> functions"), new
pr_level defines were added to printk.c.

These new defines are guarded by an #ifdef CONFIG_PRINTK - however,
there is already a surrounding #ifdef CONFIG_PRINTK starting a lot
earlier in line 249 which means the newly introduced #ifdef is
unnecessary.

Let's remove it to avoid confusion.

Signed-off-by: Andreas Ziegler <andreas.ziegler@fau.de>
Cc: Joe Perches <joe@perches.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a5ef95ca18c9..a37fc8cf8e84 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1930,7 +1930,6 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-#ifdef CONFIG_PRINTK
 #define define_pr_level(func, loglevel)				\
 asmlinkage __visible void func(const char *fmt, ...)		\
 {								\
@@ -1949,7 +1948,6 @@ define_pr_level(__pr_err, LOGLEVEL_ERR);
 define_pr_level(__pr_warn, LOGLEVEL_WARNING);
 define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
 define_pr_level(__pr_info, LOGLEVEL_INFO);
-#endif
 
 int vprintk_default(int level, const char *fmt, va_list args)
 {
-- 
cgit v1.3-8-gc7d7


From f3b0946d629c8bfbd3e5f038e30cb9c711a35f10 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Wed, 13 Jul 2016 17:18:33 +0100
Subject: genirq/msi: Make sure PCI MSIs are activated early

Bharat Kumar Gogada reported issues with the generic MSI code, where the
end-point ended up with garbage in its MSI configuration (both for the vector
and the message).

It turns out that the two MSI paths in the kernel are doing slightly different
things:

generic MSI: disable MSI -> allocate MSI -> enable MSI -> setup EP
PCI MSI: disable MSI -> allocate MSI -> setup EP -> enable MSI

And it turns out that end-points are allowed to latch the content of the MSI
configuration registers as soon as MSIs are enabled.  In Bharat's case, the
end-point ends up using whatever was there already, which is not what you
want.

In order to make things converge, we introduce a new MSI domain flag
(MSI_FLAG_ACTIVATE_EARLY) that is unconditionally set for PCI/MSI. When set,
this flag forces the programming of the end-point as soon as the MSIs are
allocated.

A consequence of this is that we have an extra activate in irq_startup, but
that should be without much consequence.

tglx:

 - Several people reported a VMWare regression with PCI/MSI-X passthrough. It
   turns out that the patch also cures that issue.

 - We need to have a look at the MSI disable interrupt path, where we write
   the msg to all zeros without disabling MSI in the PCI device. Is that
   correct?

Fixes: 52f518a3a7c2 "x86/MSI: Use hierarchical irqdomains to manage MSI interrupts"
Reported-and-tested-by: Bharat Kumar Gogada <bharat.kumar.gogada@xilinx.com>
Reported-and-tested-by: Foster Snowhill <forst@forstwoof.ru>
Reported-by: Matthias Prager <linux@matthiasprager.de>
Reported-by: Jason Taylor <jason.taylor@simplivity.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1468426713-31431-1-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c   |  2 ++
 include/linux/msi.h |  2 ++
 kernel/irq/msi.c    | 11 +++++++++++
 3 files changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index a02981efdad5..eafa6138a6b8 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1411,6 +1411,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 	if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
 		pci_msi_domain_update_chip_ops(info);
 
+	info->flags |= MSI_FLAG_ACTIVATE_EARLY;
+
 	domain = msi_create_irq_domain(fwnode, info, parent);
 	if (!domain)
 		return NULL;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 4f0bfe5912b2..e8c81fbd5f9c 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -270,6 +270,8 @@ enum {
 	MSI_FLAG_MULTI_PCI_MSI		= (1 << 2),
 	/* Support PCI MSIX interrupts */
 	MSI_FLAG_PCI_MSIX		= (1 << 3),
+	/* Needs early activate, required for PCI */
+	MSI_FLAG_ACTIVATE_EARLY		= (1 << 4),
 };
 
 int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 54999350162c..19e9dfbe97fa 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		else
 			dev_dbg(dev, "irq [%d-%d] for MSI\n",
 				virq, virq + desc->nvec_used - 1);
+		/*
+		 * This flag is set by the PCI layer as we need to activate
+		 * the MSI entries before the PCI layer enables MSI in the
+		 * card. Otherwise the card latches a random msi message.
+		 */
+		if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+			struct irq_data *irq_data;
+
+			irq_data = irq_domain_get_irq_data(domain, desc->irq);
+			irq_domain_activate_irq(irq_data);
+		}
 	}
 
 	return 0;
-- 
cgit v1.3-8-gc7d7


From 46c8f0b077a838eb1f6169bb370aab8ed98f7630 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@mellanox.com>
Date: Mon, 8 Aug 2016 16:29:07 -0400
Subject: timers: Fix get_next_timer_interrupt() computation

The tick_nohz_stop_sched_tick() routine is not properly
canceling the sched timer when nothing is pending, because
get_next_timer_interrupt() is no longer returning KTIME_MAX in
that case.  This causes periodic interrupts when none are needed.

When determining the next interrupt time, we first use
__next_timer_interrupt() to get the first expiring timer in the
timer wheel.  If no timer is found, we return the base clock value
plus NEXT_TIMER_MAX_DELTA to indicate there is no timer in the
timer wheel.

Back in get_next_timer_interrupt(), we set the "expires" value
by converting the timer wheel expiry (in ticks) to a nsec value.
But we don't want to do this if the timer wheel expiry value
indicates no timer; we want to return KTIME_MAX.

Prior to commit 500462a9de65 ("timers: Switch to a non-cascading
wheel") we checked base->active_timers to see if any timers
were active, and if not, we didn't touch the expiry value and so
properly returned KTIME_MAX.  Now we don't have active_timers.

To fix this, we now just check the timer wheel expiry value to
see if it is "now + NEXT_TIMER_MAX_DELTA", and if it is, we don't
try to compute a new value based on it, but instead simply let the
KTIME_MAX value in expires remain.

Fixes: 500462a9de65 "timers: Switch to a non-cascading wheel"
Signed-off-by: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/1470688147-22287-1-git-send-email-cmetcalf@mellanox.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 555670a5143c..32bf6f75a8fe 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
+	bool is_max_delta;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
+	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
 	 * We have a fresh next event. Check whether we can forward the base:
@@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		expires = basem;
 		base->is_idle = false;
 	} else {
-		expires = basem + (nextevt - basej) * TICK_NSEC;
+		if (!is_max_delta)
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 		/*
 		 * If we expect to sleep more than a tick, mark the base idle:
 		 */
-- 
cgit v1.3-8-gc7d7


From a0cba2179ea4c1820fce2ee046b6ed90ecc56196 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 9 Aug 2016 10:48:18 -0700
Subject: Revert "printk: create pr_<level> functions"

This reverts commit 874f9c7da9a4acbc1b9e12ca722579fb50e4d142.

Geert Uytterhoeven reports:
 "This change seems to have an (unintendent?) side-effect.

  Before, pr_*() calls without a trailing newline characters would be
  printed with a newline character appended, both on the console and in
  the output of the dmesg command.

  After this commit, no new line character is appended, and the output
  of the next pr_*() call of the same type may be appended, like in:

    - Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000
    - Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)
    + Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)"

Joe Perches says:
 "No, that is not intentional.

  The newline handling code inside vprintk_emit is a bit involved and
  for now I suggest a revert until this has all the same behavior as
  earlier"

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Requested-by: Joe Perches <joe@perches.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h   | 48 +++++++++++++++---------------------------------
 kernel/printk/internal.h | 16 ++++++----------
 kernel/printk/nmi.c      | 13 ++-----------
 kernel/printk/printk.c   | 25 +++----------------------
 4 files changed, 26 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 8dc155dab3ed..696a56be7d3e 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -266,39 +266,21 @@ extern asmlinkage void dump_stack(void) __cold;
  * and other debug macros are compiled out unless either DEBUG is defined
  * or CONFIG_DYNAMIC_DEBUG is set.
  */
-
-#ifdef CONFIG_PRINTK
-
-asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...);
-
-#define pr_emerg(fmt, ...)	__pr_emerg(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_alert(fmt, ...)	__pr_alert(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_crit(fmt, ...)	__pr_crit(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_err(fmt, ...)	__pr_err(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_warn(fmt, ...)	__pr_warn(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_notice(fmt, ...)	__pr_notice(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_info(fmt, ...)	__pr_info(pr_fmt(fmt), ##__VA_ARGS__)
-
-#else
-
-#define pr_emerg(fmt, ...)	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_alert(fmt, ...)	printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_crit(fmt, ...)	printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_err(fmt, ...)	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_warn(fmt, ...)	printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_notice(fmt, ...)	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_info(fmt, ...)	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
-
-#endif
-
-#define pr_warning pr_warn
-
+#define pr_emerg(fmt, ...) \
+	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert(fmt, ...) \
+	printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit(fmt, ...) \
+	printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err(fmt, ...) \
+	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warning(fmt, ...) \
+	printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn pr_warning
+#define pr_notice(fmt, ...) \
+	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 /*
  * Like KERN_CONT, pr_cont() should only be used when continuing
  * a line with no newline ('\n') enclosed. Otherwise it defaults
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 5d4505f30083..7fd2838fa417 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,11 +16,9 @@
  */
 #include <linux/percpu.h>
 
-typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt,
-					    va_list args);
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
 
-__printf(2, 0)
-int vprintk_default(int level, const char *fmt, va_list args);
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
@@ -33,10 +31,9 @@ extern raw_spinlock_t logbuf_lock;
  * via per-CPU variable.
  */
 DECLARE_PER_CPU(printk_func_t, printk_func);
-__printf(2, 0)
-static inline int vprintk_func(int level, const char *fmt, va_list args)
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
-	return this_cpu_read(printk_func)(level, fmt, args);
+	return this_cpu_read(printk_func)(fmt, args);
 }
 
 extern atomic_t nmi_message_lost;
@@ -47,10 +44,9 @@ static inline int get_nmi_message_lost(void)
 
 #else /* CONFIG_PRINTK_NMI */
 
-__printf(2, 0)
-static inline int vprintk_func(int level, const char *fmt, va_list args)
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
-	return vprintk_default(level, fmt, args);
+	return vprintk_default(fmt, args);
 }
 
 static inline int get_nmi_message_lost(void)
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index bc3eeb1ae6da..b69eb8a2876f 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
  * one writer running. But the buffer might get flushed from another
  * CPU, so we need to be careful.
  */
-static int vprintk_nmi(int level, const char *fmt, va_list args)
+static int vprintk_nmi(const char *fmt, va_list args)
 {
 	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
 	int add = 0;
@@ -79,16 +79,7 @@ again:
 	if (!len)
 		smp_rmb();
 
-	if (level != LOGLEVEL_DEFAULT) {
-		add = snprintf(s->buffer + len, sizeof(s->buffer) - len,
-				KERN_SOH "%c", '0' + level);
-		add += vsnprintf(s->buffer + len + add,
-				 sizeof(s->buffer) - len - add,
-				 fmt, args);
-	} else {
-		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
-				fmt, args);
-	}
+	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
 
 	/*
 	 * Do it once again if the buffer has been flushed in the meantime.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a37fc8cf8e84..eea6dbc2d8cf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1930,26 +1930,7 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-#define define_pr_level(func, loglevel)				\
-asmlinkage __visible void func(const char *fmt, ...)		\
-{								\
-	va_list args;						\
-								\
-	va_start(args, fmt);					\
-	vprintk_default(loglevel, fmt, args);			\
-	va_end(args);						\
-}								\
-EXPORT_SYMBOL(func)
-
-define_pr_level(__pr_emerg, LOGLEVEL_EMERG);
-define_pr_level(__pr_alert, LOGLEVEL_ALERT);
-define_pr_level(__pr_crit, LOGLEVEL_CRIT);
-define_pr_level(__pr_err, LOGLEVEL_ERR);
-define_pr_level(__pr_warn, LOGLEVEL_WARNING);
-define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
-define_pr_level(__pr_info, LOGLEVEL_INFO);
-
-int vprintk_default(int level, const char *fmt, va_list args)
+int vprintk_default(const char *fmt, va_list args)
 {
 	int r;
 
@@ -1959,7 +1940,7 @@ int vprintk_default(int level, const char *fmt, va_list args)
 		return r;
 	}
 #endif
-	r = vprintk_emit(0, level, NULL, 0, fmt, args);
+	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
 
 	return r;
 }
@@ -1992,7 +1973,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args);
+	r = vprintk_func(fmt, args);
 	va_end(args);
 
 	return r;
-- 
cgit v1.3-8-gc7d7


From 06f4e94898918bcad00cdd4d349313a439d6911e Mon Sep 17 00:00:00 2001
From: Zefan Li <lizefan@huawei.com>
Date: Tue, 9 Aug 2016 11:25:01 +0800
Subject: cpuset: make sure new tasks conform to the current config of the
 cpuset

A new task inherits cpus_allowed and mems_allowed masks from its parent,
but if someone changes cpuset's config by writing to cpuset.cpus/cpuset.mems
before this new task is inserted into the cgroup's task list, the new task
won't be updated accordingly.

Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/cpuset.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c7fd2778ed50..c27e53326bef 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2069,6 +2069,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	mutex_unlock(&cpuset_mutex);
 }
 
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+void cpuset_fork(struct task_struct *task)
+{
+	if (task_css_is_root(task, cpuset_cgrp_id))
+		return;
+
+	set_cpus_allowed_ptr(task, &current->cpus_allowed);
+	task->mems_allowed = current->mems_allowed;
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
 	.css_alloc	= cpuset_css_alloc,
 	.css_online	= cpuset_css_online,
@@ -2079,6 +2093,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.attach		= cpuset_attach,
 	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
+	.fork		= cpuset_fork,
 	.legacy_cftypes	= files,
 	.early_init	= true,
 };
-- 
cgit v1.3-8-gc7d7


From 0b8f1e2e26bfc6b9abe3f0f3faba2cb0eecb9fb9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 4 Aug 2016 14:37:24 +0200
Subject: perf/core: Fix sideband list-iteration vs. event ordering NULL
 pointer deference crash

Vegard Nossum reported that perf fuzzing generates a NULL
pointer dereference crash:

> Digging a bit deeper into this, it seems the event itself is getting
> created by perf_event_open() and it gets added to the pmu_event_list
> through:
>
> perf_event_open()
>  - perf_event_alloc()
>     - account_event()
>        - account_pmu_sb_event()
>           - attach_sb_event()
>
> so at this point the event is being attached but its ->ctx is still
> NULL. It seems like ->ctx is set just a bit later in
> perf_event_open(), though.
>
> But before that, __schedule() comes along and creates a stack trace
> similar to the one above:
>
> __schedule()
>  - __perf_event_task_sched_out()
>    - perf_iterate_sb()
>      - perf_iterate_sb_cpu()
>         - event_filter_match()
>           - perf_cgroup_match()
>             - __get_cpu_context()
>               - (dereference ctx which is NULL)
>
> So I guess the question is... should the event be attached (= put on
> the list) before ->ctx gets set? Or should the cgroup code check for a
> NULL ->ctx?

The latter seems like the simplest solution. Moving the list-add later
creates a bit of a mess.

Reported-by: Vegard Nossum <vegard.nossum@gmail.com>
Tested-by: Vegard Nossum <vegard.nossum@gmail.com>
Tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: f2fb6bef9251 ("perf/core: Optimize side-band event delivery")
Link: http://lkml.kernel.org/r/20160804123724.GN6862@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index a19550d80ab1..87d02b8cb87e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1716,8 +1716,8 @@ static inline int pmu_filter_match(struct perf_event *event)
 static inline int
 event_filter_match(struct perf_event *event)
 {
-	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event) && pmu_filter_match(event);
+	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+	       perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
@@ -1737,8 +1737,8 @@ event_sched_out(struct perf_event *event,
 	 * maintained, otherwise bogus information is return
 	 * via read() for time_enabled, time_running:
 	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE
-	    && !event_filter_match(event)) {
+	if (event->state == PERF_EVENT_STATE_INACTIVE &&
+	    !event_filter_match(event)) {
 		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
@@ -2236,10 +2236,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 
 	lockdep_assert_held(&ctx->mutex);
 
-	event->ctx = ctx;
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	/*
+	 * Ensures that if we can observe event->ctx, both the event and ctx
+	 * will be 'complete'. See perf_iterate_sb_cpu().
+	 */
+	smp_store_release(&event->ctx, ctx);
+
 	if (!task) {
 		cpu_function_call(cpu, __perf_install_in_context, event);
 		return;
@@ -5969,6 +5974,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 	struct perf_event *event;
 
 	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		/*
+		 * Skip events that are not fully formed yet; ensure that
+		 * if we observe event->ctx, both event and ctx will be
+		 * complete enough. See perf_install_in_context().
+		 */
+		if (!smp_load_acquire(&event->ctx))
+			continue;
+
 		if (event->state < PERF_EVENT_STATE_INACTIVE)
 			continue;
 		if (!event_filter_match(event))
-- 
cgit v1.3-8-gc7d7


From db4a835601b73cf8d6cd8986381d966b8e13d2d9 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Tue, 2 Aug 2016 00:48:12 -0700
Subject: perf/core: Set cgroup in CPU contexts for new cgroup events

There's a perf stat bug easy to observer on a machine with only one cgroup:

  $ perf stat -e cycles -I 1000 -C 0 -G /
  #          time             counts unit events
      1.000161699      <not counted>      cycles                    /
      2.000355591      <not counted>      cycles                    /
      3.000565154      <not counted>      cycles                    /
      4.000951350      <not counted>      cycles                    /

We'd expect some output there.

The underlying problem is that there is an optimization in
perf_cgroup_sched_{in,out}() that skips the switch of cgroup events
if the old and new cgroups in a task switch are the same.

This optimization interacts with the current code in two ways
that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a
cgroup event matches the current task. These are:

  1. On creation of the first cgroup event in a CPU: In current code,
  cpuctx->cpu is only set in perf_cgroup_sched_in, but due to the
  aforesaid optimization, perf_cgroup_sched_in will run until the next
  cgroup switches in that CPU. This may happen late or never happen,
  depending on system's number of cgroups, CPU load, etc.

  2. On deletion of the last cgroup event in a cpuctx: In list_del_event,
  cpuctx->cgrp is set NULL. Any new cgroup event will not be sched in
  because cpuctx->cgrp == NULL until a cgroup switch occurs and
  perf_cgroup_sched_in is executed (updating cpuctx->cgrp).

This patch fixes both problems by setting cpuctx->cgrp in list_add_event,
mirroring what list_del_event does when removing a cgroup event from CPU
context, as introduced in:

  commit 68cacd29167b ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()")

With this patch, cpuctx->cgrp is always set/clear when installing/removing
the first/last cgroup event in/from the CPU context. With cpuctx->cgrp
correctly set, event_filter_match works as intended when events are
sched in/out.

After the fix, the output is as expected:

  $ perf stat -e cycles -I 1000 -a -G /
  #         time             counts unit events
     1.004699159          627342882      cycles                    /
     2.007397156          615272690      cycles                    /
     3.010019057          616726074      cycles                    /

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  4 ++++
 kernel/events/core.c       | 54 ++++++++++++++++++++++++++++++----------------
 2 files changed, 40 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8ed4326164cc..2b6b43cc0dd5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -743,7 +743,9 @@ struct perf_event_context {
 	u64				parent_gen;
 	u64				generation;
 	int				pin_count;
+#ifdef CONFIG_CGROUP_PERF
 	int				nr_cgroups;	 /* cgroup evts */
+#endif
 	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
 };
@@ -769,7 +771,9 @@ struct perf_cpu_context {
 	unsigned int			hrtimer_active;
 
 	struct pmu			*unique_pmu;
+#ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+#endif
 };
 
 struct perf_output_handle {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 87d02b8cb87e..1903b8f3a705 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 		}
 	}
 }
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (!is_cgroup_event(event))
+		return;
+
+	if (add && ctx->nr_cgroups++)
+		return;
+	else if (!add && --ctx->nr_cgroups)
+		return;
+	/*
+	 * Because cgroup events are always per-cpu events,
+	 * this will always be called from the right CPU.
+	 */
+	cpuctx = __get_cpu_context(ctx);
+	cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
 #else /* !CONFIG_CGROUP_PERF */
 
 static inline bool
@@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 			 struct perf_event_context *ctx)
 {
 }
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+}
+
 #endif
 
 /*
@@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+
 	lockdep_assert_held(&ctx->lock);
 
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
-	if (is_cgroup_event(event))
-		ctx->nr_cgroups++;
+	list_update_cgroup_event(event, ctx, true);
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
@@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_cpu_context *cpuctx;
-
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
@@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	if (is_cgroup_event(event)) {
-		ctx->nr_cgroups--;
-		/*
-		 * Because cgroup events are always per-cpu events, this will
-		 * always be called from the right CPU.
-		 */
-		cpuctx = __get_cpu_context(ctx);
-		/*
-		 * If there are no more cgroup events then clear cgrp to avoid
-		 * stale pointer in update_cgrp_time_from_cpuctx().
-		 */
-		if (!ctx->nr_cgroups)
-			cpuctx->cgrp = NULL;
-	}
+	list_update_cgroup_event(event, ctx, false);
 
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
-- 
cgit v1.3-8-gc7d7


From a23eadfae2fd45536a355b785d5a1533e1955c22 Mon Sep 17 00:00:00 2001
From: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Date: Tue, 19 Jul 2016 11:44:50 +0200
Subject: sched/deadline: Fix wrap-around in DL heap

Current code in cpudeadline.c has a bug in re-heapifying when adding a
new element at the end of the heap, because a deadline value of 0 is
temporarily set in the new elem, then cpudl_change_key() is called
with the actual elem deadline as param.

However, the function compares the new deadline to set with the one
previously in the elem, which is 0.  So, if current absolute deadlines
grew so much to have negative values as s64, the comparison in
cpudl_change_key() makes the wrong decision.  Instead, as from
dl_time_before(), the kernel should handle correctly abs deadlines
wrap-arounds.

This patch fixes the problem with a minimally invasive change that
forces cpudl_change_key() to heapify up in this case.

Signed-off-by: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Juri Lelli <juri.lelli@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1468921493-10054-2-git-send-email-tommaso.cucinotta@sssup.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpudeadline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be58820465c..d4184498c9f5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
 
 	if (old_idx == IDX_INVALID) {
 		cp->size++;
-		cp->elements[cp->size - 1].dl = 0;
+		cp->elements[cp->size - 1].dl = dl;
 		cp->elements[cp->size - 1].cpu = cpu;
 		cp->elements[cpu].idx = cp->size - 1;
 		cpudl_change_key(cp, cp->size - 1, dl);
-- 
cgit v1.3-8-gc7d7


From b8922125e4790fa237a8a4204562ecf457ef54bb Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Sat, 9 Jul 2016 15:54:22 +0800
Subject: sched/fair: Fix typo in sync_throttle()

We should update cfs_rq->throttled_clock_task, not
pcfs_rq->throttle_clock_task.

The effects of this bug was probably occasionally erratic
group scheduling, particularly in cgroups-intense workloads.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
[ Added changelog. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 55e16d30bd99 ("sched/fair: Rework throttle_count sync")
Link: http://lkml.kernel.org/r/1468050862-18864-1-git-send-email-xlpang@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4088eedea763..039de34f1521 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	pcfs_rq = tg->parent->cfs_rq[cpu];
 
 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
-	pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
-- 
cgit v1.3-8-gc7d7


From 6075620b0590eaf22f10ce88833eb20a57f760d6 Mon Sep 17 00:00:00 2001
From: Giovanni Gherdovich <ggherdovich@suse.cz>
Date: Fri, 5 Aug 2016 10:21:56 +0200
Subject: sched/cputime: Mitigate performance regression in
 times()/clock_gettime()

Commit:

  6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency")

fixed a problem whereby clock_nanosleep() followed by clock_gettime() could
allow a task to wake early. It addressed the problem by calling the scheduling
classes update_curr() when the cputimer starts.

Said change induced a considerable performance regression on the syscalls
times() and clock_gettimes(CLOCK_PROCESS_CPUTIME_ID). There are some
debuggers and applications that monitor their own performance that
accidentally depend on the performance of these specific calls.

This patch mitigates the performace loss by prefetching data in the CPU
cache, as stalls due to cache misses appear to be where most time is spent
in our benchmarks.

Here are the performance gain of this patch over v4.7-rc7 on a Sandy Bridge
box with 32 logical cores and 2 NUMA nodes. The test is repeated with a
variable number of threads, from 2 to 4*num_cpus; the results are in
seconds and correspond to the average of 10 runs; the percentage gain is
computed with (before-after)/before so a positive value is an improvement
(it's faster). The improvement varies between a few percents for 5-20
threads and more than 10% for 2 or >20 threads.

pound_clock_gettime:

    threads       4.7-rc7     patched 4.7-rc7
    [num]         [secs]      [secs (percent)]
      2           3.48        3.06 ( 11.83%)
      5           3.33        3.25 (  2.40%)
      8           3.37        3.26 (  3.30%)
     12           3.32        3.37 ( -1.60%)
     21           4.01        3.90 (  2.74%)
     30           3.63        3.36 (  7.41%)
     48           3.71        3.11 ( 16.27%)
     79           3.75        3.16 ( 15.74%)
    110           3.81        3.25 ( 14.80%)
    128           3.88        3.31 ( 14.76%)

pound_times:

    threads       4.7-rc7     patched 4.7-rc7
    [num]         [secs]      [secs (percent)]
      2           3.65        3.25 ( 11.03%)
      5           3.45        3.17 (  7.92%)
      8           3.52        3.22 (  8.69%)
     12           3.29        3.36 ( -2.04%)
     21           4.07        3.92 (  3.78%)
     30           3.87        3.40 ( 12.17%)
     48           3.79        3.16 ( 16.61%)
     79           3.88        3.28 ( 15.42%)
    110           3.90        3.38 ( 13.35%)
    128           4.00        3.38 ( 15.45%)

pound_clock_gettime and pound_clock_gettime are two benchmarks included in
the MMTests framework. They launch a given number of threads which
repeatedly call times() or clock_gettimes(). The results above can be
reproduced with cloning MMTests from github.com and running the "poundtime"
workload:

  $ git clone https://github.com/gormanm/mmtests.git
  $ cd mmtests
  $ cp configs/config-global-dhp__workload_poundtime config
  $ ./run-mmtests.sh --run-monitor $(uname -r)

The above will run "poundtime" measuring the kernel currently running on
the machine; Once a new kernel is installed and the machine rebooted,
running again

  $ cd mmtests
  $ ./run-mmtests.sh --run-monitor $(uname -r)

will produce results to compare with. A comparison table will be output
with:

  $ cd mmtests/work/log
  $ ../../compare-kernels.sh

the table will contain a lot of entries; grepping for "Amean" (as in
"arithmetic mean") will give the tables presented above. The source code
for the two benchmarks is reported at the end of this changelog for
clairity.

The cache misses addressed by this patch were found using a combination of
`perf top`, `perf record` and `perf annotate`. The incriminated lines were
found to be

    struct sched_entity *curr = cfs_rq->curr;

and

    delta_exec = now - curr->exec_start;

in the function update_curr() from kernel/sched/fair.c. This patch
prefetches the data from memory just before update_curr is called in the
interested execution path.

A comparison of the total number of cycles before and after the patch
follows; the data is obtained using `perf stat -r 10 -ddd <program>`
running over the same sequence of number of threads used above (a positive
gain is an improvement):

  threads   cycles before                 cycles after                gain

    2      19,699,563,964  +-1.19%      17,358,917,517  +-1.85%      11.88%
    5      47,401,089,566  +-2.96%      45,103,730,829  +-0.97%       4.85%
    8      80,923,501,004  +-3.01%      71,419,385,977  +-0.77%      11.74%
   12     112,326,485,473  +-0.47%     110,371,524,403  +-0.47%       1.74%
   21     193,455,574,299  +-0.72%     180,120,667,904  +-0.36%       6.89%
   30     315,073,519,013  +-1.64%     271,222,225,950  +-1.29%      13.92%
   48     321,969,515,332  +-1.48%     273,353,977,321  +-1.16%      15.10%
   79     337,866,003,422  +-0.97%     289,462,481,538  +-1.05%      14.33%
  110     338,712,691,920  +-0.78%     290,574,233,170  +-0.77%      14.21%
  128     348,384,794,006  +-0.50%     292,691,648,206  +-0.66%      15.99%

A comparison of cache miss vs total cache loads ratios, before and after
the patch (again from the `perf stat -r 10 -ddd <program>` tables):

  threads   L1 misses/total*100     L1 misses/total*100            gain
		         before                   after
      2           7.43  +-4.90%           7.36  +-4.70%           0.94%
      5          13.09  +-4.74%          13.52  +-3.73%          -3.28%
      8          13.79  +-5.61%          12.90  +-3.27%           6.45%
     12          11.57  +-2.44%           8.71  +-1.40%          24.72%
     21          12.39  +-3.92%           9.97  +-1.84%          19.53%
     30          13.91  +-2.53%          11.73  +-2.28%          15.67%
     48          13.71  +-1.59%          12.32  +-1.97%          10.14%
     79          14.44  +-0.66%          13.40  +-1.06%           7.20%
    110          15.86  +-0.50%          14.46  +-0.59%           8.83%
    128          16.51  +-0.32%          15.06  +-0.78%           8.78%

As a final note, the following shows the evolution of performance figures
in the "poundtime" benchmark and pinpoints commit 6e998916dfe3
("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") as a
major source of degradation, mostly unaddressed to this day (figures
expressed in seconds).

pound_clock_gettime:

  threads   parent of         6e998916dfe3        4.7-rc7
	    6e998916dfe3            itself
    2        2.23          3.68 ( -64.56%)        3.48 (-55.48%)
    5        2.83          3.78 ( -33.42%)        3.33 (-17.43%)
    8        2.84          4.31 ( -52.12%)        3.37 (-18.76%)
    12       3.09          3.61 ( -16.74%)        3.32 ( -7.17%)
    21       3.14          4.63 ( -47.36%)        4.01 (-27.71%)
    30       3.28          5.75 ( -75.37%)        3.63 (-10.80%)
    48       3.02          6.05 (-100.56%)        3.71 (-22.99%)
    79       2.88          6.30 (-118.90%)        3.75 (-30.26%)
    110      2.95          6.46 (-119.00%)        3.81 (-29.24%)
    128      3.05          6.42 (-110.08%)        3.88 (-27.04%)

pound_times:

  threads   parent of         6e998916dfe3        4.7-rc7
	    6e998916dfe3            itself
    2        2.27          3.73 ( -64.71%)        3.65 (-61.14%)
    5        2.78          3.77 ( -35.56%)        3.45 (-23.98%)
    8        2.79          4.41 ( -57.71%)        3.52 (-26.05%)
    12       3.02          3.56 ( -17.94%)        3.29 ( -9.08%)
    21       3.10          4.61 ( -48.74%)        4.07 (-31.34%)
    30       3.33          5.75 ( -72.53%)        3.87 (-16.01%)
    48       2.96          6.06 (-105.04%)        3.79 (-28.10%)
    79       2.88          6.24 (-116.83%)        3.88 (-34.81%)
    110      2.98          6.37 (-114.08%)        3.90 (-31.12%)
    128      3.10          6.35 (-104.61%)        4.00 (-28.87%)

The source code of the two benchmarks follows. To compile the two:

  NR_THREADS=42
  for FILE in pound_times pound_clock_gettime; do
      gcc -lrt -O2 -lpthread -DNUM_THREADS=$NR_THREADS $FILE.c -o $FILE
  done

==== BEGIN pound_times.c ====

struct tms start;

void *pound (void *threadid)
{
  struct tms end;
  int oldutime = 0;
  int utime;
  int i;
  for (i = 0; i < 5000000 / NUM_THREADS; i++) {
          times(&end);
          utime = ((int)end.tms_utime - (int)start.tms_utime);
          if (oldutime > utime) {
            printf("utime decreased, was %d, now %d!\n", oldutime, utime);
          }
          oldutime = utime;
  }
  pthread_exit(NULL);
}

int main()
{
  pthread_t th[NUM_THREADS];
  long i;
  times(&start);
  for (i = 0; i < NUM_THREADS; i++) {
    pthread_create (&th[i], NULL, pound, (void *)i);
  }
  pthread_exit(NULL);
  return 0;
}
==== END pound_times.c ====

==== BEGIN pound_clock_gettime.c ====

void *pound (void *threadid)
{
	struct timespec ts;
	int rc, i;
	unsigned long prev = 0, this = 0;

	for (i = 0; i < 5000000 / NUM_THREADS; i++) {
		rc = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		if (rc < 0)
			perror("clock_gettime");
		this = (ts.tv_sec * 1000000000) + ts.tv_nsec;
		if (0 && this < prev)
			printf("%lu ns timewarp at iteration %d\n", prev - this, i);
		prev = this;
	}
	pthread_exit(NULL);
}

int main()
{
	pthread_t th[NUM_THREADS];
	long rc, i;
	pid_t pgid;

	for (i = 0; i < NUM_THREADS; i++) {
		rc = pthread_create(&th[i], NULL, pound, (void *)i);
		if (rc < 0)
			perror("pthread_create");
	}

	pthread_exit(NULL);
	return 0;
}
==== END pound_clock_gettime.c ====

Suggested-by: Mike Galbraith <mgalbraith@suse.de>
Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1470385316-15027-2-git-send-email-ggherdovich@suse.cz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5c883fe8e440..2a906f20fba7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 #include <linux/frame.h>
+#include <linux/prefetch.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -2971,6 +2972,23 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
+/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+	prefetch(curr);
+	prefetch(&curr->exec_start);
+}
+
 /*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
@@ -3005,6 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * thread, breaking clock_gettime().
 	 */
 	if (task_current(rq, p) && task_on_rq_queued(p)) {
+		prefetch_curr_exec_start(p);
 		update_rq_clock(rq);
 		p->sched_class->update_curr(rq);
 	}
-- 
cgit v1.3-8-gc7d7


From c0c8c9fa210c9a042060435f17e40ba4a76d6d6f Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 4 Aug 2016 09:42:20 +0800
Subject: sched/deadline: Fix lock pinning warning during CPU hotplug

The following warning can be triggered by hot-unplugging the CPU
on which an active SCHED_DEADLINE task is running on:

  WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:3531 lock_release+0x690/0x6a0
  releasing a pinned lock
  Call Trace:
   dump_stack+0x99/0xd0
   __warn+0xd1/0xf0
   ? dl_task_timer+0x1a1/0x2b0
   warn_slowpath_fmt+0x4f/0x60
   ? sched_clock+0x13/0x20
   lock_release+0x690/0x6a0
   ? enqueue_pushable_dl_task+0x9b/0xa0
   ? enqueue_task_dl+0x1ca/0x480
   _raw_spin_unlock+0x1f/0x40
   dl_task_timer+0x1a1/0x2b0
   ? push_dl_task.part.31+0x190/0x190
  WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:3649 lock_unpin_lock+0x181/0x1a0
  unpinning an unpinned lock
  Call Trace:
   dump_stack+0x99/0xd0
   __warn+0xd1/0xf0
   warn_slowpath_fmt+0x4f/0x60
   lock_unpin_lock+0x181/0x1a0
   dl_task_timer+0x127/0x2b0
   ? push_dl_task.part.31+0x190/0x190

As per the comment before this code, its safe to drop the RQ lock
here, and since we (potentially) change rq, unpin and repin to avoid
the splat.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
[ Rewrote changelog. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1470274940-17976-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..1ce8867283dc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	 *
 	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
 	 */
-	if (unlikely(!rq->online))
+	if (unlikely(!rq->online)) {
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		rq = dl_task_offline_migration(rq, p);
+		rf.cookie = lockdep_pin_lock(&rq->lock);
+	}
 
 	/*
 	 * Queueing this task back might have overloaded rq, check if we need
-- 
cgit v1.3-8-gc7d7


From 229ce631574761870a2ac938845fadbd07f35caa Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 14 Jul 2016 16:15:56 +0800
Subject: locking/pvqspinlock: Fix double hash race

When the lock holder vCPU is racing with the queue head:

   CPU 0 (lock holder)    CPU1 (queue head)
   ===================    =================
   spin_lock();           spin_lock();
    pv_kick_node():        pv_wait_head_or_lock():
                            if (!lp) {
                             lp = pv_hash(lock, pn);
                             xchg(&l->locked, _Q_SLOW_VAL);
                            }
                            WRITE_ONCE(pn->state, vcpu_halted);
     cmpxchg(&pn->state,
      vcpu_halted, vcpu_hashed);
     WRITE_ONCE(l->locked, _Q_SLOW_VAL);
     (void)pv_hash(lock, pn);

In this case, lock holder inserts the pv_node of queue head into the
hash table and set _Q_SLOW_VAL unnecessary. This patch avoids it by
restoring/setting vcpu_hashed state after failing adaptive locking
spinning.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <Waiman.Long@hpe.com>
Link: http://lkml.kernel.org/r/1468484156-4521-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_paravirt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 37649e69056c..8a99abf58080 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 				goto gotlock;
 			}
 		}
-		WRITE_ONCE(pn->state, vcpu_halted);
+		WRITE_ONCE(pn->state, vcpu_hashed);
 		qstat_inc(qstat_pv_wait_head, true);
 		qstat_inc(qstat_pv_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
-- 
cgit v1.3-8-gc7d7


From c2ace36b884de9330c4149064ae8d212d2e0d9ee Mon Sep 17 00:00:00 2001
From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Date: Wed, 13 Jul 2016 18:23:34 +0800
Subject: locking/pvqspinlock: Fix a bug in qstat_read()

It's obviously wrong to set stat to NULL. So lets remove it.
Otherwise it is always zero when we check the latency of kick/wake.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Waiman Long <Waiman.Long@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1468405414-3700-1-git-send-email-xinhui.pan@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_stat.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 22e025309845..b9d031516254 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
 		 */
 		if ((counter == qstat_pv_latency_kick) ||
 		    (counter == qstat_pv_latency_wake)) {
-			stat = 0;
 			if (kicks)
 				stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
 		}
-- 
cgit v1.3-8-gc7d7


From f9bcf1e0e0145323ba2cf72ecad5264ff3883eb1 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 11 Aug 2016 13:36:35 +0800
Subject: sched/cputime: Fix steal time accounting

Commit:

  57430218317 ("sched/cputime: Count actually elapsed irq & softirq time")

... didn't take steal time into consideration with passing the noirqtime
kernel parameter.

As Paolo pointed out before:

| Why not? If idle=poll, for example, any time the guest is suspended (and
| thus cannot poll) does count as stolen time.

This patch fixes it by reducing steal time from idle time accounting when
the noirqtime parameter is true. The average idle time drops from 56.8%
to 54.75% for nohz idle kvm guest(noirqtime, idle=poll, four vCPUs running
on one pCPU).

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1470893795-3527-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1934f658c036..8b9bcc5a58fa 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -508,13 +508,20 @@ void account_process_tick(struct task_struct *p, int user_tick)
  */
 void account_idle_ticks(unsigned long ticks)
 {
-
+	cputime_t cputime, steal;
 	if (sched_clock_irqtime) {
 		irqtime_account_idle_ticks(ticks);
 		return;
 	}
 
-	account_idle_time(jiffies_to_cputime(ticks));
+	cputime = cputime_one_jiffy;
+	steal = steal_account_process_time(cputime);
+
+	if (steal >= cputime)
+		return;
+
+	cputime -= steal;
+	account_idle_time(cputime);
 }
 
 /*
-- 
cgit v1.3-8-gc7d7


From 26f2c75cd2cf10a6120ef02ca9a94db77cc9c8e0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 11 Aug 2016 14:58:24 +0200
Subject: sched/cputime: Fix omitted ticks passed in parameter

Commit:

  f9bcf1e0e014 ("sched/cputime: Fix steal time accounting")

... fixes a leak on steal time accounting but forgets to account
the ticks passed in parameters, assuming there is only one to
take into account.

Let's consider that parameter back.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Wanpeng Li <kernellwp@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: linux-tip-commits@vger.kernel.org
Link: http://lkml.kernel.org/r/20160811125822.GB4214@lerouge
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8b9bcc5a58fa..9858266fb0b3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -509,12 +509,13 @@ void account_process_tick(struct task_struct *p, int user_tick)
 void account_idle_ticks(unsigned long ticks)
 {
 	cputime_t cputime, steal;
+
 	if (sched_clock_irqtime) {
 		irqtime_account_idle_ticks(ticks);
 		return;
 	}
 
-	cputime = cputime_one_jiffy;
+	cputime = jiffies_to_cputime(ticks);
 	steal = steal_account_process_time(cputime);
 
 	if (steal >= cputime)
-- 
cgit v1.3-8-gc7d7


From 62822e2ec4ad091ba31f823f577ef80db52e3c2c Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Thu, 11 Aug 2016 14:49:29 -0700
Subject: PM / hibernate: Restore processor state before using per-CPU
 variables

Restore the processor state before calling any other functions to
ensure per-CPU variables can be used with KASLR memory randomization.

Tracing functions use per-CPU variables (GS based on x86) and one was
called just before restoring the processor state fully. It resulted
in a double fault when both the tracing & the exception handler
functions tried to use a per-CPU variable.

Fixes: bb3632c6101b (PM / sleep: trace events for suspend/resume)
Reported-and-tested-by: Borislav Petkov <bp@suse.de>
Reported-by: Jiri Kosina <jikos@kernel.org>
Tested-by: Rafael J. Wysocki <rafael@kernel.org>
Tested-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Thomas Garnier <thgarnie@google.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0ee1df0a0bd6..61761aa7cc19 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,12 +300,12 @@ static int create_image(int platform_mode)
 	save_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
 	error = swsusp_arch_suspend();
+	/* Restore control flow magically appears here */
+	restore_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
 	if (error)
 		printk(KERN_ERR "PM: Error %d creating hibernation image\n",
 			error);
-	/* Restore control flow magically appears here */
-	restore_processor_state();
 	if (!in_suspend)
 		events_check_enabled = false;
 
-- 
cgit v1.3-8-gc7d7


From 747ea55e4f78fd980350c39570a986b8c1c3e4aa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 12 Aug 2016 22:17:17 +0200
Subject: bpf: fix bpf_skb_in_cgroup helper naming

While hashing out BPF's current_task_under_cgroup helper bits, it came
to discussion that the skb_in_cgroup helper name was suboptimally chosen.

Tejun says:

  So, I think in_cgroup should mean that the object is in that
  particular cgroup while under_cgroup in the subhierarchy of that
  cgroup. Let's rename the other subhierarchy test to under too. I
  think that'd be a lot less confusing going forward.

  [...]

  It's more intuitive and gives us the room to implement the real
  "in" test if ever necessary in the future.

Since this touches uapi bits, we need to change this as long as v4.8
is not yet officially released. Thus, change the helper enum and rename
related bits.

Fixes: 4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto")
Reference: http://patchwork.ozlabs.org/patch/658500/
Suggested-by: Sargun Dhillon <sargun@sargun.me>
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h         |  4 ++--
 kernel/bpf/verifier.c            |  4 ++--
 net/core/filter.c                | 10 +++++-----
 samples/bpf/bpf_helpers.h        |  4 ++--
 samples/bpf/test_cgrp2_tc_kern.c |  2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fec6056..9e5fc168c8a3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -339,7 +339,7 @@ enum bpf_func_id {
 	BPF_FUNC_skb_change_type,
 
 	/**
-	 * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb
+	 * bpf_skb_under_cgroup(skb, map, index) - Check cgroup2 membership of skb
 	 * @skb: pointer to skb
 	 * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
 	 * @index: index of the cgroup in the bpf_map
@@ -348,7 +348,7 @@ enum bpf_func_id {
 	 *   == 1 skb succeeded the cgroup2 descendant test
 	 *    < 0 error
 	 */
-	BPF_FUNC_skb_in_cgroup,
+	BPF_FUNC_skb_under_cgroup,
 
 	/**
 	 * bpf_get_hash_recalc(skb)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7094c69ac199..daea765d72e6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1053,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		if (func_id != BPF_FUNC_skb_in_cgroup)
+		if (func_id != BPF_FUNC_skb_under_cgroup)
 			goto error;
 		break;
 	default:
@@ -1075,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
-	case BPF_FUNC_skb_in_cgroup:
+	case BPF_FUNC_skb_under_cgroup:
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
 		break;
diff --git a/net/core/filter.c b/net/core/filter.c
index b5add4ef0d1d..bd9bf2e5fafa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2317,7 +2317,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 }
 
 #ifdef CONFIG_SOCK_CGROUP_DATA
-static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(long)r1;
 	struct bpf_map *map = (struct bpf_map *)(long)r2;
@@ -2340,8 +2340,8 @@ static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
 }
 
-static const struct bpf_func_proto bpf_skb_in_cgroup_proto = {
-	.func		= bpf_skb_in_cgroup,
+static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
+	.func		= bpf_skb_under_cgroup,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
@@ -2421,8 +2421,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
 #ifdef CONFIG_SOCK_CGROUP_DATA
-	case BPF_FUNC_skb_in_cgroup:
-		return &bpf_skb_in_cgroup_proto;
+	case BPF_FUNC_skb_under_cgroup:
+		return &bpf_skb_under_cgroup_proto;
 #endif
 	default:
 		return sk_filter_func_proto(func_id);
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 217c8d507f2e..7927a090fa0d 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -72,8 +72,8 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flag
 	(void *) BPF_FUNC_l3_csum_replace;
 static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) =
 	(void *) BPF_FUNC_l4_csum_replace;
-static int (*bpf_skb_in_cgroup)(void *ctx, void *map, int index) =
-	(void *) BPF_FUNC_skb_in_cgroup;
+static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) =
+	(void *) BPF_FUNC_skb_under_cgroup;
 
 #if defined(__x86_64__)
 
diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c
index 2732c37c8d5b..10ff73404e3a 100644
--- a/samples/bpf/test_cgrp2_tc_kern.c
+++ b/samples/bpf/test_cgrp2_tc_kern.c
@@ -57,7 +57,7 @@ int handle_egress(struct __sk_buff *skb)
 		bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg),
 				 eth->h_proto, ip6h->nexthdr);
 		return TC_ACT_OK;
-	} else if (bpf_skb_in_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) {
+	} else if (bpf_skb_under_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) {
 		bpf_trace_printk(pass_msg, sizeof(pass_msg));
 		return TC_ACT_OK;
 	} else {
-- 
cgit v1.3-8-gc7d7


From 924d8696751c4b9e58263bc82efdafcf875596a6 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Tue, 16 Aug 2016 10:46:38 +0100
Subject: PM / hibernate: Fix rtree_next_node() to avoid walking off list ends

rtree_next_node() walks the linked list of leaf nodes to find the next
block of pages in the struct memory_bitmap. If it walks off the end of
the list of nodes, it walks the list of memory zones to find the next
region of memory. If it walks off the end of the list of zones, it
returns false.

This leaves the struct bm_position's node and zone pointers pointing
at their respective struct list_heads in struct mem_zone_bm_rtree.

memory_bm_find_bit() uses struct bm_position's node and zone pointers
to avoid walking lists and trees if the next bit appears in the same
node/zone. It handles these values being stale.

Swap rtree_next_node()s 'step then test' to 'test-next then step',
this means if we reach the end of memory we return false and leave
the node and zone pointers as they were.

This fixes a panic on resume using AMD Seattle with 64K pages:
[    6.868732] Freezing user space processes ... (elapsed 0.000 seconds) done.
[    6.875753] Double checking all user space processes after OOM killer disable... (elapsed 0.000 seconds)
[    6.896453] PM: Using 3 thread(s) for decompression.
[    6.896453] PM: Loading and decompressing image data (5339 pages)...
[    7.318890] PM: Image loading progress:   0%
[    7.323395] Unable to handle kernel paging request at virtual address 00800040
[    7.330611] pgd = ffff000008df0000
[    7.334003] [00800040] *pgd=00000083fffe0003, *pud=00000083fffe0003, *pmd=00000083fffd0003, *pte=0000000000000000
[    7.344266] Internal error: Oops: 96000005 [#1] PREEMPT SMP
[    7.349825] Modules linked in:
[    7.352871] CPU: 2 PID: 1 Comm: swapper/0 Tainted: G        W I     4.8.0-rc1 #4737
[    7.360512] Hardware name: AMD Overdrive/Supercharger/Default string, BIOS ROD1002C 04/08/2016
[    7.369109] task: ffff8003c0220000 task.stack: ffff8003c0280000
[    7.375020] PC is at set_bit+0x18/0x30
[    7.378758] LR is at memory_bm_set_bit+0x24/0x30
[    7.383362] pc : [<ffff00000835bbc8>] lr : [<ffff0000080faf18>] pstate: 60000045
[    7.390743] sp : ffff8003c0283b00
[    7.473551]
[    7.475031] Process swapper/0 (pid: 1, stack limit = 0xffff8003c0280020)
[    7.481718] Stack: (0xffff8003c0283b00 to 0xffff8003c0284000)
[    7.800075] Call trace:
[    7.887097] [<ffff00000835bbc8>] set_bit+0x18/0x30
[    7.891876] [<ffff0000080fb038>] duplicate_memory_bitmap.constprop.38+0x54/0x70
[    7.899172] [<ffff0000080fcc40>] snapshot_write_next+0x22c/0x47c
[    7.905166] [<ffff0000080fe1b4>] load_image_lzo+0x754/0xa88
[    7.910725] [<ffff0000080ff0a8>] swsusp_read+0x144/0x230
[    7.916025] [<ffff0000080fa338>] load_image_and_restore+0x58/0x90
[    7.922105] [<ffff0000080fa660>] software_resume+0x2f0/0x338
[    7.927752] [<ffff000008083350>] do_one_initcall+0x38/0x11c
[    7.933314] [<ffff000008b40cc0>] kernel_init_freeable+0x14c/0x1ec
[    7.939395] [<ffff0000087ce564>] kernel_init+0x10/0xfc
[    7.944520] [<ffff000008082e90>] ret_from_fork+0x10/0x40
[    7.949820] Code: d2800022 8b400c21 f9800031 9ac32043 (c85f7c22)
[    7.955909] ---[ end trace 0024a5986e6ff323 ]---
[    7.960529] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b

Here struct mem_zone_bm_rtree's start_pfn has been returned instead of
struct rtree_node's addr as the node/zone pointers are corrupt after
we walked off the end of the lists during mark_unsafe_pages().

This behaviour was exposed by commit 6dbecfd345a6 ("PM / hibernate:
Simplify mark_unsafe_pages()"), which caused mark_unsafe_pages() to call
duplicate_memory_bitmap(), which uses memory_bm_find_bit() after walking
off the end of the memory bitmap.

Fixes: 3a20cb177961 (PM / Hibernate: Implement position keeping in radix tree)
Signed-off-by: James Morse <james.morse@arm.com>
[ rjw: Subject ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d90df926b59f..2ba18691fe76 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
-	bm->cur.node = list_entry(bm->cur.node->list.next,
-				  struct rtree_node, list);
-	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+		bm->cur.node = list_entry(bm->cur.node->list.next,
+					  struct rtree_node, list);
 		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
 		bm->cur.node_bit  = 0;
 		touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 	}
 
 	/* No more nodes, goto next zone */
-	bm->cur.zone = list_entry(bm->cur.zone->list.next,
+	if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+		bm->cur.zone = list_entry(bm->cur.zone->list.next,
 				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur.zone->list != &bm->zones) {
 		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
 					  struct rtree_node, list);
 		bm->cur.node_pfn = 0;
-- 
cgit v1.3-8-gc7d7


From 7afafc8a44bf0ab841b17d450b02aedb3a138985 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 16 Aug 2016 10:59:35 +0300
Subject: block: Fix secure erase

Commit 288dab8a35a0 ("block: add a separate operation type for secure
erase") split REQ_OP_SECURE_ERASE from REQ_OP_DISCARD without considering
all the places REQ_OP_DISCARD was being used to mean either. Fix those.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Fixes: 288dab8a35a0 ("block: add a separate operation type for secure erase")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c              | 21 +++++++++++----------
 block/blk-merge.c        | 33 +++++++++++++++++++--------------
 block/elevator.c         |  2 +-
 drivers/mmc/card/block.c |  1 +
 drivers/mmc/card/queue.c |  3 ++-
 drivers/mmc/card/queue.h |  4 +++-
 include/linux/bio.h      | 10 ++++++++--
 include/linux/blkdev.h   |  6 ++++--
 kernel/trace/blktrace.c  |  2 +-
 9 files changed, 50 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/block/bio.c b/block/bio.c
index f39477538fef..aa7354088008 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -667,18 +667,19 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
 	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
 
-	if (bio_op(bio) == REQ_OP_DISCARD)
-		goto integrity_clone;
-
-	if (bio_op(bio) == REQ_OP_WRITE_SAME) {
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
+		break;
+	case REQ_OP_WRITE_SAME:
 		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-		goto integrity_clone;
+		break;
+	default:
+		bio_for_each_segment(bv, bio_src, iter)
+			bio->bi_io_vec[bio->bi_vcnt++] = bv;
+		break;
 	}
 
-	bio_for_each_segment(bv, bio_src, iter)
-		bio->bi_io_vec[bio->bi_vcnt++] = bv;
-
-integrity_clone:
 	if (bio_integrity(bio_src)) {
 		int ret;
 
@@ -1788,7 +1789,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
 	 * Discards need a mutable bio_vec to accommodate the payload
 	 * required by the DSM TRIM and UNMAP commands.
 	 */
-	if (bio_op(bio) == REQ_OP_DISCARD)
+	if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
 		split = bio_clone_bioset(bio, gfp, bs);
 	else
 		split = bio_clone_fast(bio, gfp, bs);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3eec75a9e91d..72627e3cf91e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -172,12 +172,18 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 	struct bio *split, *res;
 	unsigned nsegs;
 
-	if (bio_op(*bio) == REQ_OP_DISCARD)
+	switch (bio_op(*bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
 		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
-	else if (bio_op(*bio) == REQ_OP_WRITE_SAME)
+		break;
+	case REQ_OP_WRITE_SAME:
 		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
-	else
+		break;
+	default:
 		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
+		break;
+	}
 
 	/* physical segments can be figured out during splitting */
 	res = split ? split : *bio;
@@ -213,7 +219,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	 * This should probably be returning 0, but blk_add_request_payload()
 	 * (Christoph!!!!)
 	 */
-	if (bio_op(bio) == REQ_OP_DISCARD)
+	if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
 		return 1;
 
 	if (bio_op(bio) == REQ_OP_WRITE_SAME)
@@ -385,7 +391,9 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
 	nsegs = 0;
 	cluster = blk_queue_cluster(q);
 
-	if (bio_op(bio) == REQ_OP_DISCARD) {
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
 		/*
 		 * This is a hack - drivers should be neither modifying the
 		 * biovec, nor relying on bi_vcnt - but because of
@@ -393,19 +401,16 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
 		 * a payload we need to set up here (thank you Christoph) and
 		 * bi_vcnt is really the only way of telling if we need to.
 		 */
-
-		if (bio->bi_vcnt)
-			goto single_segment;
-
-		return 0;
-	}
-
-	if (bio_op(bio) == REQ_OP_WRITE_SAME) {
-single_segment:
+		if (!bio->bi_vcnt)
+			return 0;
+		/* Fall through */
+	case REQ_OP_WRITE_SAME:
 		*sg = sglist;
 		bvec = bio_iovec(bio);
 		sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
 		return 1;
+	default:
+		break;
 	}
 
 	for_each_bio(bio)
diff --git a/block/elevator.c b/block/elevator.c
index 7096c22041e7..f7d973a56fd7 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -366,7 +366,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 	list_for_each_prev(entry, &q->queue_head) {
 		struct request *pos = list_entry_rq(entry);
 
-		if ((req_op(rq) == REQ_OP_DISCARD) != (req_op(pos) == REQ_OP_DISCARD))
+		if (req_op(rq) != req_op(pos))
 			break;
 		if (rq_data_dir(rq) != rq_data_dir(pos))
 			break;
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 48a5dd740f3b..82503e6f04b3 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -1726,6 +1726,7 @@ static u8 mmc_blk_prep_packed_list(struct mmc_queue *mq, struct request *req)
 			break;
 
 		if (req_op(next) == REQ_OP_DISCARD ||
+		    req_op(next) == REQ_OP_SECURE_ERASE ||
 		    req_op(next) == REQ_OP_FLUSH)
 			break;
 
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index bf14642a576a..29578e98603d 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -33,7 +33,8 @@ static int mmc_prep_request(struct request_queue *q, struct request *req)
 	/*
 	 * We only like normal block requests and discards.
 	 */
-	if (req->cmd_type != REQ_TYPE_FS && req_op(req) != REQ_OP_DISCARD) {
+	if (req->cmd_type != REQ_TYPE_FS && req_op(req) != REQ_OP_DISCARD &&
+	    req_op(req) != REQ_OP_SECURE_ERASE) {
 		blk_dump_rq_flags(req, "MMC bad request");
 		return BLKPREP_KILL;
 	}
diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h
index d62531124d54..fee5e1271465 100644
--- a/drivers/mmc/card/queue.h
+++ b/drivers/mmc/card/queue.h
@@ -4,7 +4,9 @@
 static inline bool mmc_req_is_special(struct request *req)
 {
 	return req &&
-		(req_op(req) == REQ_OP_FLUSH || req_op(req) == REQ_OP_DISCARD);
+		(req_op(req) == REQ_OP_FLUSH ||
+		 req_op(req) == REQ_OP_DISCARD ||
+		 req_op(req) == REQ_OP_SECURE_ERASE);
 }
 
 struct request;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 59ffaa68b11b..23ddf4b46a9b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -71,7 +71,8 @@ static inline bool bio_has_data(struct bio *bio)
 {
 	if (bio &&
 	    bio->bi_iter.bi_size &&
-	    bio_op(bio) != REQ_OP_DISCARD)
+	    bio_op(bio) != REQ_OP_DISCARD &&
+	    bio_op(bio) != REQ_OP_SECURE_ERASE)
 		return true;
 
 	return false;
@@ -79,7 +80,9 @@ static inline bool bio_has_data(struct bio *bio)
 
 static inline bool bio_no_advance_iter(struct bio *bio)
 {
-	return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_WRITE_SAME;
+	return bio_op(bio) == REQ_OP_DISCARD ||
+	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
+	       bio_op(bio) == REQ_OP_WRITE_SAME;
 }
 
 static inline bool bio_is_rw(struct bio *bio)
@@ -199,6 +202,9 @@ static inline unsigned bio_segments(struct bio *bio)
 	if (bio_op(bio) == REQ_OP_DISCARD)
 		return 1;
 
+	if (bio_op(bio) == REQ_OP_SECURE_ERASE)
+		return 1;
+
 	if (bio_op(bio) == REQ_OP_WRITE_SAME)
 		return 1;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c210b6a7bcf..e79055c8b577 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -882,7 +882,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 						     int op)
 {
-	if (unlikely(op == REQ_OP_DISCARD))
+	if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
 		return min(q->limits.max_discard_sectors, UINT_MAX >> 9);
 
 	if (unlikely(op == REQ_OP_WRITE_SAME))
@@ -913,7 +913,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
 	if (unlikely(rq->cmd_type != REQ_TYPE_FS))
 		return q->limits.max_hw_sectors;
 
-	if (!q->limits.chunk_sectors || (req_op(rq) == REQ_OP_DISCARD))
+	if (!q->limits.chunk_sectors ||
+	    req_op(rq) == REQ_OP_DISCARD ||
+	    req_op(rq) == REQ_OP_SECURE_ERASE)
 		return blk_queue_get_max_sectors(q, req_op(rq));
 
 	return min(blk_max_size_offset(q, offset),
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7598e6ca817a..dbafc5df03f3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, META);
 	what |= MASK_TC_BIT(op_flags, PREFLUSH);
 	what |= MASK_TC_BIT(op_flags, FUA);
-	if (op == REQ_OP_DISCARD)
+	if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);
-- 
cgit v1.3-8-gc7d7


From 1e12c4a9393b75a744aada2c8115434572698bc3 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 11 Aug 2016 14:19:42 +0100
Subject: genirq: Correctly configure the trigger on chained interrupts

Commit 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ")
moved the trigger configuration call from the irqdomain mapping to
the interrupt being actually requested.

This patch failed to handle the case where we configure a chained
interrupt, which doesn't get requested through the usual path.

In order to solve this, let's call __irq_set_trigger just before
starting the cascade interrupt. Special care must be taken to
make the flow handler stick, as the .irq_set_type method could
have reset it (it doesn't know we're dealing with a chained
interrupt).

Based on an initial patch by Jon Hunter.

Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ")
Reported-by: John Stultz <john.stultz@linaro.org>
Reported-by: Linus Walleij <linus.walleij@linaro.org>
Tested-by: John Stultz <john.stultz@linaro.org>
Acked-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 kernel/irq/chip.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b4c1bc7c9ca2..637389088b3f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -820,6 +820,17 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 	desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
+		/*
+		 * We're about to start this interrupt immediately,
+		 * hence the need to set the trigger configuration.
+		 * But the .set_type callback may have overridden the
+		 * flow handler, ignoring that we're dealing with a
+		 * chained interrupt. Reset it immediately because we
+		 * do know better.
+		 */
+		__irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data));
+		desc->handle_irq = handle;
+
 		irq_settings_set_noprobe(desc);
 		irq_settings_set_norequest(desc);
 		irq_settings_set_nothread(desc);
-- 
cgit v1.3-8-gc7d7


From 568ac888215c7fb2fabe8ea739b00ec3c1f5d440 Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Wed, 10 Aug 2016 15:43:06 -0400
Subject: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during
 fork

cgroup_threadgroup_rwsem is acquired in read mode during process exit
and fork.  It is also grabbed in write mode during
__cgroups_proc_write().  I've recently run into a scenario with lots
of memory pressure and OOM and I am beginning to see

systemd

 __switch_to+0x1f8/0x350
 __schedule+0x30c/0x990
 schedule+0x48/0xc0
 percpu_down_write+0x114/0x170
 __cgroup_procs_write.isra.12+0xb8/0x3c0
 cgroup_file_write+0x74/0x1a0
 kernfs_fop_write+0x188/0x200
 __vfs_write+0x6c/0xe0
 vfs_write+0xc0/0x230
 SyS_write+0x6c/0x110
 system_call+0x38/0xb4

This thread is waiting on the reader of cgroup_threadgroup_rwsem to
exit.  The reader itself is under memory pressure and has gone into
reclaim after fork. There are times the reader also ends up waiting on
oom_lock as well.

 __switch_to+0x1f8/0x350
 __schedule+0x30c/0x990
 schedule+0x48/0xc0
 jbd2_log_wait_commit+0xd4/0x180
 ext4_evict_inode+0x88/0x5c0
 evict+0xf8/0x2a0
 dispose_list+0x50/0x80
 prune_icache_sb+0x6c/0x90
 super_cache_scan+0x190/0x210
 shrink_slab.part.15+0x22c/0x4c0
 shrink_zone+0x288/0x3c0
 do_try_to_free_pages+0x1dc/0x590
 try_to_free_pages+0xdc/0x260
 __alloc_pages_nodemask+0x72c/0xc90
 alloc_pages_current+0xb4/0x1a0
 page_table_alloc+0xc0/0x170
 __pte_alloc+0x58/0x1f0
 copy_page_range+0x4ec/0x950
 copy_process.isra.5+0x15a0/0x1870
 _do_fork+0xa8/0x4b0
 ppc_clone+0x8/0xc

In the meanwhile, all processes exiting/forking are blocked almost
stalling the system.

This patch moves the threadgroup_change_begin from before
cgroup_fork() to just before cgroup_canfork().  There is no nee to
worry about threadgroup changes till the task is actually added to the
threadgroup.  This avoids having to call reclaim with
cgroup_threadgroup_rwsem held.

tj: Subject and description edits.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Zefan Li <lizefan@huawei.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: stable@vger.kernel.org # v4.2+
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 52e725d4a866..aaf782327bf3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1404,7 +1404,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->real_start_time = ktime_get_boot_ns();
 	p->io_context = NULL;
 	p->audit_context = NULL;
-	threadgroup_change_begin(current);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -1556,6 +1555,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
+	threadgroup_change_begin(current);
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
 	 * forked. It should be noted the the new process's css_set can be changed
@@ -1656,6 +1656,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_cancel_cgroup:
 	cgroup_cancel_fork(p);
 bad_fork_free_pid:
+	threadgroup_change_end(current);
 	if (pid != &init_struct_pid)
 		free_pid(pid);
 bad_fork_cleanup_thread:
@@ -1688,7 +1689,6 @@ bad_fork_cleanup_policy:
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_threadgroup_lock:
 #endif
-	threadgroup_change_end(current);
 	delayacct_tsk_free(p);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
-- 
cgit v1.3-8-gc7d7


From 6c4687cc17a788a6dd8de3e27dbeabb7cbd3e066 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Aug 2016 17:36:29 +0200
Subject: uprobes: Fix the memcg accounting

__replace_page() wronlgy calls mem_cgroup_cancel_charge() in "success" path,
it should only do this if page_check_address() fails.

This means that every enable/disable leads to unbalanced mem_cgroup_uncharge()
from put_page(old_page), it is trivial to underflow the page_counter->count
and trigger OOM.

Reported-and-tested-by: Brenden Blanco <bblanco@plumgrid.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: stable@vger.kernel.org # 3.17+
Fixes: 00501b531c47 ("mm: memcontrol: rewrite charge API")
Link: http://lkml.kernel.org/r/20160817153629.GB29724@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/uprobes.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525ab2083..8c50276b60d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	err = -EAGAIN;
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
-	if (!ptep)
+	if (!ptep) {
+		mem_cgroup_cancel_charge(kpage, memcg, false);
 		goto unlock;
+	}
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
-	mem_cgroup_cancel_charge(kpage, memcg, false);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
-- 
cgit v1.3-8-gc7d7


From cca2094605efe6ccf43ff2876dd5bccc799202d8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 16 Aug 2016 13:33:26 +0200
Subject: perf/core: Fix event_function_local()

Vincent reported triggering the WARN_ON_ONCE() in event_function_local().

While thinking through cases I noticed that by using event_function()
directly, we miss the inactive case usually handled by
event_function_call().

Therefore construct a blend of event_function_call() and
event_function() that handles the cases relevant to
event_function_local().

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org # 4.5+
Fixes: fae3fde65138 ("perf: Collapse and fix event_function_call() users")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 60 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1903b8f3a705..6e454bfd514f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
 	return ret;
 }
 
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-	struct event_function_struct efs = {
-		.event = event,
-		.func = func,
-		.data = data,
-	};
-
-	int ret = event_function(&efs);
-	WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct task_struct *task = READ_ONCE(ctx->task);
+	struct perf_event_context *task_ctx = NULL;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (task) {
+		if (task == TASK_TOMBSTONE)
+			return;
+
+		task_ctx = ctx;
+	}
+
+	perf_ctx_lock(cpuctx, task_ctx);
+
+	task = ctx->task;
+	if (task == TASK_TOMBSTONE)
+		goto unlock;
+
+	if (task) {
+		/*
+		 * We must be either inactive or active and the right task,
+		 * otherwise we're screwed, since we cannot IPI to somewhere
+		 * else.
+		 */
+		if (ctx->is_active) {
+			if (WARN_ON_ONCE(task != current))
+				goto unlock;
+
+			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+				goto unlock;
+		}
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
+	}
+
+	func(event, cpuctx, ctx, data);
+unlock:
+	perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 		       PERF_FLAG_FD_OUTPUT  |\
 		       PERF_FLAG_PID_CGROUP |\
-- 
cgit v1.3-8-gc7d7


From 4059ffd09d694f704e18a4baf97fc0016c32e9ad Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Mon, 18 Jul 2016 10:43:05 -0600
Subject: perf/core: Fix file name handling for start/stop filters

Binary file names have to be supplied for both range and start/stop
filters but the current code only processes the filename if an
address range filter is specified.  This code adds processing of
the filename for start/stop filters.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1468860187-318-2-git-send-email-mathieu.poirier@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6e454bfd514f..52780ef941d3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7972,8 +7972,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 					goto fail;
 			}
 
-			if (token == IF_SRC_FILE) {
-				filename = match_strdup(&args[2]);
+			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+				int fpos = filter->range ? 2 : 1;
+
+				filename = match_strdup(&args[fpos]);
 				if (!filename) {
 					ret = -ENOMEM;
 					goto fail;
-- 
cgit v1.3-8-gc7d7


From 12b40a2393719a37ff86a0b43bece6d28a75cbfc Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Mon, 18 Jul 2016 10:43:06 -0600
Subject: perf/core: Update filters only on executable mmap

Function perf_event_mmap() is called by the MM subsystem each time
part of a binary is loaded in memory.  There can be several mapping
for a binary, many times unrelated to the code section.

Each time a section of a binary is mapped address filters are
updated, event when the map doesn't pertain to the code section.
The end result is that filters are configured based on the last map
event that was received rather than the last mapping of the code
segment.

For example if we have an executable 'main' that calls library
'libcstest.so.1.0', and that we want to collect traces on code
that is in that library.  The perf cmd line for this scenario
would be:

  perf record -e cs_etm// --filter 'filter 0x72c/0x40@/opt/lib/libcstest.so.1.0' --per-thread ./main

Resulting in binaries being mapped this way:

  root@linaro-nano:~# cat /proc/1950/maps
  00400000-00401000 r-xp 00000000 08:02 33169     /home/linaro/main
  00410000-00411000 r--p 00000000 08:02 33169     /home/linaro/main
  00411000-00412000 rw-p 00001000 08:02 33169     /home/linaro/main
  7fa2464000-7fa2474000 rw-p 00000000 00:00 0
  7fa2474000-7fa25a4000 r-xp 00000000 08:02 543   /lib/aarch64-linux-gnu/libc-2.21.so
  7fa25a4000-7fa25b3000 ---p 00130000 08:02 543   /lib/aarch64-linux-gnu/libc-2.21.so
  7fa25b3000-7fa25b7000 r--p 0012f000 08:02 543   /lib/aarch64-linux-gnu/libc-2.21.so
  7fa25b7000-7fa25b9000 rw-p 00133000 08:02 543   /lib/aarch64-linux-gnu/libc-2.21.so
  7fa25b9000-7fa25bd000 rw-p 00000000 00:00 0
  7fa25bd000-7fa25be000 r-xp 00000000 08:02 38308 /opt/lib/libcstest.so.1.0
  7fa25be000-7fa25cd000 ---p 00001000 08:02 38308 /opt/lib/libcstest.so.1.0
  7fa25cd000-7fa25ce000 r--p 00000000 08:02 38308 /opt/lib/libcstest.so.1.0
  7fa25ce000-7fa25cf000 rw-p 00001000 08:02 38308 /opt/lib/libcstest.so.1.0
  7fa25cf000-7fa25eb000 r-xp 00000000 08:02 574   /lib/aarch64-linux-gnu/ld-2.21.so
  7fa25ef000-7fa25f2000 rw-p 00000000 00:00 0
  7fa25f7000-7fa25f9000 rw-p 00000000 00:00 0
  7fa25f9000-7fa25fa000 r--p 00000000 00:00 0     [vvar]
  7fa25fa000-7fa25fb000 r-xp 00000000 00:00 0     [vdso]
  7fa25fb000-7fa25fc000 r--p 0001c000 08:02 574   /lib/aarch64-linux-gnu/ld-2.21.so
  7fa25fc000-7fa25fe000 rw-p 0001d000 08:02 574   /lib/aarch64-linux-gnu/ld-2.21.so
  7ff2ea8000-7ff2ec9000 rw-p 00000000 00:00 0     [stack]
  root@linaro-nano:~#

Before 'main()' can execute 'libcstest.so.1.0' has to be loaded in
memory.  Once that has been done perf_event_mmap() has been called
4 times, with the last map starting at address 0x7fa25ce000 and
the address filter configured to start filtering when the
IP has passed over address 0x0x7fa25ce72c (0x7fa25ce000 + 0x72c).

But that is wrong since the code segment for library 'libcstest.so.1.0'
as been mapped at 0x7fa25bd000, resulting in traces not being
collected.

This patch corrects the situation by requesting that address
filters be updated only if the mapped event is for a code
segment.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1468860187-318-3-git-send-email-mathieu.poirier@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 52780ef941d3..9a030a96bc1f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6689,6 +6689,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 	struct perf_event_context *ctx;
 	int ctxn;
 
+	/*
+	 * Data tracing isn't supported yet and as such there is no need
+	 * to keep track of anything that isn't related to executable code:
+	 */
+	if (!(vma->vm_flags & VM_EXEC))
+		return;
+
 	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-- 
cgit v1.3-8-gc7d7


From 99f5bc9bfa9094e7c264a8e09f9507b391a3d1d1 Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Mon, 18 Jul 2016 10:43:07 -0600
Subject: perf/core: Enable mapping of the stop filters

At this time the perf_addr_filter_needs_mmap() function will _not_
return true on a user space 'stop' filter.  But stop filters need
exactly the same kind of mapping that range and start filters get.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1468860187-318-4-git-send-email-mathieu.poirier@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9a030a96bc1f..a5fc5c8cdfb0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6619,15 +6619,6 @@ got_name:
 	kfree(buf);
 }
 
-/*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-	return filter->filter && filter->inode;
-}
-
 /*
  * Check whether inode and address range match filter criteria.
  */
@@ -7848,7 +7839,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	list_for_each_entry(filter, &ifh->list, entry) {
 		event->addr_filters_offs[count] = 0;
 
-		if (perf_addr_filter_needs_mmap(filter))
+		/*
+		 * Adjust base offset if the filter is associated to a binary
+		 * that needs to be mapped:
+		 */
+		if (filter->inode)
 			event->addr_filters_offs[count] =
 				perf_addr_filter_apply(filter, mm);
 
-- 
cgit v1.3-8-gc7d7


From 71e7bc2bab77e64882c031c2af943c3256c1adb0 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Wed, 17 Aug 2016 13:55:04 -0700
Subject: perf/core: Check return value of the perf_event_read() IPI

The call to smp_call_function_single in perf_event_read() may fail if
an invalid or not online CPU index is passed. Warn user if such bug is
present and return error.

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1471467307-61171-2-git-send-email-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index a5fc5c8cdfb0..5650f5317e0c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3549,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group)
 			.group = group,
 			.ret = 0,
 		};
-		smp_call_function_single(event->oncpu,
-					 __perf_event_read, &data, 1);
-		ret = data.ret;
+		ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+		/* The event must have been read from an online CPU: */
+		WARN_ON_ONCE(ret);
+		ret = ret ? : data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
-- 
cgit v1.3-8-gc7d7


From 173be9a14f7b2e901cf77c18b1aafd4d672e9d9e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 15 Aug 2016 18:38:42 +0200
Subject: sched/cputime: Fix NO_HZ_FULL getrusage() monotonicity regression

Mike reports:

 Roughly 10% of the time, ltp testcase getrusage04 fails:
 getrusage04    0  TINFO  :  Expected timers granularity is 4000 us
 getrusage04    0  TINFO  :  Using 1 as multiply factor for max [us]time increment (1000+4000us)!
 getrusage04    0  TINFO  :  utime:           0us; stime:         179us
 getrusage04    0  TINFO  :  utime:        3751us; stime:           0us
 getrusage04    1  TFAIL  :  getrusage04.c:133: stime increased > 5000us:

And tracked it down to the case where the task simply doesn't get
_any_ [us]time ticks.

Update the code to assume all rtime is utime when we lack information,
thus ensuring a task that elides the tick gets time accounted.

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Fredrik Markstrom <fredrik.markstrom@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: stable@vger.kernel.org # 4.3+
Fixes: 9d7fb0427648 ("sched/cputime: Guarantee stime + utime == rtime")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9858266fb0b3..2ee83b200504 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -614,19 +614,25 @@ static void cputime_adjust(struct task_cputime *curr,
 	stime = curr->stime;
 	utime = curr->utime;
 
-	if (utime == 0) {
-		stime = rtime;
+	/*
+	 * If either stime or both stime and utime are 0, assume all runtime is
+	 * userspace. Once a task gets some ticks, the monotonicy code at
+	 * 'update' will ensure things converge to the observed ratio.
+	 */
+	if (stime == 0) {
+		utime = rtime;
 		goto update;
 	}
 
-	if (stime == 0) {
-		utime = rtime;
+	if (utime == 0) {
+		stime = rtime;
 		goto update;
 	}
 
 	stime = scale_stime((__force u64)stime, (__force u64)rtime,
 			    (__force u64)(stime + utime));
 
+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
@@ -649,7 +655,6 @@ static void cputime_adjust(struct task_cputime *curr,
 		stime = rtime - utime;
 	}
 
-update:
 	prev->stime = stime;
 	prev->utime = utime;
 out:
-- 
cgit v1.3-8-gc7d7


From 03cbc732639ddcad15218c4b2046d255851ff1e3 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Wed, 17 Aug 2016 10:05:46 +0800
Subject: sched/cputime: Resync steal time when guest & host lose sync

Commit:

  57430218317e ("sched/cputime: Count actually elapsed irq & softirq time")

... fixed a bug but also triggered a regression:

On an i5 laptop, 4 pCPUs, 4vCPUs for one full dynticks guest, there are four
CPU hog processes(for loop) running in the guest, I hot-unplug the pCPUs
on host one by one until there is only one left, then observe CPU utilization
via 'top' in the guest, it shows:

  100% st for cpu0(housekeeping)
   75% st for other CPUs (nohz full mode)

However, w/o this commit it shows the correct 75% for all four CPUs.

When a guest is interrupted for a longer amount of time, missed clock ticks
are not redelivered later. Because of that, we should not limit the amount
of steal time accounted to the amount of time that the calling functions
think have passed.

However, the interval returned by account_other_time() is NOT rounded down
to the nearest jiffy, while the base interval in get_vtime_delta() it is
subtracted from is, so the max cputime limit is required to avoid underflow.

This patch fixes the regression by limiting the account_other_time() from
get_vtime_delta() to avoid underflow, and lets the other three call sites
(in account_other_time() and steal_account_process_time()) account however
much steal time the host told us elapsed.

Suggested-by: Rik van Riel <riel@redhat.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm@vger.kernel.org
Link: http://lkml.kernel.org/r/1471399546-4069-1-git-send-email-wanpeng.li@hotmail.com
[ Improved the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2ee83b200504..a846cf89eb96 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
 		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	 * idle, or potentially user or system time. Due to rounding,
 	 * other time can exceed ticks occasionally.
 	 */
-	other = account_other_time(cputime);
+	other = account_other_time(ULONG_MAX);
 	if (other >= cputime)
 		return;
 	cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	}
 
 	cputime = cputime_one_jiffy;
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);
 
 	if (steal >= cputime)
 		return;
@@ -516,7 +521,7 @@ void account_idle_ticks(unsigned long ticks)
 	}
 
 	cputime = jiffies_to_cputime(ticks);
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);
 
 	if (steal >= cputime)
 		return;
@@ -699,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 	unsigned long now = READ_ONCE(jiffies);
 	cputime_t delta, other;
 
+	/*
+	 * Unlike tick based timing, vtime based timing never has lost
+	 * ticks, and no need for steal time accounting to make up for
+	 * lost ticks. Vtime accounts a rounded version of actual
+	 * elapsed time. Limit account_other_time to prevent rounding
+	 * errors from causing elapsed vtime to go negative.
+	 */
 	delta = jiffies_to_cputime(now - tsk->vtime_snap);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-- 
cgit v1.3-8-gc7d7


From 4396f46c8c628329bd35ee4b84140b8b001a11eb Mon Sep 17 00:00:00 2001
From: Shawn Lin <shawn.lin@rock-chips.com>
Date: Mon, 22 Aug 2016 16:21:52 +0800
Subject: genirq: Fix potential memleak when failing to get irq pm

Obviously we should free action here if irq_chip_pm_get failed.

Fixes: be45beb2df69: "genirq: Add runtime power management support for IRQ chips"
Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
Cc: Jon Hunter <jonathanh@nvidia.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1471854112-13006-1-git-send-email-shawn.lin@rock-chips.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 73a2b786b5e9..9530fcd27704 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	action->dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
@@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 	action->percpu_dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
-- 
cgit v1.3-8-gc7d7


From 3ee0ce2a54dff07d09440723594df89bc1a12e79 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 11 Aug 2016 07:06:45 -0700
Subject: genirq/affinity: Use get/put_online_cpus around cpumask operations

Without locking out CPU mask operations we might end up with an inconsistent
view of the cpumask in the function.

Fixes: 5e385a6ef31f: "genirq: Add a helper to spread an affinity mask for MSI/MSI-X vectors"
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/1470924405-25728-1-git-send-email-hch@lst.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/affinity.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f68959341c0f..32f6cfcff212 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		return NULL;
 	}
 
+	get_online_cpus();
 	if (max_vecs >= num_online_cpus()) {
 		cpumask_copy(affinity_mask, cpu_online_mask);
 		*nr_vecs = num_online_cpus();
@@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		}
 		*nr_vecs = vecs;
 	}
+	put_online_cpus();
 
 	return affinity_mask;
 }
-- 
cgit v1.3-8-gc7d7


From 27727df240c7cc84f2ba6047c6f18d5addfd25ef Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 23 Aug 2016 16:08:21 -0700
Subject: timekeeping: Avoid taking lock in NMI path with
 CONFIG_DEBUG_TIMEKEEPING

When I added some extra sanity checking in timekeeping_get_ns() under
CONFIG_DEBUG_TIMEKEEPING, I missed that the NMI safe __ktime_get_fast_ns()
method was using timekeeping_get_ns().

Thus the locking added to the debug checks broke the NMI-safety of
__ktime_get_fast_ns().

This patch open-codes the timekeeping_get_ns() logic for
__ktime_get_fast_ns(), so can avoid any deadlocks in NMI.

Fixes: 4ca22c2648f9 "timekeeping: Add warnings when overflows or underflows are observed"
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: stable <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/1471993702-29148-2-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timekeeping.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3b65746c7f15..e07fb093f819 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	do {
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
-		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+		now = ktime_to_ns(tkr->base);
+
+		now += clocksource_delta(tkr->read(tkr->clock),
+					 tkr->cycle_last, tkr->mask);
 	} while (read_seqcount_retry(&tkf->seq, seq));
 
 	return now;
-- 
cgit v1.3-8-gc7d7


From a4f8f6667f099036c88f231dcad4cf233652c824 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 23 Aug 2016 16:08:22 -0700
Subject: timekeeping: Cap array access in timekeeping_debug

It was reported that hibernation could fail on the 2nd attempt, where the
system hangs at hibernate() -> syscore_resume() -> i8237A_resume() ->
claim_dma_lock(), because the lock has already been taken.

However there is actually no other process would like to grab this lock on
that problematic platform.

Further investigation showed that the problem is triggered by setting
/sys/power/pm_trace to 1 before the 1st hibernation.

Since once pm_trace is enabled, the rtc becomes unmeaningful after suspend,
and meanwhile some BIOSes would like to adjust the 'invalid' RTC (e.g, smaller
than 1970) to the release date of that motherboard during POST stage, thus
after resumed, it may seem that the system had a significant long sleep time
which is a completely meaningless value.

Then in timekeeping_resume -> tk_debug_account_sleep_time, if the bit31 of the
sleep time happened to be set to 1, fls() returns 32 and we add 1 to
sleep_time_bin[32], which causes an out of bounds array access and therefor
memory being overwritten.

As depicted by System.map:
0xffffffff81c9d080 b sleep_time_bin
0xffffffff81c9d100 B dma_spin_lock
the dma_spin_lock.val is set to 1, which caused this problem.

This patch adds a sanity check in tk_debug_account_sleep_time()
to ensure we don't index past the sleep_time_bin array.

[jstultz: Problem diagnosed and original patch by Chen Yu, I've solved the
 issue slightly differently, but borrowed his excelent explanation of the
 issue here.]

Fixes: 5c83545f24ab "power: Add option to log time spent in suspend"
Reported-by: Janek Kozicki <cosurgi@gmail.com>
Reported-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: linux-pm@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Xunlei Pang <xpang@redhat.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: stable <stable@vger.kernel.org>
Cc: Zhang Rui <rui.zhang@intel.com>
Link: http://lkml.kernel.org/r/1471993702-29148-3-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timekeeping_debug.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
 
 #include "timekeeping_internal.h"
 
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
 {
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
 
 void tk_debug_account_sleep_time(struct timespec64 *t)
 {
-	sleep_time_bin[fls(t->tv_sec)]++;
+	/* Cap bin index so we don't overflow the array */
+	int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+	sleep_time_bin[bin]++;
 }
 
-- 
cgit v1.3-8-gc7d7


From 8b6a3fe8fab97716990a3abde1a01fb5a34552a3 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 24 Aug 2016 10:07:14 +0100
Subject: perf/core: Use this_cpu_ptr() when stopping AUX events

When tearing down an AUX buf for an event via perf_mmap_close(),
__perf_event_output_stop() is called on the event's CPU to ensure that
trace generation is halted before the process of unmapping and
freeing the buffer pages begins.

The callback is performed via cpu_function_call(), which ensures that it
runs with interrupts disabled and is therefore not preemptible.
Unfortunately, the current code grabs the per-cpu context pointer using
get_cpu_ptr(), which unnecessarily disables preemption and doesn't pair
the call with put_cpu_ptr(), leading to a preempt_count() imbalance and
a BUG when freeing the AUX buffer later on:

  WARNING: CPU: 1 PID: 2249 at kernel/events/ring_buffer.c:539 __rb_free_aux+0x10c/0x120
  Modules linked in:
  [...]
  Call Trace:
   [<ffffffff813379dd>] dump_stack+0x4f/0x72
   [<ffffffff81059ff6>] __warn+0xc6/0xe0
   [<ffffffff8105a0c8>] warn_slowpath_null+0x18/0x20
   [<ffffffff8112761c>] __rb_free_aux+0x10c/0x120
   [<ffffffff81128163>] rb_free_aux+0x13/0x20
   [<ffffffff8112515e>] perf_mmap_close+0x29e/0x2f0
   [<ffffffff8111da30>] ? perf_iterate_ctx+0xe0/0xe0
   [<ffffffff8115f685>] remove_vma+0x25/0x60
   [<ffffffff81161796>] exit_mmap+0x106/0x140
   [<ffffffff8105725c>] mmput+0x1c/0xd0
   [<ffffffff8105cac3>] do_exit+0x253/0xbf0
   [<ffffffff8105e32e>] do_group_exit+0x3e/0xb0
   [<ffffffff81068d49>] get_signal+0x249/0x640
   [<ffffffff8101c273>] do_signal+0x23/0x640
   [<ffffffff81905f42>] ? _raw_write_unlock_irq+0x12/0x30
   [<ffffffff81905f69>] ? _raw_spin_unlock_irq+0x9/0x10
   [<ffffffff81901896>] ? __schedule+0x2c6/0x710
   [<ffffffff810022a4>] exit_to_usermode_loop+0x74/0x90
   [<ffffffff81002a56>] prepare_exit_to_usermode+0x26/0x30
   [<ffffffff81906d1b>] retint_user+0x8/0x10

This patch uses this_cpu_ptr() instead of get_cpu_ptr(), since preemption is
already disabled by the caller.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Fixes: 95ff4ca26c49 ("perf/core: Free AUX pages in unmap path")
Link: http://lkml.kernel.org/r/20160824091905.GA16944@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5650f5317e0c..3cfabdf7b942 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6166,7 +6166,7 @@ static int __perf_pmu_output_stop(void *info)
 {
 	struct perf_event *event = info;
 	struct pmu *pmu = event->pmu;
-	struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 	struct remote_output ro = {
 		.rb	= event->rb,
 	};
-- 
cgit v1.3-8-gc7d7


From e7d316a02f683864a12389f8808570e37fb90aa3 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Thu, 25 Aug 2016 15:16:51 -0700
Subject: sysctl: handle error writing UINT_MAX to u32 fields

We have scripts which write to certain fields on 3.18 kernels but this
seems to be failing on 4.4 kernels.  An entry which we write to here is
xfrm_aevent_rseqth which is u32.

  echo 4294967295  > /proc/sys/net/core/xfrm_aevent_rseqth

Commit 230633d109e3 ("kernel/sysctl.c: detect overflows when converting
to int") prevented writing to sysctl entries when integer overflow
occurs.  However, this does not apply to unsigned integers.

Heinrich suggested that we introduce a new option to handle 64 bit
limits and set min as 0 and max as UINT_MAX.  This might not work as it
leads to issues similar to __do_proc_doulongvec_minmax.  Alternatively,
we would need to change the datatype of the entry to 64 bit.

  static int __do_proc_doulongvec_minmax(void *data, struct ctl_table
  {
      i = (unsigned long *) data;   //This cast is causing to read beyond the size of data (u32)
      vleft = table->maxlen / sizeof(unsigned long); //vleft is 0 because maxlen is sizeof(u32) which is lesser than sizeof(unsigned long) on x86_64.

Introduce a new proc handler proc_douintvec.  Individual proc entries
will need to be updated to use the new handler.

[akpm@linux-foundation.org: coding-style fixes]
Fixes: 230633d109e3 ("kernel/sysctl.c:detect overflows when converting to int")
Link: http://lkml.kernel.org/r/1471479806-5252-1-git-send-email-subashab@codeaurora.org
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Cc: Heinrich Schuchardt <xypron.glpk@gmx.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h |  2 ++
 kernel/sysctl.c        | 45 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 697e160c78d0..a4f7203a9017 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -42,6 +42,8 @@ extern int proc_dostring(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
 extern int proc_dointvec(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
+extern int proc_douintvec(struct ctl_table *, int,
+			 void __user *, size_t *, loff_t *);
 extern int proc_dointvec_minmax(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
 extern int proc_dointvec_jiffies(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b43d0b27c1fe..a13bbdaab47d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2140,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 	return 0;
 }
 
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+				 int *valp,
+				 int write, void *data)
+{
+	if (write) {
+		if (*negp)
+			return -EINVAL;
+		*valp = *lvalp;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long)val;
+	}
+	return 0;
+}
+
 static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2259,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
 int proc_dointvec(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,buffer,lenp,ppos,
-		    	    NULL,NULL);
+	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+				do_proc_douintvec_conv, NULL);
 }
 
 /*
@@ -2858,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_douintvec(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -2903,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
  * exception granted :-)
  */
 EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
-- 
cgit v1.3-8-gc7d7


From ae6c33ba6e37eea3012fe2640b22400ef3f2d0f3 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Thu, 25 Aug 2016 15:17:00 -0700
Subject: printk: fix parsing of "brl=" option

Commit bbeddf52adc1 ("printk: move braille console support into separate
braille.[ch] files") moved the parsing of braille-related options into
_braille_console_setup(), changing the type of variable str from char*
to char**.  In this commit, memcmp(str, "brl,", 4) was correctly updated
to memcmp(*str, "brl,", 4) but not memcmp(str, "brl=", 4).

Update the code to make "brl=" option work again and replace memcmp()
with strncmp() to make the compiler able to detect such an issue.

Fixes: bbeddf52adc1 ("printk: move braille console support into separate braille.[ch] files")
Link: http://lkml.kernel.org/r/20160823165700.28952-1-nicolas.iooss_linux@m4x.org
Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/braille.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
 
 char *_braille_console_setup(char **str, char **brl_options)
 {
-	if (!memcmp(*str, "brl,", 4)) {
+	if (!strncmp(*str, "brl,", 4)) {
 		*brl_options = "";
 		*str += 4;
-	} else if (!memcmp(str, "brl=", 4)) {
+	} else if (!strncmp(*str, "brl=", 4)) {
 		*brl_options = *str + 4;
 		*str = strchr(*brl_options, ',');
 		if (!*str)
-- 
cgit v1.3-8-gc7d7


From 485a252a5559b45d7df04c819ec91177c62c270b Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 10 Aug 2016 16:28:09 -0700
Subject: seccomp: Fix tracer exit notifications during fatal signals

This fixes a ptrace vs fatal pending signals bug as manifested in
seccomp now that seccomp was reordered to happen after ptrace. The
short version is that seccomp should not attempt to call do_exit()
while fatal signals are pending under a tracer. The existing code was
trying to be as defensively paranoid as possible, but it now ends up
confusing ptrace. Instead, the syscall can just be skipped (which solves
the original concern that the do_exit() was addressing) and normal signal
handling, tracer notification, and process death can happen.

Paraphrasing from the original bug report:

If a tracee task is in a PTRACE_EVENT_SECCOMP trap, or has been resumed
after such a trap but not yet been scheduled, and another task in the
thread-group calls exit_group(), then the tracee task exits without the
ptracer receiving a PTRACE_EVENT_EXIT notification. Test case here:
https://gist.github.com/khuey/3c43ac247c72cef8c956ca73281c9be7

The bug happens because when __seccomp_filter() detects
fatal_signal_pending(), it calls do_exit() without dequeuing the fatal
signal. When do_exit() sends the PTRACE_EVENT_EXIT notification and
that task is descheduled, __schedule() notices that there is a fatal
signal pending and changes its state from TASK_TRACED to TASK_RUNNING.
That prevents the ptracer's waitpid() from returning the ptrace event.
A more detailed analysis is here:
https://github.com/mozilla/rr/issues/1762#issuecomment-237396255.

Reported-by: Robert O'Callahan <robert@ocallahan.org>
Reported-by: Kyle Huey <khuey@kylehuey.com>
Tested-by: Kyle Huey <khuey@kylehuey.com>
Fixes: 93e35efb8de4 ("x86/ptrace: run seccomp after ptrace")
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: James Morris <james.l.morris@oracle.com>
---
 kernel/seccomp.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ef6c6c3f9d8a..0db7c8a2afe2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -605,12 +605,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 		ptrace_event(PTRACE_EVENT_SECCOMP, data);
 		/*
 		 * The delivery of a fatal signal during event
-		 * notification may silently skip tracer notification.
-		 * Terminating the task now avoids executing a system
-		 * call that may not be intended.
+		 * notification may silently skip tracer notification,
+		 * which could leave us with a potentially unmodified
+		 * syscall that the tracer would have liked to have
+		 * changed. Since the process is about to die, we just
+		 * force the syscall to be skipped and let the signal
+		 * kill the process and correctly handle any tracer exit
+		 * notifications.
 		 */
 		if (fatal_signal_pending(current))
-			do_exit(SIGSYS);
+			goto skip;
 		/* Check if the tracer forced the syscall to be skipped. */
 		this_syscall = syscall_get_nr(current, task_pt_regs(current));
 		if (this_syscall < 0)
-- 
cgit v1.3-8-gc7d7


From cd81a9170e69e018bbaba547c1fd85a585f5697a Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mguzik@redhat.com>
Date: Tue, 23 Aug 2016 16:20:38 +0200
Subject: mm: introduce get_task_exe_file

For more convenient access if one has a pointer to the task.

As a minor nit take advantage of the fact that only task lock + rcu are
needed to safely grab ->exe_file. This saves mm refcount dance.

Use the helper in proc_exe_link.

Signed-off-by: Mateusz Guzik <mguzik@redhat.com>
Acked-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Acked-by: Richard Guy Briggs <rgb@redhat.com>
Cc: <stable@vger.kernel.org> # 4.3.x
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 fs/proc/base.c     |  7 +------
 include/linux/mm.h |  1 +
 kernel/fork.c      | 23 +++++++++++++++++++++++
 3 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0d163a84082d..da8b1943ba04 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1552,18 +1552,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
 static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
 	struct task_struct *task;
-	struct mm_struct *mm;
 	struct file *exe_file;
 
 	task = get_proc_task(d_inode(dentry));
 	if (!task)
 		return -ENOENT;
-	mm = get_task_mm(task);
+	exe_file = get_task_exe_file(task);
 	put_task_struct(task);
-	if (!mm)
-		return -ENOENT;
-	exe_file = get_mm_exe_file(mm);
-	mmput(mm);
 	if (exe_file) {
 		*exe_path = exe_file->f_path;
 		path_get(&exe_file->f_path);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f468e0d2534..004c73a988b7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1987,6 +1987,7 @@ extern void mm_drop_all_locks(struct mm_struct *mm);
 
 extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
 extern struct file *get_mm_exe_file(struct mm_struct *mm);
+extern struct file *get_task_exe_file(struct task_struct *task);
 
 extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
 extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
diff --git a/kernel/fork.c b/kernel/fork.c
index d277e83ed3e0..42451aeb245f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -773,6 +773,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(get_mm_exe_file);
 
+/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+	struct file *exe_file = NULL;
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm) {
+		if (!(task->flags & PF_KTHREAD))
+			exe_file = get_mm_exe_file(mm);
+	}
+	task_unlock(task);
+	return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
+
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
-- 
cgit v1.3-8-gc7d7


From 5efc244346f9f338765da3d592f7947b0afdc4b5 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mguzik@redhat.com>
Date: Tue, 23 Aug 2016 16:20:39 +0200
Subject: audit: fix exe_file access in audit_exe_compare

Prior to the change the function would blindly deference mm, exe_file
and exe_file->f_inode, each of which could have been NULL or freed.

Use get_task_exe_file to safely obtain stable exe_file.

Signed-off-by: Mateusz Guzik <mguzik@redhat.com>
Acked-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Acked-by: Richard Guy Briggs <rgb@redhat.com>
Cc: <stable@vger.kernel.org> # 4.3.x
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_watch.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 3cf1c5978d39..4846691957da 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -19,6 +19,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/file.h>
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
@@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
 	unsigned long ino;
 	dev_t dev;
 
-	rcu_read_lock();
-	exe_file = rcu_dereference(tsk->mm->exe_file);
+	exe_file = get_task_exe_file(tsk);
+	if (!exe_file)
+		return 0;
 	ino = exe_file->f_inode->i_ino;
 	dev = exe_file->f_inode->i_sb->s_dev;
-	rcu_read_unlock();
+	fput(exe_file);
 	return audit_mark_compare(mark, ino, dev);
 }
-- 
cgit v1.3-8-gc7d7


From 070c43eea5043e950daa423707ae3c77e2f48edb Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Date: Thu, 1 Sep 2016 16:14:44 -0700
Subject: kexec: fix double-free when failing to relocate the purgatory

If kexec_apply_relocations fails, kexec_load_purgatory frees pi->sechdrs
and pi->purgatory_buf.  This is redundant, because in case of error
kimage_file_prepare_segments calls kimage_file_post_load_cleanup, which
will also free those buffers.

This causes two warnings like the following, one for pi->sechdrs and the
other for pi->purgatory_buf:

  kexec-bzImage64: Loading purgatory failed
  ------------[ cut here ]------------
  WARNING: CPU: 1 PID: 2119 at mm/vmalloc.c:1490 __vunmap+0xc1/0xd0
  Trying to vfree() nonexistent vm area (ffffc90000e91000)
  Modules linked in:
  CPU: 1 PID: 2119 Comm: kexec Not tainted 4.8.0-rc3+ #5
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  Call Trace:
    dump_stack+0x4d/0x65
    __warn+0xcb/0xf0
    warn_slowpath_fmt+0x4f/0x60
    ? find_vmap_area+0x19/0x70
    ? kimage_file_post_load_cleanup+0x47/0xb0
    __vunmap+0xc1/0xd0
    vfree+0x2e/0x70
    kimage_file_post_load_cleanup+0x5e/0xb0
    SyS_kexec_file_load+0x448/0x680
    ? putname+0x54/0x60
    ? do_sys_open+0x190/0x1f0
    entry_SYSCALL_64_fastpath+0x13/0x8f
  ---[ end trace 158bb74f5950ca2b ]---

Fix by setting pi->sechdrs an pi->purgatory_buf to NULL, since vfree
won't try to free a NULL pointer.

Link: http://lkml.kernel.org/r/1472083546-23683-1-git-send-email-bauerman@linux.vnet.ibm.com
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_file.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 503bc2d348e5..037c321c5618 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
 	return 0;
 out:
 	vfree(pi->sechdrs);
+	pi->sechdrs = NULL;
+
 	vfree(pi->purgatory_buf);
+	pi->purgatory_buf = NULL;
 	return ret;
 }
 
-- 
cgit v1.3-8-gc7d7


From 236dec051078a8691950f56949612b4b74107e48 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 1 Sep 2016 16:14:47 -0700
Subject: kconfig: tinyconfig: provide whole choice blocks to avoid warnings

Using "make tinyconfig" produces a couple of annoying warnings that show
up for build test machines all the time:

    .config:966:warning: override: NOHIGHMEM changes choice state
    .config:965:warning: override: SLOB changes choice state
    .config:963:warning: override: KERNEL_XZ changes choice state
    .config:962:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state
    .config:933:warning: override: SLOB changes choice state
    .config:930:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state
    .config:870:warning: override: SLOB changes choice state
    .config:868:warning: override: KERNEL_XZ changes choice state
    .config:867:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state

I've made a previous attempt at fixing them and we discussed a number of
alternatives.

I tried changing the Makefile to use "merge_config.sh -n
$(fragment-list)" but couldn't get that to work properly.

This is yet another approach, based on the observation that we do want
to see a warning for conflicting 'choice' options, and that we can
simply make them non-conflicting by listing all other options as
disabled.  This is a trivial patch that we can apply independent of
plans for other changes.

Link: http://lkml.kernel.org/r/20160829214952.1334674-2-arnd@arndb.de
Link: https://storage.kernelci.org/mainline/v4.7-rc6/x86-tinyconfig/build.log
https://patchwork.kernel.org/patch/9212749/
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/configs/tiny.config | 2 ++
 kernel/configs/tiny.config   | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 4e2ecfa23c15..4b429df40d7a 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1 +1,3 @@
 CONFIG_NOHIGHMEM=y
+# CONFIG_HIGHMEM4G is not set
+# CONFIG_HIGHMEM64G is not set
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2de56ab0fce..7fa0c4ae6394 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -1,4 +1,12 @@
+# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_KERNEL_GZIP is not set
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
 CONFIG_KERNEL_XZ=y
+# CONFIG_KERNEL_LZO is not set
+# CONFIG_KERNEL_LZ4 is not set
 CONFIG_OPTIMIZE_INLINING=y
+# CONFIG_SLAB is not set
+# CONFIG_SLUB is not set
 CONFIG_SLOB=y
-- 
cgit v1.3-8-gc7d7


From 19feeff18bbfde659baa58c2346f15a24d7c405e Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Date: Thu, 1 Sep 2016 16:15:04 -0700
Subject: printk/nmi: avoid direct printk()-s from __printk_nmi_flush()

__printk_nmi_flush() can be called from nmi_panic(), therefore it has to
test whether it's executed in NMI context and thus must route the
messages through deferred printk() or via direct printk().

This is to avoid potential deadlocks, as described in commit
cf9b1106c81c ("printk/nmi: flush NMI messages on the system panic").

However there remain two places where __printk_nmi_flush() does
unconditional direct printk() calls:

 - pr_err("printk_nmi_flush: internal error ...")
 - pr_cont("\n")

Factor out print_nmi_seq_line() parts into a new printk_nmi_flush_line()
function, which takes care of in_nmi(), and use it in
__printk_nmi_flush() for printing and error-reporting.

Link: http://lkml.kernel.org/r/20160830161354.581-1-sergey.senozhatsky@gmail.com
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/nmi.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index b69eb8a2876f..16bab471c7e2 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -99,26 +99,32 @@ again:
 	return add;
 }
 
-/*
- * printk one line from the temporary buffer from @start index until
- * and including the @end index.
- */
-static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void printk_nmi_flush_line(const char *text, int len)
 {
-	const char *buf = s->buffer + start;
-
 	/*
 	 * The buffers are flushed in NMI only on panic.  The messages must
 	 * go only into the ring buffer at this stage.  Consoles will get
 	 * explicitly called later when a crashdump is not generated.
 	 */
 	if (in_nmi())
-		printk_deferred("%.*s", (end - start) + 1, buf);
+		printk_deferred("%.*s", len, text);
 	else
-		printk("%.*s", (end - start) + 1, buf);
+		printk("%.*s", len, text);
 
 }
 
+/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
+					int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk_nmi_flush_line(buf, (end - start) + 1);
+}
+
 /*
  * Flush data from the associated per_CPU buffer. The function
  * can be called either via IRQ work or independently.
@@ -150,9 +156,11 @@ more:
 	 * the buffer an unexpected way. If we printed something then
 	 * @len must only increase.
 	 */
-	if (i && i >= len)
-		pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
-		       i, len);
+	if (i && i >= len) {
+		const char *msg = "printk_nmi_flush: internal error\n";
+
+		printk_nmi_flush_line(msg, strlen(msg));
+	}
 
 	if (!len)
 		goto out; /* Someone else has already flushed the buffer. */
@@ -166,14 +174,14 @@ more:
 	/* Print line by line. */
 	for (; i < size; i++) {
 		if (s->buffer[i] == '\n') {
-			print_nmi_seq_line(s, last_i, i);
+			printk_nmi_flush_seq_line(s, last_i, i);
 			last_i = i + 1;
 		}
 	}
 	/* Check if there was a partial line. */
 	if (last_i < size) {
-		print_nmi_seq_line(s, last_i, size - 1);
-		pr_cont("\n");
+		printk_nmi_flush_seq_line(s, last_i, size - 1);
+		printk_nmi_flush_line("\n", strlen("\n"));
 	}
 
 	/*
-- 
cgit v1.3-8-gc7d7


From c11600e4fed67ae4cd6a8096936afd445410e8ed Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 1 Sep 2016 16:15:07 -0700
Subject: mm, mempolicy: task->mempolicy must be NULL before dropping final
 reference

KASAN allocates memory from the page allocator as part of
kmem_cache_free(), and that can reference current->mempolicy through any
number of allocation functions.  It needs to be NULL'd out before the
final reference is dropped to prevent a use-after-free bug:

	BUG: KASAN: use-after-free in alloc_pages_current+0x363/0x370 at addr ffff88010b48102c
	CPU: 0 PID: 15425 Comm: trinity-c2 Not tainted 4.8.0-rc2+ #140
	...
	Call Trace:
		dump_stack
		kasan_object_err
		kasan_report_error
		__asan_report_load2_noabort
		alloc_pages_current	<-- use after free
		depot_save_stack
		save_stack
		kasan_slab_free
		kmem_cache_free
		__mpol_put		<-- free
		do_exit

This patch sets current->mempolicy to NULL before dropping the final
reference.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1608301442180.63329@chino.kir.corp.google.com
Fixes: cd11016e5f52 ("mm, kasan: stackdepot implementation. Enable stackdepot for SLAB")
Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: <stable@vger.kernel.org>	[4.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h |  4 ++++
 kernel/exit.c             |  7 +------
 mm/mempolicy.c            | 17 +++++++++++++++++
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 4429d255c8ab..5e5b2969d931 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -195,6 +195,7 @@ static inline bool vma_migratable(struct vm_area_struct *vma)
 }
 
 extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern void mpol_put_task_policy(struct task_struct *);
 
 #else
 
@@ -297,5 +298,8 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
 	return -1; /* no node preference */
 }
 
+static inline void mpol_put_task_policy(struct task_struct *task)
+{
+}
 #endif /* CONFIG_NUMA */
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f974ae042a6..091a78be3b09 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -848,12 +848,7 @@ void do_exit(long code)
 	TASKS_RCU(preempt_enable());
 	exit_notify(tsk, group_dead);
 	proc_exit_connector(tsk);
-#ifdef CONFIG_NUMA
-	task_lock(tsk);
-	mpol_put(tsk->mempolicy);
-	tsk->mempolicy = NULL;
-	task_unlock(tsk);
-#endif
+	mpol_put_task_policy(tsk);
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
 		kfree(current->pi_state_cache);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d8c4e38fb5f4..2da72a5b6ecc 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2336,6 +2336,23 @@ out:
 	return ret;
 }
 
+/*
+ * Drop the (possibly final) reference to task->mempolicy.  It needs to be
+ * dropped after task->mempolicy is set to NULL so that any allocation done as
+ * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
+ * policy.
+ */
+void mpol_put_task_policy(struct task_struct *task)
+{
+	struct mempolicy *pol;
+
+	task_lock(task);
+	pol = task->mempolicy;
+	task->mempolicy = NULL;
+	task_unlock(task);
+	mpol_put(pol);
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
-- 
cgit v1.3-8-gc7d7


From 735f2770a770156100f534646158cb58cb8b2939 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 1 Sep 2016 16:15:13 -0700
Subject: kernel/fork: fix CLONE_CHILD_CLEARTID regression in nscd

Commit fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal
exit") has caused a subtle regression in nscd which uses
CLONE_CHILD_CLEARTID to clear the nscd_certainly_running flag in the
shared databases, so that the clients are notified when nscd is
restarted.  Now, when nscd uses a non-persistent database, clients that
have it mapped keep thinking the database is being updated by nscd, when
in fact nscd has created a new (anonymous) one (for non-persistent
databases it uses an unlinked file as backend).

The original proposal for the CLONE_CHILD_CLEARTID change claimed
(https://lkml.org/lkml/2006/10/25/233):

: The NPTL library uses the CLONE_CHILD_CLEARTID flag on clone() syscalls
: on behalf of pthread_create() library calls.  This feature is used to
: request that the kernel clear the thread-id in user space (at an address
: provided in the syscall) when the thread disassociates itself from the
: address space, which is done in mm_release().
:
: Unfortunately, when a multi-threaded process incurs a core dump (such as
: from a SIGSEGV), the core-dumping thread sends SIGKILL signals to all of
: the other threads, which then proceed to clear their user-space tids
: before synchronizing in exit_mm() with the start of core dumping.  This
: misrepresents the state of process's address space at the time of the
: SIGSEGV and makes it more difficult for someone to debug NPTL and glibc
: problems (misleading him/her to conclude that the threads had gone away
: before the fault).
:
: The fix below is to simply avoid the CLONE_CHILD_CLEARTID action if a
: core dump has been initiated.

The resulting patch from Roland (https://lkml.org/lkml/2006/10/26/269)
seems to have a larger scope than the original patch asked for.  It
seems that limitting the scope of the check to core dumping should work
for SIGSEGV issue describe above.

[Changelog partly based on Andreas' description]
Fixes: fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit")
Link: http://lkml.kernel.org/r/1471968749-26173-1-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Tested-by: William Preston <wpreston@suse.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: Andreas Schwab <schwab@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index aaf782327bf3..93bdba13d7d9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -913,14 +913,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	deactivate_mm(tsk, mm);
 
 	/*
-	 * If we're exiting normally, clear a user-space tid field if
-	 * requested.  We leave this alone when dying by signal, to leave
-	 * the value intact in a core dump, and to save the unnecessary
-	 * trouble, say, a killed vfork parent shouldn't touch this mm.
-	 * Userland only wants this done for a sys_exit.
+	 * Signal userspace if we're not exiting with a core dump
+	 * because we want to leave the value intact for debugging
+	 * purposes.
 	 */
 	if (tsk->clear_child_tid) {
-		if (!(tsk->flags & PF_SIGNALED) &&
+		if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
 		    atomic_read(&mm->mm_users) > 1) {
 			/*
 			 * We don't check the error code - if userspace has
-- 
cgit v1.3-8-gc7d7


From 08d072599234c959b0b82b63fa252c129225a899 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Fri, 2 Sep 2016 14:38:23 +0800
Subject: tick/nohz: Fix softlockup on scheduler stalls in kvm guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tick_nohz_start_idle() is prevented to be called if the idle tick can't
be stopped since commit 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle
enter"). As a result, after suspend/resume the host machine, full dynticks
kvm guest will softlockup:

 NMI watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [swapper/0:0]
 Call Trace:
  default_idle+0x31/0x1a0
  arch_cpu_idle+0xf/0x20
  default_idle_call+0x2a/0x50
  cpu_startup_entry+0x39b/0x4d0
  rest_init+0x138/0x140
  ? rest_init+0x5/0x140
  start_kernel+0x4c1/0x4ce
  ? set_init_arg+0x55/0x55
  ? early_idt_handler_array+0x120/0x120
  x86_64_start_reservations+0x24/0x26
  x86_64_start_kernel+0x142/0x14f

In addition, cat /proc/stat | grep cpu in guest or host:

cpu  398 16 5049 15754 5490 0 1 46 0 0
cpu0 206 5 450 0 0 0 1 14 0 0
cpu1 81 0 3937 3149 1514 0 0 9 0 0
cpu2 45 6 332 6052 2243 0 0 11 0 0
cpu3 65 2 328 6552 1732 0 0 11 0 0

The idle and iowait states are weird 0 for cpu0(housekeeping).

The bug is present in both guest and host kernels, and they both have
cpu0's idle and iowait states issue, however, host kernel's suspend/resume
path etc will touch watchdog to avoid the softlockup.

- The watchdog will not be touched in tick_nohz_stop_idle path (need be
  touched since the scheduler stall is expected) if idle_active flags are
  not detected.
- The idle and iowait states will not be accounted when exit idle loop
  (resched or interrupt) if idle start time and idle_active flags are
  not set.

This patch fixes it by reverting commit 1f3b0f8243cb934 since can't stop
idle tick doesn't mean can't be idle.

Fixes: 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter")
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Sanjeev Yadav<sanjeev.yadav@spreadtrum.com>
Cc: Gaurav Jindal<gaurav.jindal@spreadtrum.com>
Cc: stable@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Link: http://lkml.kernel.org/r/1472798303-4154-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 204fdc86863d..2ec7c00228f3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -908,10 +908,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
 	ktime_t now, expires;
 	int cpu = smp_processor_id();
 
+	now = tick_nohz_start_idle(ts);
+
 	if (can_stop_idle_tick(cpu, ts)) {
 		int was_stopped = ts->tick_stopped;
 
-		now = tick_nohz_start_idle(ts);
 		ts->idle_calls++;
 
 		expires = tick_nohz_stop_sched_tick(ts, now, cpu);
-- 
cgit v1.3-8-gc7d7


From 58763148758057ffc447bf990321d3ea86d199a0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 30 Aug 2016 10:15:03 +0200
Subject: perf/core: Remove WARN from perf_event_read()

This effectively reverts commit:

  71e7bc2bab77 ("perf/core: Check return value of the perf_event_read() IPI")

... and puts in a comment explaining why we ignore the return value.

Reported-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 71e7bc2bab77 ("perf/core: Check return value of the perf_event_read() IPI")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3cfabdf7b942..07ac8596a728 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3549,10 +3549,18 @@ static int perf_event_read(struct perf_event *event, bool group)
 			.group = group,
 			.ret = 0,
 		};
-		ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
-		/* The event must have been read from an online CPU: */
-		WARN_ON_ONCE(ret);
-		ret = ret ? : data.ret;
+		/*
+		 * Purposely ignore the smp_call_function_single() return
+		 * value.
+		 *
+		 * If event->oncpu isn't a valid CPU it means the event got
+		 * scheduled out and that will have updated the event count.
+		 *
+		 * Therefore, either way, we'll have an up-to-date event count
+		 * after this.
+		 */
+		(void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+		ret = data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
-- 
cgit v1.3-8-gc7d7


From 135e8c9250dd5c8c9aae5984fde6f230d0cbfeaf Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Mon, 5 Sep 2016 13:16:40 +1000
Subject: sched/core: Fix a race between try_to_wake_up() and a woken up task

The origin of the issue I've seen is related to
a missing memory barrier between check for task->state and
the check for task->on_rq.

The task being woken up is already awake from a schedule()
and is doing the following:

	do {
		schedule()
		set_current_state(TASK_(UN)INTERRUPTIBLE);
	} while (!cond);

The waker, actually gets stuck doing the following in
try_to_wake_up():

	while (p->on_cpu)
		cpu_relax();

Analysis:

The instance I've seen involves the following race:

 CPU1					CPU2

 while () {
   if (cond)
     break;
   do {
     schedule();
     set_current_state(TASK_UN..)
   } while (!cond);
					wakeup_routine()
					  spin_lock_irqsave(wait_lock)
   raw_spin_lock_irqsave(wait_lock)	  wake_up_process()
 }					  try_to_wake_up()
 set_current_state(TASK_RUNNING);	  ..
 list_del(&waiter.list);

CPU2 wakes up CPU1, but before it can get the wait_lock and set
current state to TASK_RUNNING the following occurs:

 CPU3
 wakeup_routine()
 raw_spin_lock_irqsave(wait_lock)
 if (!list_empty)
   wake_up_process()
   try_to_wake_up()
   raw_spin_lock_irqsave(p->pi_lock)
   ..
   if (p->on_rq && ttwu_wakeup())
   ..
   while (p->on_cpu)
     cpu_relax()
   ..

CPU3 tries to wake up the task on CPU1 again since it finds
it on the wait_queue, CPU1 is spinning on wait_lock, but immediately
after CPU2, CPU3 got it.

CPU3 checks the state of p on CPU1, it is TASK_UNINTERRUPTIBLE and
the task is spinning on the wait_lock. Interestingly since p->on_rq
is checked under pi_lock, I've noticed that try_to_wake_up() finds
p->on_rq to be 0. This was the most confusing bit of the analysis,
but p->on_rq is changed under runqueue lock, rq_lock, the p->on_rq
check is not reliable without this fix IMHO. The race is visible
(based on the analysis) only when ttwu_queue() does a remote wakeup
via ttwu_queue_remote. In which case the p->on_rq change is not
done uder the pi_lock.

The result is that after a while the entire system locks up on
the raw_spin_irqlock_save(wait_lock) and the holder spins infintely

Reproduction of the issue:

The issue can be reproduced after a long run on my system with 80
threads and having to tweak available memory to very low and running
memory stress-ng mmapfork test. It usually takes a long time to
reproduce. I am trying to work on a test case that can reproduce
the issue faster, but thats work in progress. I am still testing the
changes on my still in a loop and the tests seem OK thus far.

Big thanks to Benjamin and Nick for helping debug this as well.
Ben helped catch the missing barrier, Nick caught every missing
bit in my theory.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
[ Updated comment to clarify matching barriers. Many
  architectures do not have a full barrier in switch_to()
  so that cannot be relied upon. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <nicholas.piggin@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/e02cce7b-d9ca-1ad0-7a61-ea97c7582b37@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a906f20fba7..44817c640e99 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2016,6 +2016,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
+	/*
+	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
+	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
+	 * in smp_cond_load_acquire() below.
+	 *
+	 * sched_ttwu_pending()                 try_to_wake_up()
+	 *   [S] p->on_rq = 1;                  [L] P->state
+	 *       UNLOCK rq->lock  -----.
+	 *                              \
+	 *				 +---   RMB
+	 * schedule()                   /
+	 *       LOCK rq->lock    -----'
+	 *       UNLOCK rq->lock
+	 *
+	 * [task p]
+	 *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+	 *
+	 * Pairs with the UNLOCK+LOCK on rq->lock from the
+	 * last wakeup of our task and the schedule that got our task
+	 * current.
+	 */
+	smp_rmb();
 	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
 
-- 
cgit v1.3-8-gc7d7


From c86d06ba2818c5126078cb0cf4e0175ec381045b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 5 Sep 2016 08:38:13 -0400
Subject: PM / QoS: avoid calling cancel_delayed_work_sync() during early boot

of_clk_init() ends up calling into pm_qos_update_request() very early
during boot where irq is expected to stay disabled.
pm_qos_update_request() uses cancel_delayed_work_sync() which
correctly assumes that irq is enabled on invocation and
unconditionally disables and re-enables it.

Gate cancel_delayed_work_sync() invocation with kevented_up() to avoid
enabling irq unexpectedly during early boot.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Qiao Zhou <qiaozhou@asrmicro.com>
Link: http://lkml.kernel.org/r/d2501c4c-8e7b-bea3-1b01-000b36b5dfe9@asrmicro.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/qos.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..168ff442ebde 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req,
 		return;
 	}
 
-	cancel_delayed_work_sync(&req->work);
+	/*
+	 * This function may be called very early during boot, for example,
+	 * from of_clk_init(), where irq needs to stay disabled.
+	 * cancel_delayed_work_sync() assumes that irq is enabled on
+	 * invocation and re-enables it on return.  Avoid calling it until
+	 * workqueue is initialized.
+	 */
+	if (keventd_up())
+		cancel_delayed_work_sync(&req->work);
+
 	__pm_qos_update_request(req, new_value);
 }
 EXPORT_SYMBOL_GPL(pm_qos_update_request);
-- 
cgit v1.3-8-gc7d7


From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 7 Sep 2016 08:51:21 -0700
Subject: mm: fix cache mode of dax pmd mappings

track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as
uncacheable rendering them impractical for application usage.  DAX-pte
mappings are cached and the goal of establishing DAX-pmd mappings is to
attain more performance, not dramatically less (3 orders of magnitude).

track_pfn_insert() relies on a previous call to reserve_memtype() to
establish the expected page_cache_mode for the range.  While memremap()
arranges for reserve_memtype() to be called, devm_memremap_pages() does
not.  So, teach track_pfn_insert() and untrack_pfn() how to handle
tracking without a vma, and arrange for devm_memremap_pages() to
establish the write-back-cache reservation in the memtype tree.

Cc: <stable@vger.kernel.org>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Toshi Kani <toshi.kani@hpe.com>
Reported-by: Kai Zhang <kai.ka.zhang@oracle.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/mm/pat.c | 17 ++++++++++-------
 kernel/memremap.c |  9 +++++++++
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ecb1b69c1651..170cc4ff057b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma)
 }
 
 /*
- * prot is passed in as a parameter for the new mapping. If the vma has a
- * linear pfn mapping for the entire range reserve the entire vma range with
- * single reserve_pfn_range call.
+ * prot is passed in as a parameter for the new mapping. If the vma has
+ * a linear pfn mapping for the entire range, or no vma is provided,
+ * reserve the entire pfn + size range with single reserve_pfn_range
+ * call.
  */
 int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 		    unsigned long pfn, unsigned long addr, unsigned long size)
@@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 	enum page_cache_mode pcm;
 
 	/* reserve the whole chunk starting from paddr */
-	if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+	if (!vma || (addr == vma->vm_start
+				&& size == (vma->vm_end - vma->vm_start))) {
 		int ret;
 
 		ret = reserve_pfn_range(paddr, size, prot, 0);
-		if (!ret)
+		if (ret == 0 && vma)
 			vma->vm_flags |= VM_PAT;
 		return ret;
 	}
@@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 	resource_size_t paddr;
 	unsigned long prot;
 
-	if (!(vma->vm_flags & VM_PAT))
+	if (vma && !(vma->vm_flags & VM_PAT))
 		return;
 
 	/* free the chunk starting from pfn or the whole chunk */
@@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 		size = vma->vm_end - vma->vm_start;
 	}
 	free_pfn_range(paddr, size);
-	vma->vm_flags &= ~VM_PAT;
+	if (vma)
+		vma->vm_flags &= ~VM_PAT;
 }
 
 /*
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 251d16b4cb41..b501e390bb34 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(resource_size(res), SECTION_SIZE);
 	arch_remove_memory(align_start, align_size);
+	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
 	pgmap_radix_release(res);
 	dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
 			"%s: failed to free all reserved pages\n", __func__);
@@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
 	resource_size_t key, align_start, align_size, align_end;
+	pgprot_t pgprot = PAGE_KERNEL;
 	struct dev_pagemap *pgmap;
 	struct page_map *page_map;
 	int error, nid, is_ram;
@@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	if (nid < 0)
 		nid = numa_mem_id();
 
+	error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0,
+			align_size);
+	if (error)
+		goto err_pfn_remap;
+
 	error = arch_add_memory(nid, align_start, align_size, true);
 	if (error)
 		goto err_add_memory;
@@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	return __va(res->start);
 
  err_add_memory:
+	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+ err_pfn_remap:
  err_radix:
 	pgmap_radix_release(res);
 	devres_free(page_map);
-- 
cgit v1.3-8-gc7d7


From 767ae08678c2c796bcd7f582ee457aee20a28a1e Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 6 Sep 2016 16:23:49 +0300
Subject: perf/core: Fix a race between mmap_close() and set_output() of AUX
 events

In the mmap_close() path we need to stop all the AUX events that are
writing data to the AUX area that we are unmapping, before we can
safely free the pages. To determine if an event needs to be stopped,
we're comparing its ->rb against the one that's getting unmapped.
However, a SET_OUTPUT ioctl may turn up inside an AUX transaction
and swizzle event::rb to some other ring buffer, but the transaction
will keep writing data to the old ring buffer until the event gets
scheduled out. At this point, mmap_close() will skip over such an
event and will proceed to free the AUX area, while it's still being
used by this event, which will set off a warning in the mmap_close()
path and cause a memory corruption.

To avoid this, always stop an AUX event before its ->rb is updated;
this will release the (potentially) last reference on the AUX area
of the buffer. If the event gets restarted, its new ring buffer will
be used. If another SET_OUTPUT comes and switches it back to the
old ring buffer that's getting unmapped, it's also fine: this
ring buffer's aux_mmap_count will be zero and AUX transactions won't
start any more.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160906132353.19887-2-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 07ac8596a728..a54f2c2cdb20 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2496,11 +2496,11 @@ static int __perf_event_stop(void *info)
 	return 0;
 }
 
-static int perf_event_restart(struct perf_event *event)
+static int perf_event_stop(struct perf_event *event, int restart)
 {
 	struct stop_event_data sd = {
 		.event		= event,
-		.restart	= 1,
+		.restart	= restart,
 	};
 	int ret = 0;
 
@@ -4845,6 +4845,19 @@ static void ring_buffer_attach(struct perf_event *event,
 		spin_unlock_irqrestore(&rb->event_lock, flags);
 	}
 
+	/*
+	 * Avoid racing with perf_mmap_close(AUX): stop the event
+	 * before swizzling the event::rb pointer; if it's getting
+	 * unmapped, its aux_mmap_count will be 0 and it won't
+	 * restart. See the comment in __perf_pmu_output_stop().
+	 *
+	 * Data will inevitably be lost when set_output is done in
+	 * mid-air, but then again, whoever does it like this is
+	 * not in for the data anyway.
+	 */
+	if (has_aux(event))
+		perf_event_stop(event, 0);
+
 	rcu_assign_pointer(event->rb, rb);
 
 	if (old_rb) {
@@ -6120,7 +6133,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
 	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
 	if (restart)
-		perf_event_restart(event);
+		perf_event_stop(event, 1);
 }
 
 void perf_event_exec(void)
@@ -6164,7 +6177,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
 
 	/*
 	 * In case of inheritance, it will be the parent that links to the
-	 * ring-buffer, but it will be the child that's actually using it:
+	 * ring-buffer, but it will be the child that's actually using it.
+	 *
+	 * We are using event::rb to determine if the event should be stopped,
+	 * however this may race with ring_buffer_attach() (through set_output),
+	 * which will make us skip the event that actually needs to be stopped.
+	 * So ring_buffer_attach() has to stop an aux event before re-assigning
+	 * its rb pointer.
 	 */
 	if (rcu_dereference(parent->rb) == rb)
 		ro->err = __perf_event_stop(&sd);
@@ -6678,7 +6697,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
 	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
 	if (restart)
-		perf_event_restart(event);
+		perf_event_stop(event, 1);
 }
 
 /*
@@ -7867,7 +7886,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	mmput(mm);
 
 restart:
-	perf_event_restart(event);
+	perf_event_stop(event, 1);
 }
 
 /*
-- 
cgit v1.3-8-gc7d7


From b79ccadd6bb10e72cf784a298ca6dc1398eb9a24 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 6 Sep 2016 16:23:50 +0300
Subject: perf/core: Fix aux_mmap_count vs aux_refcount order

The order of accesses to ring buffer's aux_mmap_count and aux_refcount
has to be preserved across the users, namely perf_mmap_close() and
perf_aux_output_begin(), otherwise the inversion can result in the latter
holding the last reference to the aux buffer and subsequently free'ing
it in atomic context, triggering a warning.

> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 257 at kernel/events/ring_buffer.c:541 __rb_free_aux+0x11a/0x130
> CPU: 0 PID: 257 Comm: stopbug Not tainted 4.8.0-rc1+ #2596
> Call Trace:
>  [<ffffffff810f3e0b>] __warn+0xcb/0xf0
>  [<ffffffff810f3f3d>] warn_slowpath_null+0x1d/0x20
>  [<ffffffff8121182a>] __rb_free_aux+0x11a/0x130
>  [<ffffffff812127a8>] rb_free_aux+0x18/0x20
>  [<ffffffff81212913>] perf_aux_output_begin+0x163/0x1e0
>  [<ffffffff8100c33a>] bts_event_start+0x3a/0xd0
>  [<ffffffff8100c42d>] bts_event_add+0x5d/0x80
>  [<ffffffff81203646>] event_sched_in.isra.104+0xf6/0x2f0
>  [<ffffffff8120652e>] group_sched_in+0x6e/0x190
>  [<ffffffff8120694e>] ctx_sched_in+0x2fe/0x5f0
>  [<ffffffff81206ca0>] perf_event_sched_in+0x60/0x80
>  [<ffffffff81206d1b>] ctx_resched+0x5b/0x90
>  [<ffffffff81207281>] __perf_event_enable+0x1e1/0x240
>  [<ffffffff81200639>] event_function+0xa9/0x180
>  [<ffffffff81202000>] ? perf_cgroup_attach+0x70/0x70
>  [<ffffffff8120203f>] remote_function+0x3f/0x50
>  [<ffffffff811971f3>] flush_smp_call_function_queue+0x83/0x150
>  [<ffffffff81197bd3>] generic_smp_call_function_single_interrupt+0x13/0x60
>  [<ffffffff810a6477>] smp_call_function_single_interrupt+0x27/0x40
>  [<ffffffff81a26ea9>] call_function_single_interrupt+0x89/0x90
>  [<ffffffff81120056>] finish_task_switch+0xa6/0x210
>  [<ffffffff81120017>] ? finish_task_switch+0x67/0x210
>  [<ffffffff81a1e83d>] __schedule+0x3dd/0xb50
>  [<ffffffff81a1efe5>] schedule+0x35/0x80
>  [<ffffffff81128031>] sys_sched_yield+0x61/0x70
>  [<ffffffff81a25be5>] entry_SYSCALL_64_fastpath+0x18/0xa8
> ---[ end trace 6235f556f5ea83a9 ]---

This patch puts the checks in perf_aux_output_begin() in the same order
as that of perf_mmap_close().

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160906132353.19887-3-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/ring_buffer.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ae9b90dc9a5a..257fa460b846 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	if (!rb)
 		return NULL;
 
-	if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+	if (!rb_has_aux(rb))
 		goto err;
 
 	/*
-	 * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
-	 * the aux buffer is in perf_mmap_close(), about to get freed.
+	 * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
+	 * about to get freed, so we leave immediately.
+	 *
+	 * Checking rb::aux_mmap_count and rb::refcount has to be done in
+	 * the same order, see perf_mmap_close. Otherwise we end up freeing
+	 * aux pages in this path, which is a bug, because in_atomic().
 	 */
 	if (!atomic_read(&rb->aux_mmap_count))
-		goto err_put;
+		goto err;
+
+	if (!atomic_inc_not_zero(&rb->aux_refcount))
+		goto err;
 
 	/*
 	 * Nesting is not supported for AUX area, make sure nested
-- 
cgit v1.3-8-gc7d7


From 28b89b9e6f7b6c8fef7b3af39828722bca20cfee Mon Sep 17 00:00:00 2001
From: Joonwoo Park <joonwoop@codeaurora.org>
Date: Sun, 11 Sep 2016 21:14:58 -0700
Subject: cpuset: handle race between CPU hotplug and cpuset_hotplug_work

A discrepancy between cpu_online_mask and cpuset's effective_cpus
mask is inevitable during hotplug since cpuset defers updating of
effective_cpus mask using a workqueue, during which time nothing
prevents the system from more hotplug operations.  For that reason
guarantee_online_cpus() walks up the cpuset hierarchy until it finds
an intersection under the assumption that top cpuset's effective_cpus
mask intersects with cpu_online_mask even with such a race occurring.

However a sequence of CPU hotplugs can open a time window, during which
none of the effective CPUs in the top cpuset intersect with
cpu_online_mask.

For example when there are 4 possible CPUs 0-3 and only CPU0 is online:

  ========================  ===========================
   cpu_online_mask           top_cpuset.effective_cpus
  ========================  ===========================
   echo 1 > cpu2/online.
   CPU hotplug notifier woke up hotplug work but not yet scheduled.
      [0,2]                     [0]

   echo 0 > cpu0/online.
   The workqueue is still runnable.
      [2]                       [0]
  ========================  ===========================

  Now there is no intersection between cpu_online_mask and
  top_cpuset.effective_cpus.  Thus invoking sys_sched_setaffinity() at
  this moment can cause following:

   Unable to handle kernel NULL pointer dereference at virtual address 000000d0
   ------------[ cut here ]------------
   Kernel BUG at ffffffc0001389b0 [verbose debug info unavailable]
   Internal error: Oops - BUG: 96000005 [#1] PREEMPT SMP
   Modules linked in:
   CPU: 2 PID: 1420 Comm: taskset Tainted: G        W       4.4.8+ #98
   task: ffffffc06a5c4880 ti: ffffffc06e124000 task.ti: ffffffc06e124000
   PC is at guarantee_online_cpus+0x2c/0x58
   LR is at cpuset_cpus_allowed+0x4c/0x6c
   <snip>
   Process taskset (pid: 1420, stack limit = 0xffffffc06e124020)
   Call trace:
   [<ffffffc0001389b0>] guarantee_online_cpus+0x2c/0x58
   [<ffffffc00013b208>] cpuset_cpus_allowed+0x4c/0x6c
   [<ffffffc0000d61f0>] sched_setaffinity+0xc0/0x1ac
   [<ffffffc0000d6374>] SyS_sched_setaffinity+0x98/0xac
   [<ffffffc000085cb0>] el0_svc_naked+0x24/0x28

The top cpuset's effective_cpus are guaranteed to be identical to
cpu_online_mask eventually.  Hence fall back to cpu_online_mask when
there is no intersection between top cpuset's effective_cpus and
cpu_online_mask.

Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: cgroups@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: <stable@vger.kernel.org> # 3.17+
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c27e53326bef..c08585ad996b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = {
 /*
  * Return in pmask the portion of a cpusets's cpus_allowed that
  * are online.  If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.  The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
  *
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_mask.
@@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = {
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
-	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
 		cs = parent_cs(cs);
+		if (unlikely(!cs)) {
+			/*
+			 * The top cpuset doesn't have any online cpu as a
+			 * consequence of a race between cpuset_hotplug_work
+			 * and cpu hotplug notifier.  But we know the top
+			 * cpuset's effective_cpus is on its way to to be
+			 * identical to cpu_online_mask.
+			 */
+			cpumask_copy(pmask, cpu_online_mask);
+			return;
+		}
+	}
 	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 }
 
-- 
cgit v1.3-8-gc7d7


From 8a15b81741879fa89601b03c0e50b0d780d65bc0 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Fri, 16 Sep 2016 13:02:37 +0000
Subject: cpuset: fix non static symbol warning

Fixes the following sparse warning:

kernel/cpuset.c:2088:6: warning:
 symbol 'cpuset_fork' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c08585ad996b..2b4c20ab5bbe 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2085,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
  * which could have been changed by cpuset just after it inherits the
  * state from the parent and before it sits on the cgroup's task list.
  */
-void cpuset_fork(struct task_struct *task)
+static void cpuset_fork(struct task_struct *task)
 {
 	if (task_css_is_root(task, cpuset_cgrp_id))
 		return;
-- 
cgit v1.3-8-gc7d7


From 1984e075915cbae65336a99b1879865080d8e55e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 19 Sep 2016 09:49:27 +0100
Subject: genirq: Skip chained interrupt trigger setup if type is IRQ_TYPE_NONE

There is no point in trying to configure the trigger of a chained
interrupt if no trigger information has been configured. At best
this is ignored, and at the worse this confuses the underlying
irqchip (which is likely not to handle such a thing), and
unnecessarily alarms the user.

Only apply the configuration if type is not IRQ_TYPE_NONE.

Fixes: 1e12c4a9393b ("genirq: Correctly configure the trigger on chained interrupts")
Reported-and-tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Link: https://lkml.kernel.org/r/CAMuHMdVW1eTn20=EtYcJ8hkVwohaSuH_yQXrY2MGBEvZ8fpFOg@mail.gmail.com
Link: http://lkml.kernel.org/r/1474274967-15984-1-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/chip.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 637389088b3f..26ba5654d9d5 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -820,6 +820,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 	desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
+		unsigned int type = irqd_get_trigger_type(&desc->irq_data);
+
 		/*
 		 * We're about to start this interrupt immediately,
 		 * hence the need to set the trigger configuration.
@@ -828,8 +830,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 		 * chained interrupt. Reset it immediately because we
 		 * do know better.
 		 */
-		__irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data));
-		desc->handle_irq = handle;
+		if (type != IRQ_TYPE_NONE) {
+			__irq_set_trigger(desc, type);
+			desc->handle_irq = handle;
+		}
 
 		irq_settings_set_noprobe(desc);
 		irq_settings_set_norequest(desc);
-- 
cgit v1.3-8-gc7d7


From d979a39d7242e0601bf9b60e89628fb8ac577179 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@fb.com>
Date: Mon, 19 Sep 2016 14:44:38 -0700
Subject: cgroup: duplicate cgroup reference when cloning sockets

When a socket is cloned, the associated sock_cgroup_data is duplicated
but not its reference on the cgroup.  As a result, the cgroup reference
count will underflow when both sockets are destroyed later on.

Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Link: http://lkml.kernel.org/r/20160914194846.11153-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: <stable@vger.kernel.org>	[4.5+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 6 ++++++
 net/core/sock.c | 5 ++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7f5221..5e8dab5bf9ad 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6270,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 	if (cgroup_sk_alloc_disabled)
 		return;
 
+	/* Socket clone path */
+	if (skcd->val) {
+		cgroup_get(sock_cgroup_ptr(skcd));
+		return;
+	}
+
 	rcu_read_lock();
 
 	while (true) {
diff --git a/net/core/sock.c b/net/core/sock.c
index 25dab8b60223..fd7b41edf1ce 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1362,7 +1362,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		if (!try_module_get(prot->owner))
 			goto out_free_sec;
 		sk_tx_queue_clear(sk);
-		cgroup_sk_alloc(&sk->sk_cgrp_data);
 	}
 
 	return sk;
@@ -1422,6 +1421,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sock_net_set(sk, net);
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
+		cgroup_sk_alloc(&sk->sk_cgrp_data);
 		sock_update_classid(&sk->sk_cgrp_data);
 		sock_update_netprioidx(&sk->sk_cgrp_data);
 	}
@@ -1566,6 +1566,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_priority = 0;
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
+
+		cgroup_sk_alloc(&newsk->sk_cgrp_data);
+
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
-- 
cgit v1.3-8-gc7d7


From 3bf6215a1b30db7df6083c708caab3fe1a8e8abe Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 20 Sep 2016 18:48:11 +0300
Subject: perf/core: Limit matching exclusive events to one PMU

An "exclusive" PMU is the one that can only have one event scheduled in
at any given time. There may be more than one of such PMUs in a system,
though, like Intel PT and BTS. It should be allowed to have one event
for either of those inside the same context (there may be other constraints
that may prevent this, but those would be hardware-specific). However,
the exclusivity code is written so that only one event from any of the
"exclusive" PMUs is allowed in a context.

Fix this by making the exclusive event filter explicitly match two events'
PMUs.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160920154811.3255-3-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index a54f2c2cdb20..fc9bb2225291 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3929,7 +3929,7 @@ static void exclusive_event_destroy(struct perf_event *event)
 
 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
 {
-	if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+	if ((e1->pmu == e2->pmu) &&
 	    (e1->cpu == e2->cpu ||
 	     e1->cpu == -1 ||
 	     e2->cpu == -1))
-- 
cgit v1.3-8-gc7d7


From 9157056da8f8c4a6305f15619e269f164b63a6de Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 23 Sep 2016 16:55:49 -0400
Subject: cgroup: fix invalid controller enable rejections with cgroup
 namespace

On the v2 hierarchy, "cgroup.subtree_control" rejects controller
enables if the cgroup has processes in it.  The enforcement of this
logic assumes that the cgroup wouldn't have any css_sets associated
with it if there are no tasks in the cgroup, which is no longer true
since a79a908fd2b0 ("cgroup: introduce cgroup namespaces").

When a cgroup namespace is created, it pins the css_set of the
creating task to use it as the root css_set of the namespace.  This
extra reference stays as long as the namespace is around and makes
"cgroup.subtree_control" think that the namespace root cgroup is not
empty even when it is and thus reject controller enables.

Fix it by making cgroup_subtree_control() walk and test emptiness of
each css_set instead of testing whether the list_head is empty.

While at it, update the comment of cgroup_task_count() to indicate
that the returned value may be higher than the number of tasks, which
has always been true due to temporary references and doesn't break
anything.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Evgeny Vereshchagin <evvers@ya.ru>
Cc: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Cc: Aditya Kali <adityakali@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: stable@vger.kernel.org # v4.6+
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces")
Link: https://github.com/systemd/systemd/pull/3589#issuecomment-249089541
---
 kernel/cgroup.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7f5221..0d4ee1ea5c31 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3446,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	 * Except for the root, subtree_control must be zero for a cgroup
 	 * with tasks so that child cgroups don't compete against tasks.
 	 */
-	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
-		ret = -EBUSY;
-		goto out_unlock;
+	if (enable && cgroup_parent(cgrp)) {
+		struct cgrp_cset_link *link;
+
+		/*
+		 * Because namespaces pin csets too, @cgrp->cset_links
+		 * might not be empty even when @cgrp is empty.  Walk and
+		 * verify each cset.
+		 */
+		spin_lock_irq(&css_set_lock);
+
+		ret = 0;
+		list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+			if (css_set_populated(link->cset)) {
+				ret = -EBUSY;
+				break;
+			}
+		}
+
+		spin_unlock_irq(&css_set_lock);
+
+		if (ret)
+			goto out_unlock;
 	}
 
 	/* save and update control masks and prepare csses */
@@ -3899,7 +3918,9 @@ void cgroup_file_notify(struct cgroup_file *cfile)
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
  *
- * Return the number of tasks in the cgroup.
+ * Return the number of tasks in the cgroup.  The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
  */
 static int cgroup_task_count(const struct cgroup *cgrp)
 {
-- 
cgit v1.3-8-gc7d7


From 1245800c0f96eb6ebb368593e251d66c01e61022 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 23 Sep 2016 22:57:13 -0400
Subject: tracing: Move mutex to protect against resetting of seq data

The iter->seq can be reset outside the protection of the mutex. So can
reading of user data. Move the mutex up to the beginning of the function.

Fixes: d7350c3f45694 ("tracing/core: make the read callbacks reentrants")
Cc: stable@vger.kernel.org # 2.6.30+
Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4bd6b68a0b..8fb4847b0450 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4890,19 +4890,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	struct trace_iterator *iter = filp->private_data;
 	ssize_t sret;
 
-	/* return any leftover data */
-	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
-	if (sret != -EBUSY)
-		return sret;
-
-	trace_seq_init(&iter->seq);
-
 	/*
 	 * Avoid more than one consumer on a single file descriptor
 	 * This is just a matter of traces coherency, the ring buffer itself
 	 * is protected.
 	 */
 	mutex_lock(&iter->mutex);
+
+	/* return any leftover data */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (sret != -EBUSY)
+		goto out;
+
+	trace_seq_init(&iter->seq);
+
 	if (iter->trace->read) {
 		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
 		if (sret)
-- 
cgit v1.3-8-gc7d7


From 1ae2293dd6d2f5c823cf97e60b70d03631cd622f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 17 Sep 2016 18:31:46 -0400
Subject: fix memory leaks in tracing_buffers_splice_read()

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/trace/trace.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8fb4847b0450..77eeab2776ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5930,9 +5930,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		return -EBUSY;
 #endif
 
-	if (splice_grow_spd(pipe, &spd))
-		return -ENOMEM;
-
 	if (*ppos & (PAGE_SIZE - 1))
 		return -EINVAL;
 
@@ -5942,6 +5939,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 	}
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
  again:
 	trace_access_lock(iter->cpu_file);
 	entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
@@ -5999,19 +5999,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	/* did we read anything? */
 	if (!spd.nr_pages) {
 		if (ret)
-			return ret;
+			goto out;
 
+		ret = -EAGAIN;
 		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
-			return -EAGAIN;
+			goto out;
 
 		ret = wait_on_pipe(iter, true);
 		if (ret)
-			return ret;
+			goto out;
 
 		goto again;
 	}
 
 	ret = splice_to_pipe(pipe, &spd);
+out:
 	splice_shrink_spd(&spd);
 
 	return ret;
-- 
cgit v1.3-8-gc7d7