From d8eabc37310a92df40d07c5a8afc53cebf996716 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2019 12:36:50 +0100 Subject: x86/msr-index: Cleanup bit defines Greg pointed out that speculation related bit defines are using (1 << N) format instead of BIT(N). Aside from that, (1 << N) is wrong as it should use 1UL at least. Clean it up. [ Josh Poimboeuf: Fix tools build ] Reported-by: Greg Kroah-Hartman Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/include/asm/msr-index.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8e40c2446fd1..e4074556c37b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H +#include <linux/bits.h> + /* * CPU model specific register (MSR) numbers. * @@ -40,14 +42,14 @@ /* Intel MSRs. Some also available on other CPUs */ #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ -#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ +#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ -#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ +#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ -#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ -#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ +#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ #define MSR_PPIN_CTL 0x0000004e #define MSR_PPIN 0x0000004f @@ -69,20 +71,20 @@ #define MSR_MTRRcap 0x000000fe #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a -#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ -#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ -#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ -#define ARCH_CAP_SSB_NO (1 << 4) /* - * Not susceptible to Speculative Store Bypass - * attack, so no Speculative Store Bypass - * control required. - */ +#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ +#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ +#define ARCH_CAP_SSB_NO BIT(4) /* + * Not susceptible to Speculative Store Bypass + * attack, so no Speculative Store Bypass + * control required. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b -#define L1D_FLUSH (1 << 0) /* - * Writeback and invalidate the - * L1 data cache. - */ +#define L1D_FLUSH BIT(0) /* - * Writeback and invalidate the - * L1 data cache. - */ +#define L1D_FLUSH BIT(0) /* + * Writeback and invalidate the + * L1 data cache. + */ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e -- cgit v1.2.3-59-g8ed1b From 36ad35131adacc29b328b9c8b6277a8bf0d6fd5d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Feb 2019 10:10:23 +0100 Subject: x86/speculation: Consolidate CPU whitelists The CPU vulnerability whitelists have some overlap and there are more whitelists coming along.
Use the driver_data field in the x86_cpu_id struct to denote the whitelisted vulnerabilities and combine all whitelists into one. Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/kernel/cpu/common.c | 110 +++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..26ec15034f86 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -948,61 +948,72 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -static const __initconst struct x86_cpu_id cpu_no_speculation[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_TABLET, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_MID, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL, X86_FEATURE_ANY }, - { X86_VENDOR_CENTAUR, 5 }, - { X86_VENDOR_INTEL, 5 }, - { X86_VENDOR_NSC, 5 }, - { X86_VENDOR_ANY, 4 }, +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) + +#define VULNWL(_vendor, _family, _model, _whitelist) \ + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } + +#define VULNWL_INTEL(model, whitelist) \ + VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) + +#define VULNWL_AMD(family, whitelist) \ + VULNWL(AMD, family, X86_MODEL_ANY, whitelist) + +#define VULNWL_HYGON(family, whitelist) \ + VULNWL(HYGON, family, X86_MODEL_ANY, whitelist) + +static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF), + + VULNWL_INTEL(CORE_YONAH, NO_SSB), + + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT, NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_L1TF), + + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF), + + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF), {} }; -static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { - { X86_VENDOR_AMD }, - { X86_VENDOR_HYGON }, - {} -}; - -/* Only list CPUs which speculate but are non susceptible to SSB */ -static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { - { X86_VENDOR_INTEL, 6, 
INTEL_FAM6_ATOM_SILVERMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, - { X86_VENDOR_AMD, 0x12, }, - { X86_VENDOR_AMD, 0x11, }, - { X86_VENDOR_AMD, 0x10, }, - { X86_VENDOR_AMD, 0xf, }, - {} -}; +static bool __init cpu_matches(unsigned long which) +{ + const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); -static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { - /* in addition to cpu_no_speculation */ - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT_MID }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, - {} -}; + return m && !!(m->driver_data & which); +} static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; - if (x86_match_cpu(cpu_no_speculation)) + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); @@ -1011,15 +1022,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!x86_match_cpu(cpu_no_spec_store_bypass) && - !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (x86_match_cpu(cpu_no_meltdown)) + if (cpu_matches(NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -1028,7 +1038,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (x86_match_cpu(cpu_no_l1tf)) + if (cpu_matches(NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); -- cgit v1.2.3-59-g8ed1b From ed5194c2732c8084af9fd159c146ea92bf137128 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 18 Jan 2019 16:50:16 -0800 Subject: x86/speculation/mds: Add basic bug infrastructure for MDS Microarchitectural Data Sampling (MDS) is a class of side channel attacks on internal buffers in Intel CPUs. The variants are: - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126) - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130) - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127) MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a dependent load (store-to-load forwarding) as an optimization. The forward can also happen to a faulting or assisting load operation for a different memory address, which can be exploited under certain conditions. Store buffers are partitioned between Hyper-Threads so cross thread forwarding is not possible. But if a thread enters or exits a sleep state the store buffer is repartitioned which can expose data from one thread to the other. MFBDS leaks Fill Buffer Entries.
Fill buffers are used internally to manage L1 miss situations and to hold data which is returned or sent in response to a memory or I/O operation. Fill buffers can forward data to a load operation and also write data to the cache. When the fill buffer is deallocated it can retain the stale data of the preceding operations which can then be forwarded to a faulting or assisting load operation, which can be exploited under certain conditions. Fill buffers are shared between Hyper-Threads so cross thread leakage is possible. MLPDS leaks Load Port Data. Load ports are used to perform load operations from memory or I/O. The received data is then forwarded to the register file or a subsequent operation. In some implementations the Load Port can contain stale data from a previous operation which can be forwarded to faulting or assisting loads under certain conditions, which again can be exploited eventually. Load ports are shared between Hyper-Threads so cross thread leakage is possible. All variants have the same mitigation for the single CPU thread case (SMT off), so the kernel can treat them as one MDS issue. Add the basic infrastructure to detect if the current CPU is affected by MDS. [ tglx: Rewrote changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/include/asm/msr-index.h | 5 +++++ arch/x86/kernel/cpu/common.c | 25 ++++++++++++++++--------- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6d6122524711..ae3f987b24f1 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -344,6 +344,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ @@ -381,5 +382,6 @@ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ +#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e4074556c37b..e2d30636c03f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -79,6 +79,11 @@ * attack, so no Speculative Store Bypass * control required. */ +#define ARCH_CAP_MDS_NO BIT(5) /* + * Not susceptible to + * Microarchitectural Data + * Sampling (MDS) vulnerabilities.
+ */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 26ec15034f86..e34817bca504 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -952,6 +952,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_MELTDOWN BIT(1) #define NO_SSB BIT(2) #define NO_L1TF BIT(3) +#define NO_MDS BIT(4) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -971,6 +972,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + /* Intel Family 6 */ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), @@ -987,18 +989,20 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(CORE_YONAH, NO_SSB), VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF), - VULNWL_INTEL(ATOM_GOLDMONT, NO_L1TF), - VULNWL_INTEL(ATOM_GOLDMONT_X, NO_L1TF), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_L1TF), - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF), + + /* AMD Family 0xf - 0x12 */ + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), {} }; @@ -1029,6 +1033,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) + setup_force_cpu_bug(X86_BUG_MDS); + if (cpu_matches(NO_MELTDOWN)) return; -- cgit v1.2.3-59-g8ed1b From e261f209c3666e842fd645a1e31f001c3a26def9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2019 20:21:08 +0100 Subject: x86/speculation/mds: Add BUG_MSBDS_ONLY This bug bit is set on CPUs which are only affected by Microarchitectural Store Buffer Data Sampling (MSBDS) and not by any other MDS variant. This is important because the Store Buffers are partitioned between Hyper-Threads so cross thread forwarding is not possible. But if a thread enters or exits a sleep state the store buffer is repartitioned which can expose data from one thread to the other. This transition can be mitigated. That means that SMT can be enabled on CPUs which are only affected by MSBDS, provided the CPU is not affected by other SMT-sensitive vulnerabilities, e.g. L1TF. The XEON PHI variants fall into that category. Also the Silvermont/Airmont ATOMs, but for them it's not really relevant as they do not support SMT; mark them anyway for the sake of completeness.
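As an illustration of how the new bit is meant to be consumed (a hedged sketch, not part of the actual patch; it only combines cpu-bug helpers which already exist in the tree):

	/* Sketch: SMT can stay on when MSBDS is the only MDS variant present */
	if (boot_cpu_has_bug(X86_BUG_MDS) &&
	    boot_cpu_has_bug(X86_BUG_MSBDS_ONLY) &&
	    !boot_cpu_has_bug(X86_BUG_L1TF))
		pr_info("MDS: only MSBDS affected, idle clearing covers SMT\n");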
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/common.c | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ae3f987b24f1..bdcea163850a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -383,5 +383,6 @@ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSBDS variant of BUG_MDS */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e34817bca504..132a63dc5a76 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -953,6 +953,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SSB BIT(2) #define NO_L1TF BIT(3) #define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -979,16 +980,16 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF), - VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF), + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY), VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF), VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF), @@ -1033,8 +1034,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) + if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); + if (cpu_matches(MSBDS_ONLY)) + setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); + } if (cpu_matches(NO_MELTDOWN)) return; -- cgit v1.2.3-59-g8ed1b From 6c4dbbd14730c43f4ed808a9c42ca41625925c22 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 18 Jan 2019 16:50:23 -0800 Subject: x86/kvm: Expose X86_FEATURE_MD_CLEAR to guests X86_FEATURE_MD_CLEAR is a new CPUID bit which is set when microcode provides the mechanism to flush various exploitable CPU buffers by invoking the VERW instruction. Hand it through to guests so they can adjust their mitigations. This also requires corresponding qemu changes, which are available separately.
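For reference, a guest can probe the bit itself via CPUID leaf 7, subleaf 0, EDX bit 10; a minimal sketch using the kernel's cpuid_count() helper (the bit position follows from the X86_FEATURE_MD_CLEAR definition (18*32+10) added earlier in this series):

	unsigned int eax, ebx, ecx, edx;
	bool md_clear;

	cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
	md_clear = edx & BIT(10);	/* MD_CLEAR: VERW clears CPU buffers */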
[ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c07958b59f50..39501e7afdb4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -410,7 +410,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP); + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.2.3-59-g8ed1b From 6a9e529272517755904b7afa639f6db59ddb793e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 23:13:06 +0100 Subject: x86/speculation/mds: Add mds_clear_cpu_buffers() The Microarchitectural Data Sampling (MDS) vulnerabilities are mitigated by clearing the affected CPU buffers. The mechanism for clearing the buffers uses the unused and obsolete VERW instruction in combination with a microcode update which triggers a CPU buffer clear when VERW is executed. Provide an inline function with the assembly magic. The argument of the VERW instruction must be a memory operand as documented: "MD_CLEAR enumerates that the memory-operand variant of VERW (for example, VERW m16) has been extended to also overwrite buffers affected by MDS. This buffer overwriting functionality is not guaranteed for the register operand variant of VERW." Documentation also recommends using a writable data segment selector: "The buffer overwriting occurs regardless of the result of the VERW permission check, as well as when the selector is null or causes a descriptor load segment violation. However, for lowest latency we recommend using a selector that indicates a valid writable data segment." Add x86 specific documentation about MDS and the internal workings of the mitigation. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/index.rst | 1 + Documentation/x86/conf.py | 10 ++++ Documentation/x86/index.rst | 8 +++ Documentation/x86/mds.rst | 99 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/nospec-branch.h | 25 +++++++++ 5 files changed, 143 insertions(+) create mode 100644 Documentation/x86/conf.py create mode 100644 Documentation/x86/index.rst create mode 100644 Documentation/x86/mds.rst (limited to 'arch') diff --git a/Documentation/index.rst b/Documentation/index.rst index c858c2e66e36..63864826dcd6 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -101,6 +101,7 @@ implementation.
:maxdepth: 2 sh/index + x86/index Filesystem Documentation ------------------------ diff --git a/Documentation/x86/conf.py b/Documentation/x86/conf.py new file mode 100644 index 000000000000..33c5c3142e20 --- /dev/null +++ b/Documentation/x86/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = "X86 architecture specific documentation" + +tags.add("subproject") + +latex_documents = [ + ('index', 'x86.tex', project, + 'The kernel development community', 'manual'), +] diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst new file mode 100644 index 000000000000..ef389dcf1b1d --- /dev/null +++ b/Documentation/x86/index.rst @@ -0,0 +1,8 @@ +========================== +x86 architecture specifics +========================== + +.. toctree:: + :maxdepth: 1 + + mds diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst new file mode 100644 index 000000000000..1096738d50f2 --- /dev/null +++ b/Documentation/x86/mds.rst @@ -0,0 +1,99 @@ +Microarchitectural Data Sampling (MDS) mitigation +================================================= + +.. _mds: + +Overview +-------- + +Microarchitectural Data Sampling (MDS) is a family of side channel attacks +on internal buffers in Intel CPUs. The variants are: + + - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126) + - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130) + - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127) + +MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a +dependent load (store-to-load forwarding) as an optimization. The forward +can also happen to a faulting or assisting load operation for a different +memory address, which can be exploited under certain conditions. Store +buffers are partitioned between Hyper-Threads so cross thread forwarding is +not possible. But if a thread enters or exits a sleep state the store +buffer is repartitioned which can expose data from one thread to the other. + +MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage +L1 miss situations and to hold data which is returned or sent in response +to a memory or I/O operation. Fill buffers can forward data to a load +operation and also write data to the cache. When the fill buffer is +deallocated it can retain the stale data of the preceding operations which +can then be forwarded to a faulting or assisting load operation, which can +be exploited under certain conditions. Fill buffers are shared between +Hyper-Threads so cross thread leakage is possible. + +MLPDS leaks Load Port Data. Load ports are used to perform load operations +from memory or I/O. The received data is then forwarded to the register +file or a subsequent operation. In some implementations the Load Port can +contain stale data from a previous operation which can be forwarded to +faulting or assisting loads under certain conditions, which again can be +exploited eventually. Load ports are shared between Hyper-Threads so cross +thread leakage is possible. + + +Exposure assumptions +-------------------- + +It is assumed that attack code resides in user space or in a guest with one +exception. The rationale behind this assumption is that the code construct +needed for exploiting MDS requires: + + - to control the load to trigger a fault or assist + + - to have a disclosure gadget which exposes the speculatively accessed + data for consumption through a side channel. 
+ + - to control the pointer through which the disclosure gadget exposes the + data + +The existence of such a construct in the kernel cannot be excluded with +100% certainty, but the complexity involved makes it extremely unlikely. + +There is one exception, which is untrusted BPF. The functionality of +untrusted BPF is limited, but it needs to be thoroughly investigated +whether it can be used to create such a construct. + + +Mitigation strategy +------------------- + +All variants have the same mitigation strategy at least for the single CPU +thread case (SMT off): Force the CPU to clear the affected buffers. + +This is achieved by using the otherwise unused and obsolete VERW +instruction in combination with a microcode update. The microcode clears +the affected CPU buffers when the VERW instruction is executed. + +For virtualization there are two ways to achieve CPU buffer +clearing. Either the modified VERW instruction or via the L1D Flush +command. The latter is issued when L1TF mitigation is enabled so the extra +VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to +be issued. + +If the VERW instruction with the supplied segment selector argument is +executed on a CPU without the microcode update there is no side effect +other than a small number of pointlessly wasted CPU cycles. + +This does not protect against cross Hyper-Thread attacks except for MSBDS +which is only exploitable cross Hyper-thread when one of the Hyper-Threads +enters a C-state. + +The kernel provides a function to invoke the buffer clearing: + + mds_clear_cpu_buffers() + +The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state +(idle) transitions. + +According to current knowledge additional mitigations inside the kernel +itself are not required because the necessary gadgets to expose the leaked +data cannot be controlled in a way which allows exploitation from malicious +user space or VM guests. diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index dad12b767ba0..67cb9b2082b1 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -318,6 +318,31 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +#include <asm/segment.h> + +/** + * mds_clear_cpu_buffers - Mitigation for MDS vulnerability + * + * This uses the otherwise unused and obsolete VERW instruction in + * combination with microcode which triggers a CPU buffer flush when the + * instruction is executed. + */ +static inline void mds_clear_cpu_buffers(void) +{ + static const u16 ds = __KERNEL_DS; + + /* + * Has to be the memory-operand variant because only that + * guarantees the CPU buffer flush functionality according to + * documentation. The register-operand variant does not. + * Works with any segment selector, but a valid writable + * data segment is the fastest variant. + * + * "cc" clobber is required because VERW modifies ZF. + */ + asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); +} + #endif /* __ASSEMBLY__ */ /* -- cgit v1.2.3-59-g8ed1b From 04dcbdb8057827b043b3c71aa397c4c63e67d086 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 23:42:51 +0100 Subject: x86/speculation/mds: Clear CPU buffers on exit to user Add a static key which controls the invocation of the CPU buffer clear mechanism on exit to user space and add the call into prepare_exit_to_usermode() and do_nmi() right before actually returning.
Add documentation describing which kernel-to-user-space transitions this covers and explain why some corner cases are not mitigated. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/x86/mds.rst | 52 ++++++++++++++++++++++++++++++++++++ arch/x86/entry/common.c | 3 +++ arch/x86/include/asm/nospec-branch.h | 13 +++++++ arch/x86/kernel/cpu/bugs.c | 3 +++ arch/x86/kernel/nmi.c | 4 +++ arch/x86/kernel/traps.c | 8 ++++++ 6 files changed, 83 insertions(+) (limited to 'arch') diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 1096738d50f2..54d935bf283b 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -97,3 +97,55 @@ According to current knowledge additional mitigations inside the kernel itself are not required because the necessary gadgets to expose the leaked data cannot be controlled in a way which allows exploitation from malicious user space or VM guests. + +Mitigation points +----------------- + +1. Return to user space +^^^^^^^^^^^^^^^^^^^^^^^ + + When transitioning from kernel to user space the CPU buffers are flushed + on affected CPUs when the mitigation is not disabled on the kernel + command line. The mitigation is enabled through the static key + mds_user_clear. + + The mitigation is invoked in prepare_exit_to_usermode() which covers + most of the kernel to user space transitions. There are a few exceptions + which are not invoking prepare_exit_to_usermode() on return to user + space. These exceptions use the paranoid exit code. + + - Non Maskable Interrupt (NMI): + + Access to sensitive data like keys or credentials in the NMI context is + mostly theoretical: The CPU can do prefetching or execute a + misspeculated code path and thereby fetch data which might end up + leaking through a buffer. + + But for mounting other attacks the kernel stack address of the task is + already valuable information. So in full mitigation mode, the NMI is + mitigated on the return from do_nmi() to provide almost complete + coverage. + + - Double fault (#DF): + + A double fault is usually fatal, but the ESPFIX workaround, which can + be triggered from user space through modify_ldt(2), is a recoverable + double fault. #DF uses the paranoid exit path, so explicit mitigation + in the double fault handler is required. + + - Machine Check Exception (#MC): + + Another corner case is a #MC which hits between the CPU buffer clear + invocation and the actual return to user. As this still is in kernel + space it takes the paranoid exit path which does not clear the CPU + buffers. So the #MC handler repopulates the buffers to some + extent. Machine checks are not reliably controllable and the window is + extremely small so mitigation would just tick a checkbox that this + theoretical corner case is covered. To keep the amount of special + cases small, ignore #MC. + + - Debug Exception (#DB): + + This takes the paranoid exit path only when the INT1 breakpoint is in + kernel space. #DB on a user space address takes the regular exit path, + so no extra mitigation required.
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7bc105f47d21..19f650d729f5 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -31,6 +31,7 @@ #include #include #include +#include <asm/nospec-branch.h> #define CREATE_TRACE_POINTS #include @@ -212,6 +213,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) #endif user_enter_irqoff(); + + mds_user_clear_cpu_buffers(); } #define SYSCALL_EXIT_WORK_FLAGS \ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 67cb9b2082b1..65b747286d96 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -318,6 +318,8 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +DECLARE_STATIC_KEY_FALSE(mds_user_clear); + #include <asm/segment.h> @@ -343,6 +345,17 @@ static inline void mds_clear_cpu_buffers(void) asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); } +/** + * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability + * + * Clear CPU buffers if the corresponding static key is enabled + */ +static inline void mds_user_clear_cpu_buffers(void) +{ + if (static_branch_likely(&mds_user_clear)) + mds_clear_cpu_buffers(); +} + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 01874d54f4fd..dbb45014de1b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -63,6 +63,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +/* Control MDS CPU buffer clear before returning to user space */ +DEFINE_STATIC_KEY_FALSE(mds_user_clear); + void __init check_bugs(void) { identify_boot_cpu(); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 18bc9b51ac9b..086cf1d1d71d 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -34,6 +34,7 @@ #include #include #include +#include <asm/nospec-branch.h> #define CREATE_TRACE_POINTS #include @@ -533,6 +534,9 @@ nmi_restart: write_cr2(this_cpu_read(nmi_cr2)); if (this_cpu_dec_return(nmi_state)) goto nmi_restart; + + if (user_mode(regs)) + mds_user_clear_cpu_buffers(); } NOKPROBE_SYMBOL(do_nmi); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9b7c4ca8f0a7..85fe1870f873 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,6 +58,7 @@ #include #include #include +#include <asm/nospec-branch.h> #include #include #include @@ -366,6 +367,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; + /* + * This situation can be triggered by userspace via + * modify_ldt(2) and the return does not take the regular + * user space exit, so a CPU buffer clear is required when + * MDS mitigation is enabled. + */ + mds_user_clear_cpu_buffers(); return; } #endif -- cgit v1.2.3-59-g8ed1b From 650b68a0622f933444a6d66936abb3103029413b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Feb 2019 12:48:14 +0100 Subject: x86/kvm/vmx: Add MDS protection when L1D Flush is not active CPUs which are affected by L1TF and MDS mitigate MDS with the L1D Flush on VMENTER when updated microcode is installed. If a CPU is not affected by L1TF or if the L1D Flush is not in use, then MDS mitigation needs to be invoked explicitly. For these cases, follow the host mitigation state and invoke the MDS mitigation before VMENTER.
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- arch/x86/kernel/cpu/bugs.c | 1 + arch/x86/kvm/vmx/vmx.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index dbb45014de1b..29ed8e8dfee2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -65,6 +65,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); /* Control MDS CPU buffer clear before returning to user space */ DEFINE_STATIC_KEY_FALSE(mds_user_clear); +EXPORT_SYMBOL_GPL(mds_user_clear); void __init check_bugs(void) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 30a6bcd735ec..544bd24a9c1e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6369,8 +6369,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? (unsigned long)&current_evmcs->host_rsp : 0; + /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); asm( /* Store host registers */ -- cgit v1.2.3-59-g8ed1b From 07f07f55a29cb705e221eda7894dd67ab81ef343 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 23:04:01 +0100 Subject: x86/speculation/mds: Conditionally clear CPU buffers on idle entry Add a static key which controls the invocation of the CPU buffer clear mechanism on idle entry. This is independent of other MDS mitigations because the idle entry invocation to mitigate the potential leakage due to store buffer repartitioning is only necessary on SMT systems. Add the actual invocations to the different halt/mwait variants which covers all usage sites. mwaitx is not patched as it's not available on Intel CPUs. The buffer clear is only invoked before entering the C-State to prevent stale data from the idling CPU from being spilled to the Hyper-Thread sibling after the store buffer got repartitioned and all entries are available to the non idle sibling. When coming out of idle the store buffer is partitioned again so each sibling has half of it available. Now the CPU which returned from idle could be speculatively exposed to the contents of the sibling, but the buffers are flushed either on exit to user space or on VMENTER. When later on conditional buffer clearing is implemented on top of this, then there is no action required either because before returning to user space the context switch will set the condition flag which causes a flush on the return to user path. Note, that the buffer clearing on idle is only sensible on CPUs which are solely affected by MSBDS and not any other variant of MDS because the other MDS variants cannot be mitigated when SMT is enabled, so the buffer clearing on idle would be a window dressing exercise. This intentionally does not handle the case in the acpi/processor_idle driver which uses the legacy IO port interface for C-State transitions for two reasons: - The acpi/processor_idle driver was replaced by the intel_idle driver almost a decade ago. Anything Nehalem upwards supports it and defaults to that new driver. - The legacy IO port interface is likely to be used on older and therefore unaffected CPUs or on systems which do not receive microcode updates anymore, so there is no point in adding that.
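Condensed, each patched idle entry point ends up with the same shape; a hedged sketch (idle_entry_example() is a made-up name for illustration, the real hunks below touch native_safe_halt(), native_halt(), __mwait() and __sti_mwait()):

	static inline void idle_entry_example(void)
	{
		/* NOP via static key unless the idle clearing is enabled */
		mds_idle_clear_cpu_buffers();
		asm volatile("sti; hlt" : : : "memory");
	}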
Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Reviewed-by: Frederic Weisbecker Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/x86/mds.rst | 42 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/irqflags.h | 4 ++++ arch/x86/include/asm/mwait.h | 7 ++++++ arch/x86/include/asm/nospec-branch.h | 12 +++++++++++ arch/x86/kernel/cpu/bugs.c | 3 +++ 5 files changed, 68 insertions(+) (limited to 'arch') diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 54d935bf283b..87ce8ac9f36e 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -149,3 +149,45 @@ Mitigation points This takes the paranoid exit path only when the INT1 breakpoint is in kernel space. #DB on a user space address takes the regular exit path, so no extra mitigation required. + + +2. C-State transition +^^^^^^^^^^^^^^^^^^^^^ + + When a CPU goes idle and enters a C-State the CPU buffers need to be + cleared on affected CPUs when SMT is active. This addresses the + repartitioning of the store buffer when one of the Hyper-Threads enters + a C-State. + + When SMT is inactive, i.e. either the CPU does not support it or all + sibling threads are offline, CPU buffer clearing is not required. + + The idle clearing is enabled on CPUs which are only affected by MSBDS + and not by any other MDS variant. The other MDS variants cannot be + protected against cross Hyper-Thread attacks because the Fill Buffer and + the Load Ports are shared. So on CPUs affected by other variants, the + idle clearing would be a window dressing exercise and is therefore not + activated. + + The invocation is controlled by the static key mds_idle_clear which is + switched depending on the chosen mitigation mode and the SMT state of + the system. + + The buffer clear is only invoked before entering the C-State to prevent + stale data from the idling CPU from spilling to the Hyper-Thread + sibling after the store buffer got repartitioned and all entries are + available to the non idle sibling. + + When coming out of idle the store buffer is partitioned again so each + sibling has half of it available. The CPU coming back from idle could + then be speculatively exposed to the contents of the sibling. The buffers + are flushed either on exit to user space or on VMENTER so malicious code + in user space or the guest cannot speculatively access them. + + The mitigation is hooked into all variants of halt()/mwait(), but does + not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver + has been superseded by the intel_idle driver around 2010 and is + preferred on all affected CPUs which are expected to gain the MD_CLEAR + functionality in microcode. Aside from that the IO-Port mechanism is a + legacy interface which is only used on older systems which are either + not affected or do not receive microcode updates anymore.
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 058e40fed167..8a0e56e1dcc9 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -6,6 +6,8 @@ #ifndef __ASSEMBLY__ +#include <asm/nospec-branch.h> + /* Provide __cpuidle; we can't safely include <linux/cpu.h> */ #define __cpuidle __attribute__((__section__(".cpuidle.text"))) @@ -54,11 +56,13 @@ static inline void native_irq_enable(void) static inline __cpuidle void native_safe_halt(void) { + mds_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } static inline __cpuidle void native_halt(void) { + mds_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 39a2fb29378a..eb0f80ce8524 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -6,6 +6,7 @@ #include #include +#include <asm/nospec-branch.h> #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf @@ -40,6 +41,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx, static inline void __mwait(unsigned long eax, unsigned long ecx) { + mds_idle_clear_cpu_buffers(); + /* "mwait %eax, %ecx;" */ asm volatile(".byte 0x0f, 0x01, 0xc9;" :: "a" (eax), "c" (ecx)); @@ -74,6 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void __mwaitx(unsigned long eax, unsigned long ebx, unsigned long ecx) { + /* No MDS buffer clear as this is AMD/HYGON only */ + /* "mwaitx %eax, %ebx, %ecx;" */ asm volatile(".byte 0x0f, 0x01, 0xfb;" :: "a" (eax), "b" (ebx), "c" (ecx)); @@ -81,6 +86,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { + mds_idle_clear_cpu_buffers(); + trace_hardirqs_on(); /* "mwait %eax, %ecx;" */ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 65b747286d96..4e970390110f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -319,6 +319,7 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(mds_user_clear); +DECLARE_STATIC_KEY_FALSE(mds_idle_clear); #include <asm/segment.h> @@ -356,6 +357,17 @@ static inline void mds_user_clear_cpu_buffers(void) mds_clear_cpu_buffers(); } +/** + * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability + * + * Clear CPU buffers if the corresponding static key is enabled + */ +static inline void mds_idle_clear_cpu_buffers(void) +{ + if (static_branch_likely(&mds_idle_clear)) + mds_clear_cpu_buffers(); +} + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 29ed8e8dfee2..916995167301 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -66,6 +66,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); /* Control MDS CPU buffer clear before returning to user space */ DEFINE_STATIC_KEY_FALSE(mds_user_clear); EXPORT_SYMBOL_GPL(mds_user_clear); +/* Control MDS CPU buffer clear before idling (halt, mwait) */ +DEFINE_STATIC_KEY_FALSE(mds_idle_clear); +EXPORT_SYMBOL_GPL(mds_idle_clear); void __init check_bugs(void) { -- cgit v1.2.3-59-g8ed1b From bc1241700acd82ec69fde98c5763ce51086269f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 22:04:08 +0100 Subject: x86/speculation/mds: Add mitigation control for MDS Now that the mitigations are in place, add a command line parameter to control the mitigation, a
mitigation selector function and an SMT update mechanism. This is the minimal, straightforward initial implementation which just provides an always on/off mode. The command line parameter is: mds=[full|off] This is consistent with the existing mitigations for other speculative hardware vulnerabilities. The idle invocation is dynamically updated according to the SMT state of the system similar to the dynamic update of the STIBP mitigation. The idle mitigation is limited to CPUs which are only affected by MSBDS and not any other variant, because the other variants cannot be mitigated on SMT enabled systems. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/admin-guide/kernel-parameters.txt | 22 ++++++++ arch/x86/include/asm/processor.h | 5 ++ arch/x86/kernel/cpu/bugs.c | 70 +++++++++++++++++++++++++ 3 files changed, 97 insertions(+) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 858b6c0b9a15..dddb024eb523 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2356,6 +2356,28 @@ Format: <first>,<last> Specifies range of consoles to be captured by the MDA. + mds= [X86,INTEL] + Control mitigation for the Micro-architectural Data + Sampling (MDS) vulnerability. + + Certain CPUs are vulnerable to an exploit against CPU + internal buffers which can forward information to a + disclosure gadget under certain conditions. + + In vulnerable processors, the speculatively + forwarded data can be used in a cache side channel + attack, to access data to which the attacker does + not have direct access. + + This parameter controls the MDS mitigation. The + options are: + + full - Enable MDS mitigation on vulnerable CPUs + off - Unconditionally disable MDS mitigation + + Not specifying this option is equivalent to + mds=full. + mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory Amount of memory to be used when the kernel is not able to see the whole system memory or for test. diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 33051436c864..1f0295783325 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -992,4 +992,9 @@ enum l1tf_mitigations { extern enum l1tf_mitigations l1tf_mitigation; +enum mds_mitigations { + MDS_MITIGATION_OFF, + MDS_MITIGATION_FULL, +}; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 916995167301..c7b29d200d27 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -37,6 +37,7 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); +static void __init mds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -108,6 +109,8 @@ void __init check_bugs(void) l1tf_select_mitigation(); + mds_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP.
@@ -213,6 +216,50 @@ static void x86_amd_ssb_disable(void) wrmsrl(MSR_AMD64_LS_CFG, msrval); } +#undef pr_fmt +#define pr_fmt(fmt) "MDS: " fmt + +/* Default mitigation for MDS-affected CPUs */ +static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; + +static const char * const mds_strings[] = { + [MDS_MITIGATION_OFF] = "Vulnerable", + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers" +}; + +static void __init mds_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_OFF; + return; + } + + if (mds_mitigation == MDS_MITIGATION_FULL) { + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + static_branch_enable(&mds_user_clear); + else + mds_mitigation = MDS_MITIGATION_OFF; + } + pr_info("%s\n", mds_strings[mds_mitigation]); +} + +static int __init mds_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + mds_mitigation = MDS_MITIGATION_OFF; + else if (!strcmp(str, "full")) + mds_mitigation = MDS_MITIGATION_FULL; + + return 0; +} +early_param("mds", mds_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt @@ -617,6 +664,26 @@ static void update_indir_branch_cond(void) static_branch_disable(&switch_to_cond_stibp); } +/* Update the static key controlling the MDS CPU buffer clear in idle */ +static void update_mds_branch_idle(void) +{ + /* + * Enable the idle clearing if SMT is active on CPUs which are + * affected only by MSBDS and not any other MDS variant. + * + * The other variants cannot be mitigated when SMT is enabled, so + * clearing the buffers on idle just to prevent the Store Buffer + * repartitioning leak would be a window dressing exercise. + */ + if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) + return; + + if (sched_smt_active()) + static_branch_enable(&mds_idle_clear); + else + static_branch_disable(&mds_idle_clear); +} + void arch_smt_update(void) { /* Enhanced IBRS implies STIBP. No update required. */ @@ -638,6 +705,9 @@ void arch_smt_update(void) break; } + if (mds_mitigation == MDS_MITIGATION_FULL) + update_mds_branch_idle(); + mutex_unlock(&spec_ctrl_mutex); } -- cgit v1.2.3-59-g8ed1b From 8a4b06d391b0a42a373808979b5028f5c84d9c6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2019 22:51:43 +0100 Subject: x86/speculation/mds: Add sysfs reporting for MDS Add the sysfs reporting file for MDS. It exposes the vulnerability and mitigation state similar to the existing files for the other speculative hardware vulnerabilities.
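From user space the new file reads like the other vulnerability files; a hedged C sketch (the path is taken from this patch, error handling kept minimal):

	#include <stdio.h>

	int main(void)
	{
		char buf[128];
		FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/mds", "r");

		if (f && fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);	/* e.g. "Mitigation: Clear CPU buffers; SMT mitigated" */
		if (f)
			fclose(f);
		return 0;
	}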
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/ABI/testing/sysfs-devices-system-cpu | 1 + arch/x86/kernel/cpu/bugs.c | 25 ++++++++++++++++++++++ drivers/base/cpu.c | 8 +++++++ include/linux/cpu.h | 2 ++ 4 files changed, 36 insertions(+) (limited to 'arch') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 9605dbd4b5b5..2db5c3407fd6 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -484,6 +484,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spectre_v2 /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf + /sys/devices/system/cpu/vulnerabilities/mds Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c7b29d200d27..7ab16a6ed064 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1172,6 +1172,22 @@ static ssize_t l1tf_show_state(char *buf) } #endif +static ssize_t mds_show_state(char *buf) +{ + if (!hypervisor_is_type(X86_HYPER_NATIVE)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + mds_strings[mds_mitigation]); + } + + if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "mitigated" : "disabled"); + } + + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1238,6 +1254,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) return l1tf_show_state(buf); break; + + case X86_BUG_MDS: + return mds_show_state(buf); + default: break; } @@ -1269,4 +1289,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b { return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); } + +ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); +} #endif diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index eb9443d5bae1..2fd6ca1021c2 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -546,11 +546,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_mds(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); +static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -558,6 +565,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_spectre_v2.attr, &dev_attr_spec_store_bypass.attr, &dev_attr_l1tf.attr, + &dev_attr_mds.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 5041357d0297..3c87ad888ed3 100644 --- 
a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -57,6 +57,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_mds(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3-59-g8ed1b From 22dd8365088b6403630b82423cf906491859b65e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Feb 2019 09:40:40 +0100 Subject: x86/speculation/mds: Add mitigation mode VMWERV In virtualized environments it can happen that the host has the microcode update which utilizes the VERW instruction to clear CPU buffers, but the hypervisor is not yet updated to expose the X86_FEATURE_MD_CLEAR CPUID bit to guests. Introduce an internal mitigation mode VMWERV which enables the invocation of the CPU buffer clearing even if X86_FEATURE_MD_CLEAR is not set. If the system has no updated microcode this results in a pointless execution of the VERW instruction wasting a few CPU cycles. If the microcode is updated, but not exposed to a guest then the CPU buffers will be cleared. That said: Virtual Machines Will Eventually Receive Vaccine Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Jon Masters Tested-by: Jon Masters --- Documentation/x86/mds.rst | 27 +++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/bugs.c | 18 ++++++++++++------ 3 files changed, 40 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 87ce8ac9f36e..3d6f943f1afb 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -93,11 +93,38 @@ The kernel provides a function to invoke the buffer clearing: The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state (idle) transitions. +As a special quirk to address virtualization scenarios where the host has +the microcode updated, but the hypervisor does not (yet) expose the +MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the +hope that it might actually clear the buffers. The state is reflected +accordingly. + According to current knowledge additional mitigations inside the kernel itself are not required because the necessary gadgets to expose the leaked data cannot be controlled in a way which allows exploitation from malicious user space or VM guests. +Kernel internal mitigation modes +-------------------------------- + + ======= ============================================================ + off Mitigation is disabled. Either the CPU is not affected or + mds=off is supplied on the kernel command line + + full Mitigation is enabled. CPU is affected and MD_CLEAR is + advertised in CPUID. + + vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not + advertised in CPUID. That is mainly for virtualization + scenarios where the host has the updated microcode but the + hypervisor does not expose MD_CLEAR in CPUID. It's a best + effort approach without guarantee. + ======= ============================================================ + +If the CPU is affected and mds=off is not supplied on the kernel command +line then the kernel selects the appropriate mitigation mode depending on +the availability of the MD_CLEAR CPUID bit.
+ Mitigation points ----------------- diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1f0295783325..aca1ef8cc79f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -995,6 +995,7 @@ extern enum l1tf_mitigations l1tf_mitigation; enum mds_mitigations { MDS_MITIGATION_OFF, MDS_MITIGATION_FULL, + MDS_MITIGATION_VMWERV, }; #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7ab16a6ed064..95cda38c8785 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -224,7 +224,8 @@ static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL static const char * const mds_strings[] = { [MDS_MITIGATION_OFF] = "Vulnerable", - [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers" + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", + [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", }; static void __init mds_select_mitigation(void) @@ -235,10 +236,9 @@ static void __init mds_select_mitigation(void) } if (mds_mitigation == MDS_MITIGATION_FULL) { - if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) - static_branch_enable(&mds_user_clear); - else - mds_mitigation = MDS_MITIGATION_OFF; + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + mds_mitigation = MDS_MITIGATION_VMWERV; + static_branch_enable(&mds_user_clear); } pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -705,8 +705,14 @@ void arch_smt_update(void) break; } - if (mds_mitigation == MDS_MITIGATION_FULL) + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_VMWERV: update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } mutex_unlock(&spec_ctrl_mutex); } -- cgit v1.2.3-59-g8ed1b From 65fd4cb65b2dad97feb8330b6690445910b56d6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Feb 2019 11:10:49 +0100 Subject: Documentation: Move L1TF to separate directory Move L1TF to a separate directory so the MDS stuff can be added at the side. Otherwise all hardware vulnerabilities would have their own top level entry. Should have done that right away.
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jon Masters --- Documentation/ABI/testing/sysfs-devices-system-cpu | 2 +- Documentation/admin-guide/hw-vuln/index.rst | 12 + Documentation/admin-guide/hw-vuln/l1tf.rst | 614 +++++++++++++++++++++ Documentation/admin-guide/index.rst | 6 +- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/l1tf.rst | 614 --------------------- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 +- 8 files changed, 633 insertions(+), 623 deletions(-) create mode 100644 Documentation/admin-guide/hw-vuln/index.rst create mode 100644 Documentation/admin-guide/hw-vuln/l1tf.rst delete mode 100644 Documentation/admin-guide/l1tf.rst (limited to 'arch') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 2db5c3407fd6..744c6d764b0c 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -498,7 +498,7 @@ Description: Information about CPU vulnerabilities "Mitigation: $M" CPU is affected and mitigation $M is in effect Details about the l1tf file can be found in - Documentation/admin-guide/l1tf.rst + Documentation/admin-guide/hw-vuln/l1tf.rst What: /sys/devices/system/cpu/smt /sys/devices/system/cpu/smt/active diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst new file mode 100644 index 000000000000..8ce2009f1981 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -0,0 +1,12 @@ +======================== +Hardware vulnerabilities +======================== + +This section describes CPU vulnerabilities and provides an overview of the +possible mitigations along with guidance for selecting mitigations if they +are configurable at compile, boot or run time. + +.. toctree:: + :maxdepth: 1 + + l1tf diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst new file mode 100644 index 000000000000..9af977384168 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/l1tf.rst @@ -0,0 +1,614 @@ +L1TF - L1 Terminal Fault +======================== + +L1 Terminal Fault is a hardware vulnerability which allows unprivileged +speculative access to data which is available in the Level 1 Data Cache +when the page table entry controlling the virtual address, which is used +for the access, has the Present bit cleared or other reserved bits set. + +Affected processors +------------------- + +This vulnerability affects a wide range of Intel processors. The +vulnerability is not present on: + + - Processors from AMD, Centaur and other non Intel vendors + + - Older processor models, where the CPU family is < 6 + + - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, + Penwell, Pineview, Silvermont, Airmont, Merrifield) + + - The Intel XEON PHI family + + - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the + IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected + by the Meltdown vulnerability either. These CPUs should become + available by end of 2018. + +Whether a processor is affected or not can be read out from the L1TF +vulnerability file in sysfs. See :ref:`l1tf_sys_info`. 
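For illustration only (not part of the patch), a minimal user-space sketch which reads the sysfs file named above; the file is absent on kernels predating the vulnerabilities interface:

#include <stdio.h>

int main(void)
{
	/* Example reader for the L1TF vulnerability file. */
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/l1tf", "r");
	char buf[256];

	if (!f) {
		perror("l1tf");		/* older kernel or sysfs not mounted */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);	/* e.g. "Not affected" or "Mitigation: PTE Inversion; ..." */
	fclose(f);
	return 0;
}

The same pattern applies to the other files in the vulnerabilities directory, such as mds once the sysfs reporting above is in place.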
+ +Related CVEs +------------ + +The following CVE entries are related to the L1TF vulnerability: + + ============= ================= ============================== + CVE-2018-3615 L1 Terminal Fault SGX related aspects + CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects + CVE-2018-3646 L1 Terminal Fault Virtualization related aspects + ============= ================= ============================== + +Problem +------- + +If an instruction accesses a virtual address for which the relevant page +table entry (PTE) has the Present bit cleared or other reserved bits set, +then speculative execution ignores the invalid PTE and loads the referenced +data if it is present in the Level 1 Data Cache, as if the page referenced +by the address bits in the PTE was still present and accessible. + +While this is a purely speculative mechanism and the instruction will raise +a page fault when it is retired eventually, the pure act of loading the +data and making it available to other speculative instructions opens up the +opportunity for side channel attacks to unprivileged malicious code, +similar to the Meltdown attack. + +While Meltdown breaks the user space to kernel space protection, L1TF +allows to attack any physical memory address in the system and the attack +works across all protection domains. It allows an attack of SGX and also +works from inside virtual machines because the speculation bypasses the +extended page table (EPT) protection mechanism. + + +Attack scenarios +---------------- + +1. Malicious user space +^^^^^^^^^^^^^^^^^^^^^^^ + + Operating Systems store arbitrary information in the address bits of a + PTE which is marked non present. This allows a malicious user space + application to attack the physical memory to which these PTEs resolve. + In some cases user-space can maliciously influence the information + encoded in the address bits of the PTE, thus making attacks more + deterministic and more practical. + + The Linux kernel contains a mitigation for this attack vector, PTE + inversion, which is permanently enabled and has no performance + impact. The kernel ensures that the address bits of PTEs, which are not + marked present, never point to cacheable physical memory space. + + A system with an up to date kernel is protected against attacks from + malicious user space applications. + +2. Malicious guest in a virtual machine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + The fact that L1TF breaks all domain protections allows malicious guest + OSes, which can control the PTEs directly, and malicious guest user + space applications, which run on an unprotected guest kernel lacking the + PTE inversion mitigation for L1TF, to attack physical host memory. + + A special aspect of L1TF in the context of virtualization is symmetric + multi threading (SMT). The Intel implementation of SMT is called + HyperThreading. The fact that Hyperthreads on the affected processors + share the L1 Data Cache (L1D) is important for this. As the flaw allows + only to attack data which is present in L1D, a malicious guest running + on one Hyperthread can attack the data which is brought into the L1D by + the context which runs on the sibling Hyperthread of the same physical + core. This context can be host OS, host user space or a different guest. + + If the processor does not support Extended Page Tables, the attack is + only possible, when the hypervisor does not sanitize the content of the + effective (shadow) page tables. 
+ + While solutions exist to mitigate these attack vectors fully, these + mitigations are not enabled by default in the Linux kernel because they + can affect performance significantly. The kernel provides several + mechanisms which can be utilized to address the problem depending on the + deployment scenario. The mitigations, their protection scope and impact + are described in the next sections. + + The default mitigations and the rationale for choosing them are explained + at the end of this document. See :ref:`default_mitigations`. + +.. _l1tf_sys_info: + +L1TF system information +----------------------- + +The Linux kernel provides a sysfs interface to enumerate the current L1TF +status of the system: whether the system is vulnerable, and which +mitigations are active. The relevant sysfs file is: + +/sys/devices/system/cpu/vulnerabilities/l1tf + +The possible values in this file are: + + =========================== =============================== + 'Not affected' The processor is not vulnerable + 'Mitigation: PTE Inversion' The host protection is active + =========================== =============================== + +If KVM/VMX is enabled and the processor is vulnerable then the following +information is appended to the 'Mitigation: PTE Inversion' part: + + - SMT status: + + ===================== ================ + 'VMX: SMT vulnerable' SMT is enabled + 'VMX: SMT disabled' SMT is disabled + ===================== ================ + + - L1D Flush mode: + + ================================ ==================================== + 'L1D vulnerable' L1D flushing is disabled + + 'L1D conditional cache flushes' L1D flush is conditionally enabled + + 'L1D cache flushes' L1D flush is unconditionally enabled + ================================ ==================================== + +The resulting grade of protection is discussed in the following sections. + + +Host mitigation mechanism +------------------------- + +The kernel is unconditionally protected against L1TF attacks from malicious +user space running on the host. + + +Guest mitigation mechanisms +--------------------------- + +.. _l1d_flush: + +1. L1D flush on VMENTER +^^^^^^^^^^^^^^^^^^^^^^^ + + To make sure that a guest cannot attack data which is present in the L1D + the hypervisor flushes the L1D before entering the guest. + + Flushing the L1D evicts not only the data which should not be accessed + by a potentially malicious guest, it also flushes the guest + data. Flushing the L1D has a performance impact as the processor has to + bring the flushed guest data back into the L1D. Depending on the + frequency of VMEXIT/VMENTER and the type of computations in the guest + performance degradation in the range of 1% to 50% has been observed. For + scenarios where guest VMEXIT/VMENTER are rare the performance impact is + minimal. Virtio and mechanisms like posted interrupts are designed to + confine the VMEXITs to a bare minimum, but specific configurations and + application scenarios might still suffer from a high VMEXIT rate. + + The kernel provides two L1D flush modes: + - conditional ('cond') + - unconditional ('always') + + The conditional mode avoids L1D flushing after VMEXITs which execute + only audited code paths before the corresponding VMENTER. These code + paths have been verified that they cannot expose secrets or other + interesting data to an attacker, but they can leak information about the + address space layout of the hypervisor. + + Unconditional mode flushes L1D on all VMENTER invocations and provides + maximum protection. 
It has a higher overhead than the conditional + mode. The overhead cannot be quantified correctly as it depends on the + workload scenario and the resulting number of VMEXITs. + + The general recommendation is to enable L1D flush on VMENTER. The kernel + defaults to conditional mode on affected processors. + + **Note**, that L1D flush does not prevent the SMT problem because the + sibling thread will also bring back its data into the L1D which makes it + attackable again. + + L1D flush can be controlled by the administrator via the kernel command + line and sysfs control files. See :ref:`mitigation_control_command_line` + and :ref:`mitigation_control_kvm`. + +.. _guest_confinement: + +2. Guest VCPU confinement to dedicated physical cores +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + To address the SMT problem, it is possible to make a guest or a group of + guests affine to one or more physical cores. The proper mechanism for + that is to utilize exclusive cpusets to ensure that no other guest or + host tasks can run on these cores. + + If only a single guest or related guests run on sibling SMT threads on + the same physical core then they can only attack their own memory and + restricted parts of the host memory. + + Host memory is attackable, when one of the sibling SMT threads runs in + host OS (hypervisor) context and the other in guest context. The amount + of valuable information from the host OS context depends on the context + which the host OS executes, i.e. interrupts, soft interrupts and kernel + threads. The amount of valuable data from these contexts cannot be + declared as non-interesting for an attacker without deep inspection of + the code. + + **Note**, that assigning guests to a fixed set of physical cores affects + the ability of the scheduler to do load balancing and might have + negative effects on CPU utilization depending on the hosting + scenario. Disabling SMT might be a viable alternative for particular + scenarios. + + For further information about confining guests to a single or to a group + of cores consult the cpusets documentation: + + https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt + +.. _interrupt_isolation: + +3. Interrupt affinity +^^^^^^^^^^^^^^^^^^^^^ + + Interrupts can be made affine to logical CPUs. This is not universally + true because there are types of interrupts which are truly per CPU + interrupts, e.g. the local timer interrupt. Aside of that multi queue + devices affine their interrupts to single CPUs or groups of CPUs per + queue without allowing the administrator to control the affinities. + + Moving the interrupts, which can be affinity controlled, away from CPUs + which run untrusted guests, reduces the attack vector space. + + Whether the interrupts which are affine to CPUs, which run untrusted + guests, provide interesting data for an attacker depends on the system + configuration and the scenarios which run on the system. While for some + of the interrupts it can be assumed that they won't expose interesting + information beyond exposing hints about the host OS memory layout, there + is no way to make general assumptions. + + Interrupt affinity can be controlled by the administrator via the + /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is + available at: + + https://www.kernel.org/doc/Documentation/IRQ-affinity.txt + +.. _smt_control: + +4. SMT control +^^^^^^^^^^^^^^ + + To prevent the SMT issues of L1TF it might be necessary to disable SMT + completely.
Disabling SMT can have a significant performance impact, but + the impact depends on the hosting scenario and the type of workloads. + The impact of disabling SMT needs also to be weighted against the impact + of other mitigation solutions like confining guests to dedicated cores. + + The kernel provides a sysfs interface to retrieve the status of SMT and + to control it. It also provides a kernel command line interface to + control SMT. + + The kernel command line interface consists of the following options: + + =========== ========================================================== + nosmt Affects the bring up of the secondary CPUs during boot. The + kernel tries to bring all present CPUs online during the + boot process. "nosmt" makes sure that from each physical + core only one - the so called primary (hyper) thread is + activated. Due to a design flaw of Intel processors related + to Machine Check Exceptions the non primary siblings have + to be brought up at least partially and are then shut down + again. "nosmt" can be undone via the sysfs interface. + + nosmt=force Has the same effect as "nosmt" but it does not allow to + undo the SMT disable via the sysfs interface. + =========== ========================================================== + + The sysfs interface provides two files: + + - /sys/devices/system/cpu/smt/control + - /sys/devices/system/cpu/smt/active + + /sys/devices/system/cpu/smt/control: + + This file allows to read out the SMT control state and provides the + ability to disable or (re)enable SMT. The possible states are: + + ============== =================================================== + on SMT is supported by the CPU and enabled. All + logical CPUs can be onlined and offlined without + restrictions. + + off SMT is supported by the CPU and disabled. Only + the so called primary SMT threads can be onlined + and offlined without restrictions. An attempt to + online a non-primary sibling is rejected + + forceoff Same as 'off' but the state cannot be controlled. + Attempts to write to the control file are rejected. + + notsupported The processor does not support SMT. It's therefore + not affected by the SMT implications of L1TF. + Attempts to write to the control file are rejected. + ============== =================================================== + + The possible states which can be written into this file to control SMT + state are: + + - on + - off + - forceoff + + /sys/devices/system/cpu/smt/active: + + This file reports whether SMT is enabled and active, i.e. if on any + physical core two or more sibling threads are online. + + SMT control is also possible at boot time via the l1tf kernel command + line parameter in combination with L1D flush control. See + :ref:`mitigation_control_command_line`. + +5. Disabling EPT +^^^^^^^^^^^^^^^^ + + Disabling EPT for virtual machines provides full mitigation for L1TF even + with SMT enabled, because the effective page tables for guests are + managed and sanitized by the hypervisor. Though disabling EPT has a + significant performance impact especially when the Meltdown mitigation + KPTI is enabled. + + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. + +There is ongoing research and development for new mitigation mechanisms to +address the performance impact of disabling SMT or EPT. + +.. 
_mitigation_control_command_line: + +Mitigation control on the kernel command line +--------------------------------------------- + +The kernel command line allows to control the L1TF mitigations at boot +time with the option "l1tf=". The valid arguments for this option are: + + ============ ============================================================= + full Provides all available mitigations for the L1TF + vulnerability. Disables SMT and enables all mitigations in + the hypervisors, i.e. unconditional L1D flushing + + SMT control and L1D flush control via the sysfs interface + is still possible after boot. Hypervisors will issue a + warning when the first VM is started in a potentially + insecure configuration, i.e. SMT enabled or L1D flush + disabled. + + full,force Same as 'full', but disables SMT and L1D flush runtime + control. Implies the 'nosmt=force' command line option. + (i.e. sysfs control of SMT is disabled.) + + flush Leaves SMT enabled and enables the default hypervisor + mitigation, i.e. conditional L1D flushing + + SMT control and L1D flush control via the sysfs interface + is still possible after boot. Hypervisors will issue a + warning when the first VM is started in a potentially + insecure configuration, i.e. SMT enabled or L1D flush + disabled. + + flush,nosmt Disables SMT and enables the default hypervisor mitigation, + i.e. conditional L1D flushing. + + SMT control and L1D flush control via the sysfs interface + is still possible after boot. Hypervisors will issue a + warning when the first VM is started in a potentially + insecure configuration, i.e. SMT enabled or L1D flush + disabled. + + flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is + started in a potentially insecure configuration. + + off Disables hypervisor mitigations and doesn't emit any + warnings. + It also drops the swap size and available RAM limit restrictions + on both hypervisor and bare metal. + + ============ ============================================================= + +The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. + + +.. _mitigation_control_kvm: + +Mitigation control for KVM - module parameter +------------------------------------------------------------- + +The KVM hypervisor mitigation mechanism, flushing the L1D cache when +entering a guest, can be controlled with a module parameter. + +The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the +following arguments: + + ============ ============================================================== + always L1D cache flush on every VMENTER. + + cond Flush L1D on VMENTER only when the code between VMEXIT and + VMENTER can leak host memory which is considered + interesting for an attacker. This still can leak host memory + which allows e.g. to determine the hosts address space layout. + + never Disables the mitigation + ============ ============================================================== + +The parameter can be provided on the kernel command line, as a module +parameter when loading the modules and at runtime modified via the sysfs +file: + +/sys/module/kvm_intel/parameters/vmentry_l1d_flush + +The default is 'cond'. If 'l1tf=full,force' is given on the kernel command +line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush +module parameter is ignored and writes to the sysfs file are rejected. + + +Mitigation selection guide +-------------------------- + +1. 
No virtualization in use +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + The system is protected by the kernel unconditionally and no further + action is required. + +2. Virtualization with trusted guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + If the guest comes from a trusted source and the guest OS kernel is + guaranteed to have the L1TF mitigations in place the system is fully + protected against L1TF and no further action is required. + + To avoid the overhead of the default L1D flushing on VMENTER the + administrator can disable the flushing via the kernel command line and + sysfs control files. See :ref:`mitigation_control_command_line` and + :ref:`mitigation_control_kvm`. + + +3. Virtualization with untrusted guests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +3.1. SMT not supported or disabled +"""""""""""""""""""""""""""""""""" + + If SMT is not supported by the processor or disabled in the BIOS or by + the kernel, it's only required to enforce L1D flushing on VMENTER. + + Conditional L1D flushing is the default behaviour and can be tuned. See + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. + +3.2. EPT not supported or disabled +"""""""""""""""""""""""""""""""""" + + If EPT is not supported by the processor or disabled in the hypervisor, + the system is fully protected. SMT can stay enabled and L1D flushing on + VMENTER is not required. + + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. + +3.3. SMT and EPT supported and active +""""""""""""""""""""""""""""""""""""" + + If SMT and EPT are supported and active then various degrees of + mitigations can be employed: + + - L1D flushing on VMENTER: + + L1D flushing on VMENTER is the minimal protection requirement, but it + is only potent in combination with other mitigation methods. + + Conditional L1D flushing is the default behaviour and can be tuned. See + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. + + - Guest confinement: + + Confinement of guests to a single or a group of physical cores which + are not running any other processes, can reduce the attack surface + significantly, but interrupts, soft interrupts and kernel threads can + still expose valuable data to a potential attacker. See + :ref:`guest_confinement`. + + - Interrupt isolation: + + Isolating the guest CPUs from interrupts can reduce the attack surface + further, but still allows a malicious guest to explore a limited amount + of host physical memory. This can at least be used to gain knowledge + about the host address space layout. The interrupts which have a fixed + affinity to the CPUs which run the untrusted guests can depending on + the scenario still trigger soft interrupts and schedule kernel threads + which might expose valuable information. See + :ref:`interrupt_isolation`. + +The above three mitigation methods combined can provide protection to a +certain degree, but the risk of the remaining attack surface has to be +carefully analyzed. For full protection the following methods are +available: + + - Disabling SMT: + + Disabling SMT and enforcing the L1D flushing provides the maximum + amount of protection. This mitigation is not depending on any of the + above mitigation methods. + + SMT control and L1D flushing can be tuned by the command line + parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run + time with the matching sysfs control files. See :ref:`smt_control`, + :ref:`mitigation_control_command_line` and + :ref:`mitigation_control_kvm`. 
+ + - Disabling EPT: + + Disabling EPT provides the maximum amount of protection as well. It is + not depending on any of the above mitigation methods. SMT can stay + enabled and L1D flushing is not required, but the performance impact is + significant. + + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' + parameter. + +3.4. Nested virtual machines +"""""""""""""""""""""""""""" + +When nested virtualization is in use, three operating systems are involved: +the bare metal hypervisor, the nested hypervisor and the nested virtual +machine. VMENTER operations from the nested hypervisor into the nested +guest will always be processed by the bare metal hypervisor. If KVM is the +bare metal hypervisor it will: + + - Flush the L1D cache on every switch from the nested hypervisor to the + nested virtual machine, so that the nested hypervisor's secrets are not + exposed to the nested virtual machine; + + - Flush the L1D cache on every switch from the nested virtual machine to + the nested hypervisor; this is a complex operation, and flushing the L1D + cache avoids that the bare metal hypervisor's secrets are exposed to the + nested virtual machine; + + - Instruct the nested hypervisor to not perform any L1D cache flush. This + is an optimization to avoid double L1D flushing. + + +.. _default_mitigations: + +Default mitigations +------------------- + + The kernel default mitigations for vulnerable processors are: + + - PTE inversion to protect against malicious user space. This is done + unconditionally and cannot be controlled. The swap storage is limited + to ~16TB. + + - L1D conditional flushing on VMENTER when EPT is enabled for + a guest. + + The kernel does not by default enforce the disabling of SMT, which leaves + SMT systems vulnerable when running untrusted guests with EPT enabled. + + The rationale for this choice is: + + - Force disabling SMT can break existing setups, especially with + unattended updates. + + - If regular users run untrusted guests on their machine, then L1TF is + just an add on to other malware which might be embedded in an untrusted + guest, e.g. spam-bots or attacks on the local network. + + There is no technical way to prevent a user from running untrusted code + on their machines blindly. + + - It's technically extremely unlikely and from today's knowledge even + impossible that L1TF can be exploited via the most popular attack + mechanisms like JavaScript because these mechanisms have no way to + control PTEs. If this would be possible and not other mitigation would + be possible, then the default might be different. + + - The administrators of cloud and hosting setups have to carefully + analyze the risk for their scenarios and make the appropriate + mitigation choices, which might even vary across their deployed + machines and also result in other changes of their overall setup. + There is no way for the kernel to provide a sensible default for this + kind of scenarios. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 0a491676685e..42247516962a 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -17,14 +17,12 @@ etc. kernel-parameters devices -This section describes CPU vulnerabilities and provides an overview of the -possible mitigations along with guidance for selecting mitigations if they -are configurable at compile, boot or run time. +This section describes CPU vulnerabilities and their mitigations. .. 
toctree:: :maxdepth: 1 - l1tf + hw-vuln/index Here is a set of documents aimed at users who are trying to track down problems and bugs in particular. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dddb024eb523..9afcb240a673 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2114,7 +2114,7 @@ Default is 'flush'. - For details see: Documentation/admin-guide/l1tf.rst + For details see: Documentation/admin-guide/hw-vuln/l1tf.rst l2cr= [PPC] diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst deleted file mode 100644 index 9af977384168..000000000000 --- a/Documentation/admin-guide/l1tf.rst +++ /dev/null @@ -1,614 +0,0 @@ -L1TF - L1 Terminal Fault -======================== - -L1 Terminal Fault is a hardware vulnerability which allows unprivileged -speculative access to data which is available in the Level 1 Data Cache -when the page table entry controlling the virtual address, which is used -for the access, has the Present bit cleared or other reserved bits set. - -Affected processors -------------------- - -This vulnerability affects a wide range of Intel processors. The -vulnerability is not present on: - - - Processors from AMD, Centaur and other non Intel vendors - - - Older processor models, where the CPU family is < 6 - - - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, - Penwell, Pineview, Silvermont, Airmont, Merrifield) - - - The Intel XEON PHI family - - - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the - IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected - by the Meltdown vulnerability either. These CPUs should become - available by end of 2018. - -Whether a processor is affected or not can be read out from the L1TF -vulnerability file in sysfs. See :ref:`l1tf_sys_info`. - -Related CVEs ------------- - -The following CVE entries are related to the L1TF vulnerability: - - ============= ================= ============================== - CVE-2018-3615 L1 Terminal Fault SGX related aspects - CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects - CVE-2018-3646 L1 Terminal Fault Virtualization related aspects - ============= ================= ============================== - -Problem -------- - -If an instruction accesses a virtual address for which the relevant page -table entry (PTE) has the Present bit cleared or other reserved bits set, -then speculative execution ignores the invalid PTE and loads the referenced -data if it is present in the Level 1 Data Cache, as if the page referenced -by the address bits in the PTE was still present and accessible. - -While this is a purely speculative mechanism and the instruction will raise -a page fault when it is retired eventually, the pure act of loading the -data and making it available to other speculative instructions opens up the -opportunity for side channel attacks to unprivileged malicious code, -similar to the Meltdown attack. - -While Meltdown breaks the user space to kernel space protection, L1TF -allows to attack any physical memory address in the system and the attack -works across all protection domains. It allows an attack of SGX and also -works from inside virtual machines because the speculation bypasses the -extended page table (EPT) protection mechanism. - - -Attack scenarios ----------------- - -1. 
Malicious user space -^^^^^^^^^^^^^^^^^^^^^^^ - - Operating Systems store arbitrary information in the address bits of a - PTE which is marked non present. This allows a malicious user space - application to attack the physical memory to which these PTEs resolve. - In some cases user-space can maliciously influence the information - encoded in the address bits of the PTE, thus making attacks more - deterministic and more practical. - - The Linux kernel contains a mitigation for this attack vector, PTE - inversion, which is permanently enabled and has no performance - impact. The kernel ensures that the address bits of PTEs, which are not - marked present, never point to cacheable physical memory space. - - A system with an up to date kernel is protected against attacks from - malicious user space applications. - -2. Malicious guest in a virtual machine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - The fact that L1TF breaks all domain protections allows malicious guest - OSes, which can control the PTEs directly, and malicious guest user - space applications, which run on an unprotected guest kernel lacking the - PTE inversion mitigation for L1TF, to attack physical host memory. - - A special aspect of L1TF in the context of virtualization is symmetric - multi threading (SMT). The Intel implementation of SMT is called - HyperThreading. The fact that Hyperthreads on the affected processors - share the L1 Data Cache (L1D) is important for this. As the flaw allows - only to attack data which is present in L1D, a malicious guest running - on one Hyperthread can attack the data which is brought into the L1D by - the context which runs on the sibling Hyperthread of the same physical - core. This context can be host OS, host user space or a different guest. - - If the processor does not support Extended Page Tables, the attack is - only possible, when the hypervisor does not sanitize the content of the - effective (shadow) page tables. - - While solutions exist to mitigate these attack vectors fully, these - mitigations are not enabled by default in the Linux kernel because they - can affect performance significantly. The kernel provides several - mechanisms which can be utilized to address the problem depending on the - deployment scenario. The mitigations, their protection scope and impact - are described in the next sections. - - The default mitigations and the rationale for choosing them are explained - at the end of this document. See :ref:`default_mitigations`. - -.. _l1tf_sys_info: - -L1TF system information ------------------------ - -The Linux kernel provides a sysfs interface to enumerate the current L1TF -status of the system: whether the system is vulnerable, and which -mitigations are active. 
The relevant sysfs file is: - -/sys/devices/system/cpu/vulnerabilities/l1tf - -The possible values in this file are: - - =========================== =============================== - 'Not affected' The processor is not vulnerable - 'Mitigation: PTE Inversion' The host protection is active - =========================== =============================== - -If KVM/VMX is enabled and the processor is vulnerable then the following -information is appended to the 'Mitigation: PTE Inversion' part: - - - SMT status: - - ===================== ================ - 'VMX: SMT vulnerable' SMT is enabled - 'VMX: SMT disabled' SMT is disabled - ===================== ================ - - - L1D Flush mode: - - ================================ ==================================== - 'L1D vulnerable' L1D flushing is disabled - - 'L1D conditional cache flushes' L1D flush is conditionally enabled - - 'L1D cache flushes' L1D flush is unconditionally enabled - ================================ ==================================== - -The resulting grade of protection is discussed in the following sections. - - -Host mitigation mechanism -------------------------- - -The kernel is unconditionally protected against L1TF attacks from malicious -user space running on the host. - - -Guest mitigation mechanisms ---------------------------- - -.. _l1d_flush: - -1. L1D flush on VMENTER -^^^^^^^^^^^^^^^^^^^^^^^ - - To make sure that a guest cannot attack data which is present in the L1D - the hypervisor flushes the L1D before entering the guest. - - Flushing the L1D evicts not only the data which should not be accessed - by a potentially malicious guest, it also flushes the guest - data. Flushing the L1D has a performance impact as the processor has to - bring the flushed guest data back into the L1D. Depending on the - frequency of VMEXIT/VMENTER and the type of computations in the guest - performance degradation in the range of 1% to 50% has been observed. For - scenarios where guest VMEXIT/VMENTER are rare the performance impact is - minimal. Virtio and mechanisms like posted interrupts are designed to - confine the VMEXITs to a bare minimum, but specific configurations and - application scenarios might still suffer from a high VMEXIT rate. - - The kernel provides two L1D flush modes: - - conditional ('cond') - - unconditional ('always') - - The conditional mode avoids L1D flushing after VMEXITs which execute - only audited code paths before the corresponding VMENTER. These code - paths have been verified that they cannot expose secrets or other - interesting data to an attacker, but they can leak information about the - address space layout of the hypervisor. - - Unconditional mode flushes L1D on all VMENTER invocations and provides - maximum protection. It has a higher overhead than the conditional - mode. The overhead cannot be quantified correctly as it depends on the - workload scenario and the resulting number of VMEXITs. - - The general recommendation is to enable L1D flush on VMENTER. The kernel - defaults to conditional mode on affected processors. - - **Note**, that L1D flush does not prevent the SMT problem because the - sibling thread will also bring back its data into the L1D which makes it - attackable again. - - L1D flush can be controlled by the administrator via the kernel command - line and sysfs control files. See :ref:`mitigation_control_command_line` - and :ref:`mitigation_control_kvm`. - -.. _guest_confinement: - -2. 
Guest VCPU confinement to dedicated physical cores -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - To address the SMT problem, it is possible to make a guest or a group of - guests affine to one or more physical cores. The proper mechanism for - that is to utilize exclusive cpusets to ensure that no other guest or - host tasks can run on these cores. - - If only a single guest or related guests run on sibling SMT threads on - the same physical core then they can only attack their own memory and - restricted parts of the host memory. - - Host memory is attackable, when one of the sibling SMT threads runs in - host OS (hypervisor) context and the other in guest context. The amount - of valuable information from the host OS context depends on the context - which the host OS executes, i.e. interrupts, soft interrupts and kernel - threads. The amount of valuable data from these contexts cannot be - declared as non-interesting for an attacker without deep inspection of - the code. - - **Note**, that assigning guests to a fixed set of physical cores affects - the ability of the scheduler to do load balancing and might have - negative effects on CPU utilization depending on the hosting - scenario. Disabling SMT might be a viable alternative for particular - scenarios. - - For further information about confining guests to a single or to a group - of cores consult the cpusets documentation: - - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt - -.. _interrupt_isolation: - -3. Interrupt affinity -^^^^^^^^^^^^^^^^^^^^^ - - Interrupts can be made affine to logical CPUs. This is not universally - true because there are types of interrupts which are truly per CPU - interrupts, e.g. the local timer interrupt. Aside of that multi queue - devices affine their interrupts to single CPUs or groups of CPUs per - queue without allowing the administrator to control the affinities. - - Moving the interrupts, which can be affinity controlled, away from CPUs - which run untrusted guests, reduces the attack vector space. - - Whether the interrupts with are affine to CPUs, which run untrusted - guests, provide interesting data for an attacker depends on the system - configuration and the scenarios which run on the system. While for some - of the interrupts it can be assumed that they won't expose interesting - information beyond exposing hints about the host OS memory layout, there - is no way to make general assumptions. - - Interrupt affinity can be controlled by the administrator via the - /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is - available at: - - https://www.kernel.org/doc/Documentation/IRQ-affinity.txt - -.. _smt_control: - -4. SMT control -^^^^^^^^^^^^^^ - - To prevent the SMT issues of L1TF it might be necessary to disable SMT - completely. Disabling SMT can have a significant performance impact, but - the impact depends on the hosting scenario and the type of workloads. - The impact of disabling SMT needs also to be weighted against the impact - of other mitigation solutions like confining guests to dedicated cores. - - The kernel provides a sysfs interface to retrieve the status of SMT and - to control it. It also provides a kernel command line interface to - control SMT. - - The kernel command line interface consists of the following options: - - =========== ========================================================== - nosmt Affects the bring up of the secondary CPUs during boot. The - kernel tries to bring all present CPUs online during the - boot process. 
"nosmt" makes sure that from each physical - core only one - the so called primary (hyper) thread is - activated. Due to a design flaw of Intel processors related - to Machine Check Exceptions the non primary siblings have - to be brought up at least partially and are then shut down - again. "nosmt" can be undone via the sysfs interface. - - nosmt=force Has the same effect as "nosmt" but it does not allow to - undo the SMT disable via the sysfs interface. - =========== ========================================================== - - The sysfs interface provides two files: - - - /sys/devices/system/cpu/smt/control - - /sys/devices/system/cpu/smt/active - - /sys/devices/system/cpu/smt/control: - - This file allows to read out the SMT control state and provides the - ability to disable or (re)enable SMT. The possible states are: - - ============== =================================================== - on SMT is supported by the CPU and enabled. All - logical CPUs can be onlined and offlined without - restrictions. - - off SMT is supported by the CPU and disabled. Only - the so called primary SMT threads can be onlined - and offlined without restrictions. An attempt to - online a non-primary sibling is rejected - - forceoff Same as 'off' but the state cannot be controlled. - Attempts to write to the control file are rejected. - - notsupported The processor does not support SMT. It's therefore - not affected by the SMT implications of L1TF. - Attempts to write to the control file are rejected. - ============== =================================================== - - The possible states which can be written into this file to control SMT - state are: - - - on - - off - - forceoff - - /sys/devices/system/cpu/smt/active: - - This file reports whether SMT is enabled and active, i.e. if on any - physical core two or more sibling threads are online. - - SMT control is also possible at boot time via the l1tf kernel command - line parameter in combination with L1D flush control. See - :ref:`mitigation_control_command_line`. - -5. Disabling EPT -^^^^^^^^^^^^^^^^ - - Disabling EPT for virtual machines provides full mitigation for L1TF even - with SMT enabled, because the effective page tables for guests are - managed and sanitized by the hypervisor. Though disabling EPT has a - significant performance impact especially when the Meltdown mitigation - KPTI is enabled. - - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. - -There is ongoing research and development for new mitigation mechanisms to -address the performance impact of disabling SMT or EPT. - -.. _mitigation_control_command_line: - -Mitigation control on the kernel command line ---------------------------------------------- - -The kernel command line allows to control the L1TF mitigations at boot -time with the option "l1tf=". The valid arguments for this option are: - - ============ ============================================================= - full Provides all available mitigations for the L1TF - vulnerability. Disables SMT and enables all mitigations in - the hypervisors, i.e. unconditional L1D flushing - - SMT control and L1D flush control via the sysfs interface - is still possible after boot. Hypervisors will issue a - warning when the first VM is started in a potentially - insecure configuration, i.e. SMT enabled or L1D flush - disabled. - - full,force Same as 'full', but disables SMT and L1D flush runtime - control. Implies the 'nosmt=force' command line option. - (i.e. sysfs control of SMT is disabled.) 
- - flush Leaves SMT enabled and enables the default hypervisor - mitigation, i.e. conditional L1D flushing - - SMT control and L1D flush control via the sysfs interface - is still possible after boot. Hypervisors will issue a - warning when the first VM is started in a potentially - insecure configuration, i.e. SMT enabled or L1D flush - disabled. - - flush,nosmt Disables SMT and enables the default hypervisor mitigation, - i.e. conditional L1D flushing. - - SMT control and L1D flush control via the sysfs interface - is still possible after boot. Hypervisors will issue a - warning when the first VM is started in a potentially - insecure configuration, i.e. SMT enabled or L1D flush - disabled. - - flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is - started in a potentially insecure configuration. - - off Disables hypervisor mitigations and doesn't emit any - warnings. - It also drops the swap size and available RAM limit restrictions - on both hypervisor and bare metal. - - ============ ============================================================= - -The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. - - -.. _mitigation_control_kvm: - -Mitigation control for KVM - module parameter -------------------------------------------------------------- - -The KVM hypervisor mitigation mechanism, flushing the L1D cache when -entering a guest, can be controlled with a module parameter. - -The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the -following arguments: - - ============ ============================================================== - always L1D cache flush on every VMENTER. - - cond Flush L1D on VMENTER only when the code between VMEXIT and - VMENTER can leak host memory which is considered - interesting for an attacker. This still can leak host memory - which allows e.g. to determine the hosts address space layout. - - never Disables the mitigation - ============ ============================================================== - -The parameter can be provided on the kernel command line, as a module -parameter when loading the modules and at runtime modified via the sysfs -file: - -/sys/module/kvm_intel/parameters/vmentry_l1d_flush - -The default is 'cond'. If 'l1tf=full,force' is given on the kernel command -line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush -module parameter is ignored and writes to the sysfs file are rejected. - - -Mitigation selection guide --------------------------- - -1. No virtualization in use -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - The system is protected by the kernel unconditionally and no further - action is required. - -2. Virtualization with trusted guests -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - If the guest comes from a trusted source and the guest OS kernel is - guaranteed to have the L1TF mitigations in place the system is fully - protected against L1TF and no further action is required. - - To avoid the overhead of the default L1D flushing on VMENTER the - administrator can disable the flushing via the kernel command line and - sysfs control files. See :ref:`mitigation_control_command_line` and - :ref:`mitigation_control_kvm`. - - -3. Virtualization with untrusted guests -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -3.1. SMT not supported or disabled -"""""""""""""""""""""""""""""""""" - - If SMT is not supported by the processor or disabled in the BIOS or by - the kernel, it's only required to enforce L1D flushing on VMENTER. 
- - Conditional L1D flushing is the default behaviour and can be tuned. See - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. - -3.2. EPT not supported or disabled -"""""""""""""""""""""""""""""""""" - - If EPT is not supported by the processor or disabled in the hypervisor, - the system is fully protected. SMT can stay enabled and L1D flushing on - VMENTER is not required. - - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. - -3.3. SMT and EPT supported and active -""""""""""""""""""""""""""""""""""""" - - If SMT and EPT are supported and active then various degrees of - mitigations can be employed: - - - L1D flushing on VMENTER: - - L1D flushing on VMENTER is the minimal protection requirement, but it - is only potent in combination with other mitigation methods. - - Conditional L1D flushing is the default behaviour and can be tuned. See - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. - - - Guest confinement: - - Confinement of guests to a single or a group of physical cores which - are not running any other processes, can reduce the attack surface - significantly, but interrupts, soft interrupts and kernel threads can - still expose valuable data to a potential attacker. See - :ref:`guest_confinement`. - - - Interrupt isolation: - - Isolating the guest CPUs from interrupts can reduce the attack surface - further, but still allows a malicious guest to explore a limited amount - of host physical memory. This can at least be used to gain knowledge - about the host address space layout. The interrupts which have a fixed - affinity to the CPUs which run the untrusted guests can depending on - the scenario still trigger soft interrupts and schedule kernel threads - which might expose valuable information. See - :ref:`interrupt_isolation`. - -The above three mitigation methods combined can provide protection to a -certain degree, but the risk of the remaining attack surface has to be -carefully analyzed. For full protection the following methods are -available: - - - Disabling SMT: - - Disabling SMT and enforcing the L1D flushing provides the maximum - amount of protection. This mitigation is not depending on any of the - above mitigation methods. - - SMT control and L1D flushing can be tuned by the command line - parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run - time with the matching sysfs control files. See :ref:`smt_control`, - :ref:`mitigation_control_command_line` and - :ref:`mitigation_control_kvm`. - - - Disabling EPT: - - Disabling EPT provides the maximum amount of protection as well. It is - not depending on any of the above mitigation methods. SMT can stay - enabled and L1D flushing is not required, but the performance impact is - significant. - - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' - parameter. - -3.4. Nested virtual machines -"""""""""""""""""""""""""""" - -When nested virtualization is in use, three operating systems are involved: -the bare metal hypervisor, the nested hypervisor and the nested virtual -machine. VMENTER operations from the nested hypervisor into the nested -guest will always be processed by the bare metal hypervisor. 
If KVM is the -bare metal hypervisor it will: - - - Flush the L1D cache on every switch from the nested hypervisor to the - nested virtual machine, so that the nested hypervisor's secrets are not - exposed to the nested virtual machine; - - - Flush the L1D cache on every switch from the nested virtual machine to - the nested hypervisor; this is a complex operation, and flushing the L1D - cache avoids that the bare metal hypervisor's secrets are exposed to the - nested virtual machine; - - - Instruct the nested hypervisor to not perform any L1D cache flush. This - is an optimization to avoid double L1D flushing. - - -.. _default_mitigations: - -Default mitigations -------------------- - - The kernel default mitigations for vulnerable processors are: - - - PTE inversion to protect against malicious user space. This is done - unconditionally and cannot be controlled. The swap storage is limited - to ~16TB. - - - L1D conditional flushing on VMENTER when EPT is enabled for - a guest. - - The kernel does not by default enforce the disabling of SMT, which leaves - SMT systems vulnerable when running untrusted guests with EPT enabled. - - The rationale for this choice is: - - - Force disabling SMT can break existing setups, especially with - unattended updates. - - - If regular users run untrusted guests on their machine, then L1TF is - just an add on to other malware which might be embedded in an untrusted - guest, e.g. spam-bots or attacks on the local network. - - There is no technical way to prevent a user from running untrusted code - on their machines blindly. - - - It's technically extremely unlikely and from today's knowledge even - impossible that L1TF can be exploited via the most popular attack - mechanisms like JavaScript because these mechanisms have no way to - control PTEs. If this would be possible and not other mitigation would - be possible, then the default might be different. - - - The administrators of cloud and hosting setups have to carefully - analyze the risk for their scenarios and make the appropriate - mitigation choices, which might even vary across their deployed - machines and also result in other changes of their overall setup. - There is no way for the kernel to provide a sensible default for this - kind of scenarios. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 95cda38c8785..373ae1dcd301 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1107,7 +1107,7 @@ static void __init l1tf_select_mitigation(void) pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", half_pa); pr_info("However, doing so will make a part of your RAM unusable.\n"); - pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n"); + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); return; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 544bd24a9c1e..b0597507bde7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6801,8 +6801,8 @@ free_partial_vcpu: return ERR_PTR(err); } -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. 
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" static int vmx_vm_init(struct kvm *kvm) { -- cgit v1.2.3-59-g8ed1b From b8f3b15a7ba0a91de86547de8bbd5a633db03ab1 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 15 Mar 2019 16:31:33 +0000 Subject: MIPS: entry: Remove unneeded need_resched() loop Since the enabling and disabling of IRQs within preempt_schedule_irq() is contained in a need_resched() loop, we don't need the outer arch code loop. Note that commit a18815abcdfd ("Use preempt_schedule_irq.") initially removed the existing loop, but missed the final branch to restore_all. Commit cdaed73afb61 ("Fix preemption bug.") missed that and reintroduced the loop. Signed-off-by: Valentin Schneider Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: linux-mips@vger.kernel.org Signed-off-by: Paul Burton Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/entry.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/entry.S b/arch/mips/kernel/entry.S index d7de8adcfcc8..5469d43b6966 100644 --- a/arch/mips/kernel/entry.S +++ b/arch/mips/kernel/entry.S @@ -58,15 +58,14 @@ resume_kernel: local_irq_disable lw t0, TI_PRE_COUNT($28) bnez t0, restore_all -need_resched: LONG_L t0, TI_FLAGS($28) andi t1, t0, _TIF_NEED_RESCHED beqz t1, restore_all LONG_L t0, PT_STATUS(sp) # Interrupts off? 
andi t0, 1 beqz t0, restore_all - jal preempt_schedule_irq - b need_resched + PTR_LA ra, restore_all + j preempt_schedule_irq #endif FEXPORT(ret_from_kernel_thread) -- cgit v1.2.3-59-g8ed1b From 0d1d17b9ff8e7d69b2169337b263822355404a86 Mon Sep 17 00:00:00 2001 From: Hassan Naveed Date: Tue, 12 Mar 2019 22:47:56 +0000 Subject: MIPS: uasm: Add div, mul and sel instructions for mipsr6 Add the following instructions for use by eBPF on mipsr6: insn_ddivu_r6, insn_divu_r6, insn_dmodu, insn_dmulu, insn_modu, insn_mulu, insn_seleqz, insn_selnez Signed-off-by: Hassan Naveed Reviewed-by: Paul Burton Signed-off-by: Paul Burton Cc: kafai@fb.com Cc: songliubraving@fb.com Cc: yhs@fb.com Cc: netdev@vger.kernel.org Cc: bpf@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: Ralf Baechle Cc: James Hogan Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: open list:MIPS Cc: open list --- arch/mips/include/asm/uasm.h | 8 ++++++++ arch/mips/include/uapi/asm/inst.h | 6 +++--- arch/mips/mm/uasm-mips.c | 14 ++++++++++++++ arch/mips/mm/uasm.c | 39 ++++++++++++++++++++++++--------------- 4 files changed, 49 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index b1990dd75f27..f7effca791a5 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -86,14 +86,18 @@ Ip_u2u1(_ctcmsa); Ip_u2u1s3(_daddiu); Ip_u3u1u2(_daddu); Ip_u1u2(_ddivu); +Ip_u3u1u2(_ddivu_r6); Ip_u1(_di); Ip_u2u1msbu3(_dins); Ip_u2u1msbu3(_dinsm); Ip_u2u1msbu3(_dinsu); Ip_u1u2(_divu); +Ip_u3u1u2(_divu_r6); Ip_u1u2u3(_dmfc0); +Ip_u3u1u2(_dmodu); Ip_u1u2u3(_dmtc0); Ip_u1u2(_dmultu); +Ip_u3u1u2(_dmulu); Ip_u2u1u3(_drotr); Ip_u2u1u3(_drotr32); Ip_u2u1(_dsbh); @@ -131,6 +135,7 @@ Ip_u1u2u3(_mfc0); Ip_u1u2u3(_mfhc0); Ip_u1(_mfhi); Ip_u1(_mflo); +Ip_u3u1u2(_modu); Ip_u3u1u2(_movn); Ip_u3u1u2(_movz); Ip_u1u2u3(_mtc0); @@ -139,6 +144,7 @@ Ip_u1(_mthi); Ip_u1(_mtlo); Ip_u3u1u2(_mul); Ip_u1u2(_multu); +Ip_u3u1u2(_mulu); Ip_u3u1u2(_nor); Ip_u3u1u2(_or); Ip_u2u1u3(_ori); @@ -149,6 +155,8 @@ Ip_u2s3u1(_sb); Ip_u2s3u1(_sc); Ip_u2s3u1(_scd); Ip_u2s3u1(_sd); +Ip_u3u1u2(_seleqz); +Ip_u3u1u2(_selnez); Ip_u2s3u1(_sh); Ip_u2u1u3(_sll); Ip_u3u2u1(_sllv); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 40fbb5dd66df..eaa3a80affdf 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -55,9 +55,9 @@ enum spec_op { spec3_unused_op, spec4_unused_op, slt_op, sltu_op, dadd_op, daddu_op, dsub_op, dsubu_op, tge_op, tgeu_op, tlt_op, tltu_op, - teq_op, spec5_unused_op, tne_op, spec6_unused_op, - dsll_op, spec7_unused_op, dsrl_op, dsra_op, - dsll32_op, spec8_unused_op, dsrl32_op, dsra32_op + teq_op, seleqz_op, tne_op, selnez_op, + dsll_op, spec5_unused_op, dsrl_op, dsra_op, + dsll32_op, spec6_unused_op, dsrl32_op, dsra32_op }; /* diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 6abe40fc413d..7154a1d99aad 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -76,14 +76,22 @@ static const struct insn insn_table[insn_invalid] = { [insn_daddiu] = {M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM}, [insn_daddu] = {M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD}, [insn_ddivu] = {M(spec_op, 0, 0, 0, 0, ddivu_op), RS | RT}, + [insn_ddivu_r6] = {M(spec_op, 0, 0, 0, ddivu_ddivu6_op, ddivu_op), + RS | RT | RD}, [insn_di] = {M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT}, [insn_dins] = {M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE}, [insn_dinsm] = {M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | 
RE}, [insn_dinsu] = {M(spec3_op, 0, 0, 0, 0, dinsu_op), RS | RT | RD | RE}, [insn_divu] = {M(spec_op, 0, 0, 0, 0, divu_op), RS | RT}, + [insn_divu_r6] = {M(spec_op, 0, 0, 0, divu_divu6_op, divu_op), + RS | RT | RD}, [insn_dmfc0] = {M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET}, + [insn_dmodu] = {M(spec_op, 0, 0, 0, ddivu_dmodu_op, ddivu_op), + RS | RT | RD}, [insn_dmtc0] = {M(cop0_op, dmtc_op, 0, 0, 0, 0), RT | RD | SET}, [insn_dmultu] = {M(spec_op, 0, 0, 0, 0, dmultu_op), RS | RT}, + [insn_dmulu] = {M(spec_op, 0, 0, 0, dmult_dmul_op, dmultu_op), + RS | RT | RD}, [insn_drotr] = {M(spec_op, 1, 0, 0, 0, dsrl_op), RT | RD | RE}, [insn_drotr32] = {M(spec_op, 1, 0, 0, 0, dsrl32_op), RT | RD | RE}, [insn_dsbh] = {M(spec3_op, 0, 0, 0, dsbh_op, dbshfl_op), RT | RD}, @@ -132,12 +140,16 @@ static const struct insn insn_table[insn_invalid] = { [insn_mfhc0] = {M(cop0_op, mfhc0_op, 0, 0, 0, 0), RT | RD | SET}, [insn_mfhi] = {M(spec_op, 0, 0, 0, 0, mfhi_op), RD}, [insn_mflo] = {M(spec_op, 0, 0, 0, 0, mflo_op), RD}, + [insn_modu] = {M(spec_op, 0, 0, 0, divu_modu_op, divu_op), + RS | RT | RD}, [insn_movn] = {M(spec_op, 0, 0, 0, 0, movn_op), RS | RT | RD}, [insn_movz] = {M(spec_op, 0, 0, 0, 0, movz_op), RS | RT | RD}, [insn_mtc0] = {M(cop0_op, mtc_op, 0, 0, 0, 0), RT | RD | SET}, [insn_mthc0] = {M(cop0_op, mthc0_op, 0, 0, 0, 0), RT | RD | SET}, [insn_mthi] = {M(spec_op, 0, 0, 0, 0, mthi_op), RS}, [insn_mtlo] = {M(spec_op, 0, 0, 0, 0, mtlo_op), RS}, + [insn_mulu] = {M(spec_op, 0, 0, 0, multu_mulu_op, multu_op), + RS | RT | RD}, #ifndef CONFIG_CPU_MIPSR6 [insn_mul] = {M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD}, #else @@ -163,6 +175,8 @@ static const struct insn insn_table[insn_invalid] = { [insn_scd] = {M6(spec3_op, 0, 0, 0, scd6_op), RS | RT | SIMM9}, #endif [insn_sd] = {M(sd_op, 0, 0, 0, 0, 0), RS | RT | SIMM}, + [insn_seleqz] = {M(spec_op, 0, 0, 0, 0, seleqz_op), RS | RT | RD}, + [insn_selnez] = {M(spec_op, 0, 0, 0, 0, selnez_op), RS | RT | RD}, [insn_sh] = {M(sh_op, 0, 0, 0, 0, 0), RS | RT | SIMM}, [insn_sll] = {M(spec_op, 0, 0, 0, 0, sll_op), RT | RD | RE}, [insn_sllv] = {M(spec_op, 0, 0, 0, 0, sllv_op), RS | RT | RD}, diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index 45b6264ff308..c56f129c9a4b 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -50,21 +50,22 @@ enum opcode { insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bgtz, insn_blez, insn_bltz, insn_bltzl, insn_bne, insn_break, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa, insn_daddiu, insn_daddu, insn_ddivu, - insn_di, insn_dins, insn_dinsm, insn_dinsu, insn_divu, insn_dmfc0, - insn_dmtc0, insn_dmultu, insn_drotr, insn_drotr32, insn_dsbh, insn_dshd, - insn_dsll, insn_dsll32, insn_dsllv, insn_dsra, insn_dsra32, insn_dsrav, - insn_dsrl, insn_dsrl32, insn_dsrlv, insn_dsubu, insn_eret, insn_ext, - insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_lbu, - insn_ld, insn_lddir, insn_ldpte, insn_ldx, insn_lh, insn_lhu, - insn_ll, insn_lld, insn_lui, insn_lw, insn_lwu, insn_lwx, insn_mfc0, - insn_mfhc0, insn_mfhi, insn_mflo, insn_movn, insn_movz, insn_mtc0, - insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_nor, - insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sb, - insn_sc, insn_scd, insn_sd, insn_sh, insn_sll, insn_sllv, - insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra, insn_srav, - insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, - insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, - insn_xor, insn_xori, insn_yield, + insn_ddivu_r6, 
insn_di, insn_dins, insn_dinsm, insn_dinsu, insn_divu, + insn_divu_r6, insn_dmfc0, insn_dmodu, insn_dmtc0, insn_dmultu, + insn_dmulu, insn_drotr, insn_drotr32, insn_dsbh, insn_dshd, insn_dsll, + insn_dsll32, insn_dsllv, insn_dsra, insn_dsra32, insn_dsrav, insn_dsrl, + insn_dsrl32, insn_dsrlv, insn_dsubu, insn_eret, insn_ext, insn_ins, + insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_lbu, insn_ld, + insn_lddir, insn_ldpte, insn_ldx, insn_lh, insn_lhu, insn_ll, insn_lld, + insn_lui, insn_lw, insn_lwu, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, + insn_mflo, insn_modu, insn_movn, insn_movz, insn_mtc0, insn_mthc0, + insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_nor, + insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sb, insn_sc, + insn_scd, insn_seleqz, insn_selnez, insn_sd, insn_sh, insn_sll, + insn_sllv, insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra, + insn_srav, insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, + insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, + insn_wsbh, insn_xor, insn_xori, insn_yield, insn_invalid /* insn_invalid must be last */ }; @@ -287,13 +288,17 @@ I_u2u1(_cfcmsa) I_u1u2(_ctc1) I_u2u1(_ctcmsa) I_u1u2(_ddivu) +I_u3u1u2(_ddivu_r6) I_u1u2u3(_dmfc0) +I_u3u1u2(_dmodu) I_u1u2u3(_dmtc0) I_u1u2(_dmultu) +I_u3u1u2(_dmulu) I_u2u1s3(_daddiu) I_u3u1u2(_daddu) I_u1(_di); I_u1u2(_divu) +I_u3u1u2(_divu_r6) I_u2u1(_dsbh); I_u2u1(_dshd); I_u2u1u3(_dsll) @@ -327,6 +332,7 @@ I_u2s3u1(_lw) I_u2s3u1(_lwu) I_u1u2u3(_mfc0) I_u1u2u3(_mfhc0) +I_u3u1u2(_modu) I_u3u1u2(_movn) I_u3u1u2(_movz) I_u1(_mfhi) @@ -337,6 +343,7 @@ I_u1(_mthi) I_u1(_mtlo) I_u3u1u2(_mul) I_u1u2(_multu) +I_u3u1u2(_mulu) I_u3u1u2(_nor) I_u3u1u2(_or) I_u2u1u3(_ori) @@ -345,6 +352,8 @@ I_u2s3u1(_sb) I_u2s3u1(_sc) I_u2s3u1(_scd) I_u2s3u1(_sd) +I_u3u1u2(_seleqz) +I_u3u1u2(_selnez) I_u2s3u1(_sh) I_u2u1u3(_sll) I_u3u2u1(_sllv) -- cgit v1.2.3-59-g8ed1b From 6c2c8a18886897226ee43e77c0149a8e1874274a Mon Sep 17 00:00:00 2001 From: Hassan Naveed Date: Tue, 12 Mar 2019 22:48:06 +0000 Subject: MIPS: eBPF: Provide eBPF support for MIPS64R6 Currently eBPF support is available on MIPS64R2 only. Use MIPS64R6 variants of instructions like multiply, divide, movn, movz so eBPF can run on the newer ISA. Also, we only need to check the ISA revision before JIT'ing code, since we already know the CPU is running a 64-bit kernel: the eBPF JIT is only included in kernels with CONFIG_64BIT=y due to Kconfig dependencies. 
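To make the ISA difference concrete (an illustrative sketch, not part of the patch): the pre-R6 multiply and divide instructions deposit their results in the HI/LO register pair and need a follow-up mflo/mfhi, while the R6 variants are three-operand forms that write a general-purpose register directly. Using the uasm emitters added above, a 64-bit unsigned multiply helper could look like this (buffer handling simplified; MIPS_ISA_REV comes from asm/isa-rev.h):

#include <asm/isa-rev.h>
#include <asm/uasm.h>

/* Emit "dst = dst * src" (low 64 bits, unsigned) into the buffer at *p. */
static void emit_mul64(u32 **p, unsigned int dst, unsigned int src)
{
	if (MIPS_ISA_REV >= 6) {
		/* R6: dmulu rd, rs, rt writes the result straight to rd */
		uasm_i_dmulu(p, dst, dst, src);
	} else {
		/* pre-R6: multiply into HI/LO, then move LO into dst */
		uasm_i_dmultu(p, dst, src);
		uasm_i_mflo(p, dst);
	}
}

The eBPF JIT change below follows exactly this shape for BPF_MUL, BPF_DIV and BPF_MOD.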
Signed-off-by: Hassan Naveed Signed-off-by: Paul Burton Cc: kafai@fb.com Cc: songliubraving@fb.com Cc: yhs@fb.com Cc: netdev@vger.kernel.org Cc: bpf@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: Ralf Baechle Cc: James Hogan Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: open list:MIPS Cc: open list --- arch/mips/net/ebpf_jit.c | 78 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 0effd3cba9a7..26eef9ad3896 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* Registers used by JIT */ @@ -677,8 +678,12 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (insn->imm == 1) /* Mult by 1 is a nop */ break; gen_imm_to_reg(insn, MIPS_R_AT, ctx); - emit_instr(ctx, dmultu, MIPS_R_AT, dst); - emit_instr(ctx, mflo, dst); + if (MIPS_ISA_REV >= 6) { + emit_instr(ctx, dmulu, dst, dst, MIPS_R_AT); + } else { + emit_instr(ctx, dmultu, MIPS_R_AT, dst); + emit_instr(ctx, mflo, dst); + } break; case BPF_ALU64 | BPF_NEG | BPF_K: /* ALU64_IMM */ dst = ebpf_to_mips_reg(ctx, insn, dst_reg); @@ -700,8 +705,12 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (insn->imm == 1) /* Mult by 1 is a nop */ break; gen_imm_to_reg(insn, MIPS_R_AT, ctx); - emit_instr(ctx, multu, dst, MIPS_R_AT); - emit_instr(ctx, mflo, dst); + if (MIPS_ISA_REV >= 6) { + emit_instr(ctx, mulu, dst, dst, MIPS_R_AT); + } else { + emit_instr(ctx, multu, dst, MIPS_R_AT); + emit_instr(ctx, mflo, dst); + } break; case BPF_ALU | BPF_NEG | BPF_K: /* ALU_IMM */ dst = ebpf_to_mips_reg(ctx, insn, dst_reg); @@ -732,6 +741,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; } gen_imm_to_reg(insn, MIPS_R_AT, ctx); + if (MIPS_ISA_REV >= 6) { + if (bpf_op == BPF_DIV) + emit_instr(ctx, divu_r6, dst, dst, MIPS_R_AT); + else + emit_instr(ctx, modu, dst, dst, MIPS_R_AT); + break; + } emit_instr(ctx, divu, dst, MIPS_R_AT); if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); @@ -754,6 +770,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; } gen_imm_to_reg(insn, MIPS_R_AT, ctx); + if (MIPS_ISA_REV >= 6) { + if (bpf_op == BPF_DIV) + emit_instr(ctx, ddivu_r6, dst, dst, MIPS_R_AT); + else + emit_instr(ctx, modu, dst, dst, MIPS_R_AT); + break; + } emit_instr(ctx, ddivu, dst, MIPS_R_AT); if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); @@ -819,11 +842,23 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, and, dst, dst, src); break; case BPF_MUL: - emit_instr(ctx, dmultu, dst, src); - emit_instr(ctx, mflo, dst); + if (MIPS_ISA_REV >= 6) { + emit_instr(ctx, dmulu, dst, dst, src); + } else { + emit_instr(ctx, dmultu, dst, src); + emit_instr(ctx, mflo, dst); + } break; case BPF_DIV: case BPF_MOD: + if (MIPS_ISA_REV >= 6) { + if (bpf_op == BPF_DIV) + emit_instr(ctx, ddivu_r6, + dst, dst, src); + else + emit_instr(ctx, modu, dst, dst, src); + break; + } emit_instr(ctx, ddivu, dst, src); if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); @@ -903,6 +938,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_DIV: case BPF_MOD: + if (MIPS_ISA_REV >= 6) { + if (bpf_op == BPF_DIV) + emit_instr(ctx, divu_r6, dst, dst, src); + else + emit_instr(ctx, modu, dst, dst, src); + break; + } emit_instr(ctx, divu, dst, src); if (bpf_op == BPF_DIV) 
emit_instr(ctx, mflo, dst); @@ -1006,8 +1048,15 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, dsubu, MIPS_R_T8, dst, src); emit_instr(ctx, sltu, MIPS_R_AT, dst, src); /* SP known to be non-zero, movz becomes boolean not */ - emit_instr(ctx, movz, MIPS_R_T9, MIPS_R_SP, MIPS_R_T8); - emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_ZERO, MIPS_R_T8); + if (MIPS_ISA_REV >= 6) { + emit_instr(ctx, seleqz, MIPS_R_T9, + MIPS_R_SP, MIPS_R_T8); + } else { + emit_instr(ctx, movz, MIPS_R_T9, + MIPS_R_SP, MIPS_R_T8); + emit_instr(ctx, movn, MIPS_R_T9, + MIPS_R_ZERO, MIPS_R_T8); + } emit_instr(ctx, or, MIPS_R_AT, MIPS_R_T9, MIPS_R_AT); cmp_eq = bpf_op == BPF_JGT; dst = MIPS_R_AT; @@ -1366,6 +1415,17 @@ jeq_common: if (src < 0) return src; if (BPF_MODE(insn->code) == BPF_XADD) { + /* + * If mem_off does not fit within the 9 bit ll/sc + * instruction immediate field, use a temp reg. + */ + if (MIPS_ISA_REV >= 6 && + (mem_off >= BIT(8) || mem_off < -BIT(8))) { + emit_instr(ctx, daddiu, MIPS_R_T6, + dst, mem_off); + mem_off = 0; + dst = MIPS_R_T6; + } switch (BPF_SIZE(insn->code)) { case BPF_W: if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) { @@ -1720,7 +1780,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) unsigned int image_size; u8 *image_ptr; - if (!prog->jit_requested || !cpu_has_mips64r2) + if (!prog->jit_requested || MIPS_ISA_REV < 2) return prog; tmp = bpf_jit_blind_constants(prog); -- cgit v1.2.3-59-g8ed1b From 716850ab104db154549b01e63d5c71076434b064 Mon Sep 17 00:00:00 2001 From: Hassan Naveed Date: Tue, 12 Mar 2019 22:48:12 +0000 Subject: MIPS: eBPF: Initial eBPF support for MIPS32 architecture. Currently MIPS32 supports a JIT for classic BPF only, not extended BPF. This patch adds JIT support for extended BPF on MIPS32, so code is actually JIT'ed instead of being only interpreted. Instructions with 64-bit operands are not supported at this point. We can delete classic BPF because the kernel will translate classic BPF programs into extended BPF and JIT them, eliminating the need for classic BPF. 
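For context on why the deletion is safe: classic filters are converted by the core before they ever reach a JIT. A paraphrased sketch of the setup path in net/core/filter.c (not verbatim kernel code; bpf_migrate_filter() is a static helper there, and bpf_int_jit_compile() is the eBPF JIT entry point seen in the diff below):

/*
 * Without HAVE_CBPF_JIT, bpf_jit_compile() is an empty stub, so every
 * classic filter falls through to the eBPF translation and is later
 * compiled by the eBPF JIT via bpf_int_jit_compile().
 */
static struct bpf_prog *prepare_filter(struct bpf_prog *fp)
{
	bpf_jit_compile(fp);		/* classic JIT; now a no-op on MIPS */

	if (!fp->jited)
		fp = bpf_migrate_filter(fp);	/* cBPF -> eBPF insns */

	return fp;
}

Dropping HAVE_CBPF_JIT from the MIPS Kconfig therefore just routes old-style filters through this path, which the following diff relies on.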
Signed-off-by: Hassan Naveed Signed-off-by: Paul Burton Cc: kafai@fb.com Cc: songliubraving@fb.com Cc: yhs@fb.com Cc: netdev@vger.kernel.org Cc: bpf@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: Ralf Baechle Cc: James Hogan Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: open list:MIPS Cc: open list --- arch/mips/Kconfig | 3 +- arch/mips/net/Makefile | 1 - arch/mips/net/bpf_jit.c | 1270 ------------------------------------------- arch/mips/net/bpf_jit_asm.S | 285 ---------- arch/mips/net/ebpf_jit.c | 113 ++-- 5 files changed, 70 insertions(+), 1602 deletions(-) delete mode 100644 arch/mips/net/bpf_jit.c delete mode 100644 arch/mips/net/bpf_jit_asm.S (limited to 'arch') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4a5f5b0ee9a9..cbf16db3366c 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -44,8 +44,7 @@ config MIPS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT - select HAVE_CBPF_JIT if (!64BIT && !CPU_MICROMIPS) - select HAVE_EBPF_JIT if (64BIT && !CPU_MICROMIPS) + select HAVE_EBPF_JIT if (!CPU_MICROMIPS) select HAVE_CONTEXT_TRACKING select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT diff --git a/arch/mips/net/Makefile b/arch/mips/net/Makefile index 47d678416715..72a78462f872 100644 --- a/arch/mips/net/Makefile +++ b/arch/mips/net/Makefile @@ -1,4 +1,3 @@ # MIPS networking code -obj-$(CONFIG_MIPS_CBPF_JIT) += bpf_jit.o bpf_jit_asm.o obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c deleted file mode 100644 index 3a0e34f4e615..000000000000 --- a/arch/mips/net/bpf_jit.c +++ /dev/null @@ -1,1270 +0,0 @@ -/* - * Just-In-Time compiler for BPF filters on MIPS - * - * Copyright (c) 2014 Imagination Technologies Ltd. - * Author: Markos Chandras - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; version 2 of the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "bpf_jit.h" - -/* ABI - * r_skb_hl SKB header length - * r_data SKB data pointer - * r_off Offset - * r_A BPF register A - * r_X BPF register X - * r_skb *skb - * r_M *scratch memory - * r_skb_len SKB length - * - * On entry (*bpf_func)(*skb, *filter) - * a0 = MIPS_R_A0 = skb; - * a1 = MIPS_R_A1 = filter; - * - * Stack - * ... - * M[15] - * M[14] - * M[13] - * ... - * M[0] <-- r_M - * saved reg k-1 - * saved reg k-2 - * ... 
- * saved reg 0 <-- r_sp - * - * - * Packet layout - * - * <--------------------- len ------------------------> - * <--skb-len(r_skb_hl)-->< ----- skb->data_len ------> - * ---------------------------------------------------- - * | skb->data | - * ---------------------------------------------------- - */ - -#define ptr typeof(unsigned long) - -#define SCRATCH_OFF(k) (4 * (k)) - -/* JIT flags */ -#define SEEN_CALL (1 << BPF_MEMWORDS) -#define SEEN_SREG_SFT (BPF_MEMWORDS + 1) -#define SEEN_SREG_BASE (1 << SEEN_SREG_SFT) -#define SEEN_SREG(x) (SEEN_SREG_BASE << (x)) -#define SEEN_OFF SEEN_SREG(2) -#define SEEN_A SEEN_SREG(3) -#define SEEN_X SEEN_SREG(4) -#define SEEN_SKB SEEN_SREG(5) -#define SEEN_MEM SEEN_SREG(6) -/* SEEN_SK_DATA also implies skb_hl an skb_len */ -#define SEEN_SKB_DATA (SEEN_SREG(7) | SEEN_SREG(1) | SEEN_SREG(0)) - -/* Arguments used by JIT */ -#define ARGS_USED_BY_JIT 2 /* only applicable to 64-bit */ - -#define SBIT(x) (1 << (x)) /* Signed version of BIT() */ - -/** - * struct jit_ctx - JIT context - * @skf: The sk_filter - * @prologue_bytes: Number of bytes for prologue - * @idx: Instruction index - * @flags: JIT flags - * @offsets: Instruction offsets - * @target: Memory location for the compiled filter - */ -struct jit_ctx { - const struct bpf_prog *skf; - unsigned int prologue_bytes; - u32 idx; - u32 flags; - u32 *offsets; - u32 *target; -}; - - -static inline int optimize_div(u32 *k) -{ - /* power of 2 divides can be implemented with right shift */ - if (!(*k & (*k-1))) { - *k = ilog2(*k); - return 1; - } - - return 0; -} - -static inline void emit_jit_reg_move(ptr dst, ptr src, struct jit_ctx *ctx); - -/* Simply emit the instruction if the JIT memory space has been allocated */ -#define emit_instr(ctx, func, ...) \ -do { \ - if ((ctx)->target != NULL) { \ - u32 *p = &(ctx)->target[ctx->idx]; \ - uasm_i_##func(&p, ##__VA_ARGS__); \ - } \ - (ctx)->idx++; \ -} while (0) - -/* - * Similar to emit_instr but it must be used when we need to emit - * 32-bit or 64-bit instructions - */ -#define emit_long_instr(ctx, func, ...) 
\ -do { \ - if ((ctx)->target != NULL) { \ - u32 *p = &(ctx)->target[ctx->idx]; \ - UASM_i_##func(&p, ##__VA_ARGS__); \ - } \ - (ctx)->idx++; \ -} while (0) - -/* Determine if immediate is within the 16-bit signed range */ -static inline bool is_range16(s32 imm) -{ - return !(imm >= SBIT(15) || imm < -SBIT(15)); -} - -static inline void emit_addu(unsigned int dst, unsigned int src1, - unsigned int src2, struct jit_ctx *ctx) -{ - emit_instr(ctx, addu, dst, src1, src2); -} - -static inline void emit_nop(struct jit_ctx *ctx) -{ - emit_instr(ctx, nop); -} - -/* Load a u32 immediate to a register */ -static inline void emit_load_imm(unsigned int dst, u32 imm, struct jit_ctx *ctx) -{ - if (ctx->target != NULL) { - /* addiu can only handle s16 */ - if (!is_range16(imm)) { - u32 *p = &ctx->target[ctx->idx]; - uasm_i_lui(&p, r_tmp_imm, (s32)imm >> 16); - p = &ctx->target[ctx->idx + 1]; - uasm_i_ori(&p, dst, r_tmp_imm, imm & 0xffff); - } else { - u32 *p = &ctx->target[ctx->idx]; - uasm_i_addiu(&p, dst, r_zero, imm); - } - } - ctx->idx++; - - if (!is_range16(imm)) - ctx->idx++; -} - -static inline void emit_or(unsigned int dst, unsigned int src1, - unsigned int src2, struct jit_ctx *ctx) -{ - emit_instr(ctx, or, dst, src1, src2); -} - -static inline void emit_ori(unsigned int dst, unsigned src, u32 imm, - struct jit_ctx *ctx) -{ - if (imm >= BIT(16)) { - emit_load_imm(r_tmp, imm, ctx); - emit_or(dst, src, r_tmp, ctx); - } else { - emit_instr(ctx, ori, dst, src, imm); - } -} - -static inline void emit_daddiu(unsigned int dst, unsigned int src, - int imm, struct jit_ctx *ctx) -{ - /* - * Only used for stack, so the imm is relatively small - * and it fits in 15-bits - */ - emit_instr(ctx, daddiu, dst, src, imm); -} - -static inline void emit_addiu(unsigned int dst, unsigned int src, - u32 imm, struct jit_ctx *ctx) -{ - if (!is_range16(imm)) { - emit_load_imm(r_tmp, imm, ctx); - emit_addu(dst, r_tmp, src, ctx); - } else { - emit_instr(ctx, addiu, dst, src, imm); - } -} - -static inline void emit_and(unsigned int dst, unsigned int src1, - unsigned int src2, struct jit_ctx *ctx) -{ - emit_instr(ctx, and, dst, src1, src2); -} - -static inline void emit_andi(unsigned int dst, unsigned int src, - u32 imm, struct jit_ctx *ctx) -{ - /* If imm does not fit in u16 then load it to register */ - if (imm >= BIT(16)) { - emit_load_imm(r_tmp, imm, ctx); - emit_and(dst, src, r_tmp, ctx); - } else { - emit_instr(ctx, andi, dst, src, imm); - } -} - -static inline void emit_xor(unsigned int dst, unsigned int src1, - unsigned int src2, struct jit_ctx *ctx) -{ - emit_instr(ctx, xor, dst, src1, src2); -} - -static inline void emit_xori(ptr dst, ptr src, u32 imm, struct jit_ctx *ctx) -{ - /* If imm does not fit in u16 then load it to register */ - if (imm >= BIT(16)) { - emit_load_imm(r_tmp, imm, ctx); - emit_xor(dst, src, r_tmp, ctx); - } else { - emit_instr(ctx, xori, dst, src, imm); - } -} - -static inline void emit_stack_offset(int offset, struct jit_ctx *ctx) -{ - emit_long_instr(ctx, ADDIU, r_sp, r_sp, offset); -} - -static inline void emit_subu(unsigned int dst, unsigned int src1, - unsigned int src2, struct jit_ctx *ctx) -{ - emit_instr(ctx, subu, dst, src1, src2); -} - -static inline void emit_neg(unsigned int reg, struct jit_ctx *ctx) -{ - emit_subu(reg, r_zero, reg, ctx); -} - -static inline void emit_sllv(unsigned int dst, unsigned int src, - unsigned int sa, struct jit_ctx *ctx) -{ - emit_instr(ctx, sllv, dst, src, sa); -} - -static inline void emit_sll(unsigned int dst, unsigned int src, - unsigned int sa, 
struct jit_ctx *ctx) -{ - /* sa is 5-bits long */ - if (sa >= BIT(5)) - /* Shifting >= 32 results in zero */ - emit_jit_reg_move(dst, r_zero, ctx); - else - emit_instr(ctx, sll, dst, src, sa); -} - -static inline void emit_srlv(unsigned int dst, unsigned int src, - unsigned int sa, struct jit_ctx *ctx) -{ - emit_instr(ctx, srlv, dst, src, sa); -} - -static inline void emit_srl(unsigned int dst, unsigned int src, - unsigned int sa, struct jit_ctx *ctx) -{ - /* sa is 5-bits long */ - if (sa >= BIT(5)) - /* Shifting >= 32 results in zero */ - emit_jit_reg_move(dst, r_zero, ctx); - else - emit_instr(ctx, srl, dst, src, sa); -} - -static inline void emit_slt(unsigned int dst, unsigned int src1, - unsigned int src2, struct jit_ctx *ctx) -{ - emit_instr(ctx, slt, dst, src1, src2); -} - -static inline void emit_sltu(unsigned int dst, unsigned int src1, - unsigned int src2, struct jit_ctx *ctx) -{ - emit_instr(ctx, sltu, dst, src1, src2); -} - -static inline void emit_sltiu(unsigned dst, unsigned int src, - unsigned int imm, struct jit_ctx *ctx) -{ - /* 16 bit immediate */ - if (!is_range16((s32)imm)) { - emit_load_imm(r_tmp, imm, ctx); - emit_sltu(dst, src, r_tmp, ctx); - } else { - emit_instr(ctx, sltiu, dst, src, imm); - } - -} - -/* Store register on the stack */ -static inline void emit_store_stack_reg(ptr reg, ptr base, - unsigned int offset, - struct jit_ctx *ctx) -{ - emit_long_instr(ctx, SW, reg, offset, base); -} - -static inline void emit_store(ptr reg, ptr base, unsigned int offset, - struct jit_ctx *ctx) -{ - emit_instr(ctx, sw, reg, offset, base); -} - -static inline void emit_load_stack_reg(ptr reg, ptr base, - unsigned int offset, - struct jit_ctx *ctx) -{ - emit_long_instr(ctx, LW, reg, offset, base); -} - -static inline void emit_load(unsigned int reg, unsigned int base, - unsigned int offset, struct jit_ctx *ctx) -{ - emit_instr(ctx, lw, reg, offset, base); -} - -static inline void emit_load_byte(unsigned int reg, unsigned int base, - unsigned int offset, struct jit_ctx *ctx) -{ - emit_instr(ctx, lb, reg, offset, base); -} - -static inline void emit_half_load(unsigned int reg, unsigned int base, - unsigned int offset, struct jit_ctx *ctx) -{ - emit_instr(ctx, lh, reg, offset, base); -} - -static inline void emit_half_load_unsigned(unsigned int reg, unsigned int base, - unsigned int offset, struct jit_ctx *ctx) -{ - emit_instr(ctx, lhu, reg, offset, base); -} - -static inline void emit_mul(unsigned int dst, unsigned int src1, - unsigned int src2, struct jit_ctx *ctx) -{ - emit_instr(ctx, mul, dst, src1, src2); -} - -static inline void emit_div(unsigned int dst, unsigned int src, - struct jit_ctx *ctx) -{ - if (ctx->target != NULL) { - u32 *p = &ctx->target[ctx->idx]; - uasm_i_divu(&p, dst, src); - p = &ctx->target[ctx->idx + 1]; - uasm_i_mflo(&p, dst); - } - ctx->idx += 2; /* 2 insts */ -} - -static inline void emit_mod(unsigned int dst, unsigned int src, - struct jit_ctx *ctx) -{ - if (ctx->target != NULL) { - u32 *p = &ctx->target[ctx->idx]; - uasm_i_divu(&p, dst, src); - p = &ctx->target[ctx->idx + 1]; - uasm_i_mfhi(&p, dst); - } - ctx->idx += 2; /* 2 insts */ -} - -static inline void emit_dsll(unsigned int dst, unsigned int src, - unsigned int sa, struct jit_ctx *ctx) -{ - emit_instr(ctx, dsll, dst, src, sa); -} - -static inline void emit_dsrl32(unsigned int dst, unsigned int src, - unsigned int sa, struct jit_ctx *ctx) -{ - emit_instr(ctx, dsrl32, dst, src, sa); -} - -static inline void emit_wsbh(unsigned int dst, unsigned int src, - struct jit_ctx *ctx) -{ - emit_instr(ctx, 
wsbh, dst, src); -} - -/* load pointer to register */ -static inline void emit_load_ptr(unsigned int dst, unsigned int src, - int imm, struct jit_ctx *ctx) -{ - /* src contains the base addr of the 32/64-pointer */ - emit_long_instr(ctx, LW, dst, imm, src); -} - -/* load a function pointer to register */ -static inline void emit_load_func(unsigned int reg, ptr imm, - struct jit_ctx *ctx) -{ - if (IS_ENABLED(CONFIG_64BIT)) { - /* At this point imm is always 64-bit */ - emit_load_imm(r_tmp, (u64)imm >> 32, ctx); - emit_dsll(r_tmp_imm, r_tmp, 16, ctx); /* left shift by 16 */ - emit_ori(r_tmp, r_tmp_imm, (imm >> 16) & 0xffff, ctx); - emit_dsll(r_tmp_imm, r_tmp, 16, ctx); /* left shift by 16 */ - emit_ori(reg, r_tmp_imm, imm & 0xffff, ctx); - } else { - emit_load_imm(reg, imm, ctx); - } -} - -/* Move to real MIPS register */ -static inline void emit_reg_move(ptr dst, ptr src, struct jit_ctx *ctx) -{ - emit_long_instr(ctx, ADDU, dst, src, r_zero); -} - -/* Move to JIT (32-bit) register */ -static inline void emit_jit_reg_move(ptr dst, ptr src, struct jit_ctx *ctx) -{ - emit_addu(dst, src, r_zero, ctx); -} - -/* Compute the immediate value for PC-relative branches. */ -static inline u32 b_imm(unsigned int tgt, struct jit_ctx *ctx) -{ - if (ctx->target == NULL) - return 0; - - /* - * We want a pc-relative branch. We only do forward branches - * so tgt is always after pc. tgt is the instruction offset - * we want to jump to. - - * Branch on MIPS: - * I: target_offset <- sign_extend(offset) - * I+1: PC += target_offset (delay slot) - * - * ctx->idx currently points to the branch instruction - * but the offset is added to the delay slot so we need - * to subtract 4. - */ - return ctx->offsets[tgt] - - (ctx->idx * 4 - ctx->prologue_bytes) - 4; -} - -static inline void emit_bcond(int cond, unsigned int reg1, unsigned int reg2, - unsigned int imm, struct jit_ctx *ctx) -{ - if (ctx->target != NULL) { - u32 *p = &ctx->target[ctx->idx]; - - switch (cond) { - case MIPS_COND_EQ: - uasm_i_beq(&p, reg1, reg2, imm); - break; - case MIPS_COND_NE: - uasm_i_bne(&p, reg1, reg2, imm); - break; - case MIPS_COND_ALL: - uasm_i_b(&p, imm); - break; - default: - pr_warn("%s: Unhandled branch conditional: %d\n", - __func__, cond); - } - } - ctx->idx++; -} - -static inline void emit_b(unsigned int imm, struct jit_ctx *ctx) -{ - emit_bcond(MIPS_COND_ALL, r_zero, r_zero, imm, ctx); -} - -static inline void emit_jalr(unsigned int link, unsigned int reg, - struct jit_ctx *ctx) -{ - emit_instr(ctx, jalr, link, reg); -} - -static inline void emit_jr(unsigned int reg, struct jit_ctx *ctx) -{ - emit_instr(ctx, jr, reg); -} - -static inline u16 align_sp(unsigned int num) -{ - /* Double word alignment for 32-bit, quadword for 64-bit */ - unsigned int align = IS_ENABLED(CONFIG_64BIT) ? 
16 : 8; - num = (num + (align - 1)) & -align; - return num; -} - -static void save_bpf_jit_regs(struct jit_ctx *ctx, unsigned offset) -{ - int i = 0, real_off = 0; - u32 sflags, tmp_flags; - - /* Adjust the stack pointer */ - if (offset) - emit_stack_offset(-align_sp(offset), ctx); - - tmp_flags = sflags = ctx->flags >> SEEN_SREG_SFT; - /* sflags is essentially a bitmap */ - while (tmp_flags) { - if ((sflags >> i) & 0x1) { - emit_store_stack_reg(MIPS_R_S0 + i, r_sp, real_off, - ctx); - real_off += SZREG; - } - i++; - tmp_flags >>= 1; - } - - /* save return address */ - if (ctx->flags & SEEN_CALL) { - emit_store_stack_reg(r_ra, r_sp, real_off, ctx); - real_off += SZREG; - } - - /* Setup r_M leaving the alignment gap if necessary */ - if (ctx->flags & SEEN_MEM) { - if (real_off % (SZREG * 2)) - real_off += SZREG; - emit_long_instr(ctx, ADDIU, r_M, r_sp, real_off); - } -} - -static void restore_bpf_jit_regs(struct jit_ctx *ctx, - unsigned int offset) -{ - int i, real_off = 0; - u32 sflags, tmp_flags; - - tmp_flags = sflags = ctx->flags >> SEEN_SREG_SFT; - /* sflags is a bitmap */ - i = 0; - while (tmp_flags) { - if ((sflags >> i) & 0x1) { - emit_load_stack_reg(MIPS_R_S0 + i, r_sp, real_off, - ctx); - real_off += SZREG; - } - i++; - tmp_flags >>= 1; - } - - /* restore return address */ - if (ctx->flags & SEEN_CALL) - emit_load_stack_reg(r_ra, r_sp, real_off, ctx); - - /* Restore the sp and discard the scrach memory */ - if (offset) - emit_stack_offset(align_sp(offset), ctx); -} - -static unsigned int get_stack_depth(struct jit_ctx *ctx) -{ - int sp_off = 0; - - - /* How may s* regs do we need to preserved? */ - sp_off += hweight32(ctx->flags >> SEEN_SREG_SFT) * SZREG; - - if (ctx->flags & SEEN_MEM) - sp_off += 4 * BPF_MEMWORDS; /* BPF_MEMWORDS are 32-bit */ - - if (ctx->flags & SEEN_CALL) - sp_off += SZREG; /* Space for our ra register */ - - return sp_off; -} - -static void build_prologue(struct jit_ctx *ctx) -{ - int sp_off; - - /* Calculate the total offset for the stack pointer */ - sp_off = get_stack_depth(ctx); - save_bpf_jit_regs(ctx, sp_off); - - if (ctx->flags & SEEN_SKB) - emit_reg_move(r_skb, MIPS_R_A0, ctx); - - if (ctx->flags & SEEN_SKB_DATA) { - /* Load packet length */ - emit_load(r_skb_len, r_skb, offsetof(struct sk_buff, len), - ctx); - emit_load(r_tmp, r_skb, offsetof(struct sk_buff, data_len), - ctx); - /* Load the data pointer */ - emit_load_ptr(r_skb_data, r_skb, - offsetof(struct sk_buff, data), ctx); - /* Load the header length */ - emit_subu(r_skb_hl, r_skb_len, r_tmp, ctx); - } - - if (ctx->flags & SEEN_X) - emit_jit_reg_move(r_X, r_zero, ctx); - - /* - * Do not leak kernel data to userspace, we only need to clear - * r_A if it is ever used. In fact if it is never used, we - * will not save/restore it, so clearing it in this case would - * corrupt the state of the caller. - */ - if (bpf_needs_clear_a(&ctx->skf->insns[0]) && - (ctx->flags & SEEN_A)) - emit_jit_reg_move(r_A, r_zero, ctx); -} - -static void build_epilogue(struct jit_ctx *ctx) -{ - unsigned int sp_off; - - /* Calculate the total offset for the stack pointer */ - - sp_off = get_stack_depth(ctx); - restore_bpf_jit_regs(ctx, sp_off); - - /* Return */ - emit_jr(r_ra, ctx); - emit_nop(ctx); -} - -#define CHOOSE_LOAD_FUNC(K, func) \ - ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? 
func##_negative : func) : \ - func##_positive) - -static int build_body(struct jit_ctx *ctx) -{ - const struct bpf_prog *prog = ctx->skf; - const struct sock_filter *inst; - unsigned int i, off, condt; - u32 k, b_off __maybe_unused; - u8 (*sk_load_func)(unsigned long *skb, int offset); - - for (i = 0; i < prog->len; i++) { - u16 code; - - inst = &(prog->insns[i]); - pr_debug("%s: code->0x%02x, jt->0x%x, jf->0x%x, k->0x%x\n", - __func__, inst->code, inst->jt, inst->jf, inst->k); - k = inst->k; - code = bpf_anc_helper(inst); - - if (ctx->target == NULL) - ctx->offsets[i] = ctx->idx * 4; - - switch (code) { - case BPF_LD | BPF_IMM: - /* A <- k ==> li r_A, k */ - ctx->flags |= SEEN_A; - emit_load_imm(r_A, k, ctx); - break; - case BPF_LD | BPF_W | BPF_LEN: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - /* A <- len ==> lw r_A, offset(skb) */ - ctx->flags |= SEEN_SKB | SEEN_A; - off = offsetof(struct sk_buff, len); - emit_load(r_A, r_skb, off, ctx); - break; - case BPF_LD | BPF_MEM: - /* A <- M[k] ==> lw r_A, offset(M) */ - ctx->flags |= SEEN_MEM | SEEN_A; - emit_load(r_A, r_M, SCRATCH_OFF(k), ctx); - break; - case BPF_LD | BPF_W | BPF_ABS: - /* A <- P[k:4] */ - sk_load_func = CHOOSE_LOAD_FUNC(k, sk_load_word); - goto load; - case BPF_LD | BPF_H | BPF_ABS: - /* A <- P[k:2] */ - sk_load_func = CHOOSE_LOAD_FUNC(k, sk_load_half); - goto load; - case BPF_LD | BPF_B | BPF_ABS: - /* A <- P[k:1] */ - sk_load_func = CHOOSE_LOAD_FUNC(k, sk_load_byte); -load: - emit_load_imm(r_off, k, ctx); -load_common: - ctx->flags |= SEEN_CALL | SEEN_OFF | - SEEN_SKB | SEEN_A | SEEN_SKB_DATA; - - emit_load_func(r_s0, (ptr)sk_load_func, ctx); - emit_reg_move(MIPS_R_A0, r_skb, ctx); - emit_jalr(MIPS_R_RA, r_s0, ctx); - /* Load second argument to delay slot */ - emit_reg_move(MIPS_R_A1, r_off, ctx); - /* Check the error value */ - emit_bcond(MIPS_COND_EQ, r_ret, 0, b_imm(i + 1, ctx), - ctx); - /* Load return register on DS for failures */ - emit_reg_move(r_ret, r_zero, ctx); - /* Return with error */ - emit_b(b_imm(prog->len, ctx), ctx); - emit_nop(ctx); - break; - case BPF_LD | BPF_W | BPF_IND: - /* A <- P[X + k:4] */ - sk_load_func = sk_load_word; - goto load_ind; - case BPF_LD | BPF_H | BPF_IND: - /* A <- P[X + k:2] */ - sk_load_func = sk_load_half; - goto load_ind; - case BPF_LD | BPF_B | BPF_IND: - /* A <- P[X + k:1] */ - sk_load_func = sk_load_byte; -load_ind: - ctx->flags |= SEEN_OFF | SEEN_X; - emit_addiu(r_off, r_X, k, ctx); - goto load_common; - case BPF_LDX | BPF_IMM: - /* X <- k */ - ctx->flags |= SEEN_X; - emit_load_imm(r_X, k, ctx); - break; - case BPF_LDX | BPF_MEM: - /* X <- M[k] */ - ctx->flags |= SEEN_X | SEEN_MEM; - emit_load(r_X, r_M, SCRATCH_OFF(k), ctx); - break; - case BPF_LDX | BPF_W | BPF_LEN: - /* X <- len */ - ctx->flags |= SEEN_X | SEEN_SKB; - off = offsetof(struct sk_buff, len); - emit_load(r_X, r_skb, off, ctx); - break; - case BPF_LDX | BPF_B | BPF_MSH: - /* X <- 4 * (P[k:1] & 0xf) */ - ctx->flags |= SEEN_X | SEEN_CALL | SEEN_SKB; - /* Load offset to a1 */ - emit_load_func(r_s0, (ptr)sk_load_byte, ctx); - /* - * This may emit two instructions so it may not fit - * in the delay slot. So use a0 in the delay slot. 
- */ - emit_load_imm(MIPS_R_A1, k, ctx); - emit_jalr(MIPS_R_RA, r_s0, ctx); - emit_reg_move(MIPS_R_A0, r_skb, ctx); /* delay slot */ - /* Check the error value */ - emit_bcond(MIPS_COND_NE, r_ret, 0, - b_imm(prog->len, ctx), ctx); - emit_reg_move(r_ret, r_zero, ctx); - /* We are good */ - /* X <- P[1:K] & 0xf */ - emit_andi(r_X, r_A, 0xf, ctx); - /* X << 2 */ - emit_b(b_imm(i + 1, ctx), ctx); - emit_sll(r_X, r_X, 2, ctx); /* delay slot */ - break; - case BPF_ST: - /* M[k] <- A */ - ctx->flags |= SEEN_MEM | SEEN_A; - emit_store(r_A, r_M, SCRATCH_OFF(k), ctx); - break; - case BPF_STX: - /* M[k] <- X */ - ctx->flags |= SEEN_MEM | SEEN_X; - emit_store(r_X, r_M, SCRATCH_OFF(k), ctx); - break; - case BPF_ALU | BPF_ADD | BPF_K: - /* A += K */ - ctx->flags |= SEEN_A; - emit_addiu(r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_ADD | BPF_X: - /* A += X */ - ctx->flags |= SEEN_A | SEEN_X; - emit_addu(r_A, r_A, r_X, ctx); - break; - case BPF_ALU | BPF_SUB | BPF_K: - /* A -= K */ - ctx->flags |= SEEN_A; - emit_addiu(r_A, r_A, -k, ctx); - break; - case BPF_ALU | BPF_SUB | BPF_X: - /* A -= X */ - ctx->flags |= SEEN_A | SEEN_X; - emit_subu(r_A, r_A, r_X, ctx); - break; - case BPF_ALU | BPF_MUL | BPF_K: - /* A *= K */ - /* Load K to scratch register before MUL */ - ctx->flags |= SEEN_A; - emit_load_imm(r_s0, k, ctx); - emit_mul(r_A, r_A, r_s0, ctx); - break; - case BPF_ALU | BPF_MUL | BPF_X: - /* A *= X */ - ctx->flags |= SEEN_A | SEEN_X; - emit_mul(r_A, r_A, r_X, ctx); - break; - case BPF_ALU | BPF_DIV | BPF_K: - /* A /= k */ - if (k == 1) - break; - if (optimize_div(&k)) { - ctx->flags |= SEEN_A; - emit_srl(r_A, r_A, k, ctx); - break; - } - ctx->flags |= SEEN_A; - emit_load_imm(r_s0, k, ctx); - emit_div(r_A, r_s0, ctx); - break; - case BPF_ALU | BPF_MOD | BPF_K: - /* A %= k */ - if (k == 1) { - ctx->flags |= SEEN_A; - emit_jit_reg_move(r_A, r_zero, ctx); - } else { - ctx->flags |= SEEN_A; - emit_load_imm(r_s0, k, ctx); - emit_mod(r_A, r_s0, ctx); - } - break; - case BPF_ALU | BPF_DIV | BPF_X: - /* A /= X */ - ctx->flags |= SEEN_X | SEEN_A; - /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); - emit_load_imm(r_ret, 0, ctx); /* delay slot */ - emit_div(r_A, r_X, ctx); - break; - case BPF_ALU | BPF_MOD | BPF_X: - /* A %= X */ - ctx->flags |= SEEN_X | SEEN_A; - /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); - emit_load_imm(r_ret, 0, ctx); /* delay slot */ - emit_mod(r_A, r_X, ctx); - break; - case BPF_ALU | BPF_OR | BPF_K: - /* A |= K */ - ctx->flags |= SEEN_A; - emit_ori(r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_OR | BPF_X: - /* A |= X */ - ctx->flags |= SEEN_A; - emit_ori(r_A, r_A, r_X, ctx); - break; - case BPF_ALU | BPF_XOR | BPF_K: - /* A ^= k */ - ctx->flags |= SEEN_A; - emit_xori(r_A, r_A, k, ctx); - break; - case BPF_ANC | SKF_AD_ALU_XOR_X: - case BPF_ALU | BPF_XOR | BPF_X: - /* A ^= X */ - ctx->flags |= SEEN_A; - emit_xor(r_A, r_A, r_X, ctx); - break; - case BPF_ALU | BPF_AND | BPF_K: - /* A &= K */ - ctx->flags |= SEEN_A; - emit_andi(r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_AND | BPF_X: - /* A &= X */ - ctx->flags |= SEEN_A | SEEN_X; - emit_and(r_A, r_A, r_X, ctx); - break; - case BPF_ALU | BPF_LSH | BPF_K: - /* A <<= K */ - ctx->flags |= SEEN_A; - emit_sll(r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_LSH | BPF_X: - /* A <<= X */ - ctx->flags |= SEEN_A | SEEN_X; - emit_sllv(r_A, r_A, r_X, ctx); - break; - case BPF_ALU | BPF_RSH | BPF_K: - /* A >>= K */ - ctx->flags |= SEEN_A; - 
emit_srl(r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_RSH | BPF_X: - ctx->flags |= SEEN_A | SEEN_X; - emit_srlv(r_A, r_A, r_X, ctx); - break; - case BPF_ALU | BPF_NEG: - /* A = -A */ - ctx->flags |= SEEN_A; - emit_neg(r_A, ctx); - break; - case BPF_JMP | BPF_JA: - /* pc += K */ - emit_b(b_imm(i + k + 1, ctx), ctx); - emit_nop(ctx); - break; - case BPF_JMP | BPF_JEQ | BPF_K: - /* pc += ( A == K ) ? pc->jt : pc->jf */ - condt = MIPS_COND_EQ | MIPS_COND_K; - goto jmp_cmp; - case BPF_JMP | BPF_JEQ | BPF_X: - ctx->flags |= SEEN_X; - /* pc += ( A == X ) ? pc->jt : pc->jf */ - condt = MIPS_COND_EQ | MIPS_COND_X; - goto jmp_cmp; - case BPF_JMP | BPF_JGE | BPF_K: - /* pc += ( A >= K ) ? pc->jt : pc->jf */ - condt = MIPS_COND_GE | MIPS_COND_K; - goto jmp_cmp; - case BPF_JMP | BPF_JGE | BPF_X: - ctx->flags |= SEEN_X; - /* pc += ( A >= X ) ? pc->jt : pc->jf */ - condt = MIPS_COND_GE | MIPS_COND_X; - goto jmp_cmp; - case BPF_JMP | BPF_JGT | BPF_K: - /* pc += ( A > K ) ? pc->jt : pc->jf */ - condt = MIPS_COND_GT | MIPS_COND_K; - goto jmp_cmp; - case BPF_JMP | BPF_JGT | BPF_X: - ctx->flags |= SEEN_X; - /* pc += ( A > X ) ? pc->jt : pc->jf */ - condt = MIPS_COND_GT | MIPS_COND_X; -jmp_cmp: - /* Greater or Equal */ - if ((condt & MIPS_COND_GE) || - (condt & MIPS_COND_GT)) { - if (condt & MIPS_COND_K) { /* K */ - ctx->flags |= SEEN_A; - emit_sltiu(r_s0, r_A, k, ctx); - } else { /* X */ - ctx->flags |= SEEN_A | - SEEN_X; - emit_sltu(r_s0, r_A, r_X, ctx); - } - /* A < (K|X) ? r_scrach = 1 */ - b_off = b_imm(i + inst->jf + 1, ctx); - emit_bcond(MIPS_COND_NE, r_s0, r_zero, b_off, - ctx); - emit_nop(ctx); - /* A > (K|X) ? scratch = 0 */ - if (condt & MIPS_COND_GT) { - /* Checking for equality */ - ctx->flags |= SEEN_A | SEEN_X; - if (condt & MIPS_COND_K) - emit_load_imm(r_s0, k, ctx); - else - emit_jit_reg_move(r_s0, r_X, - ctx); - b_off = b_imm(i + inst->jf + 1, ctx); - emit_bcond(MIPS_COND_EQ, r_A, r_s0, - b_off, ctx); - emit_nop(ctx); - /* Finally, A > K|X */ - b_off = b_imm(i + inst->jt + 1, ctx); - emit_b(b_off, ctx); - emit_nop(ctx); - } else { - /* A >= (K|X) so jump */ - b_off = b_imm(i + inst->jt + 1, ctx); - emit_b(b_off, ctx); - emit_nop(ctx); - } - } else { - /* A == K|X */ - if (condt & MIPS_COND_K) { /* K */ - ctx->flags |= SEEN_A; - emit_load_imm(r_s0, k, ctx); - /* jump true */ - b_off = b_imm(i + inst->jt + 1, ctx); - emit_bcond(MIPS_COND_EQ, r_A, r_s0, - b_off, ctx); - emit_nop(ctx); - /* jump false */ - b_off = b_imm(i + inst->jf + 1, - ctx); - emit_bcond(MIPS_COND_NE, r_A, r_s0, - b_off, ctx); - emit_nop(ctx); - } else { /* X */ - /* jump true */ - ctx->flags |= SEEN_A | SEEN_X; - b_off = b_imm(i + inst->jt + 1, - ctx); - emit_bcond(MIPS_COND_EQ, r_A, r_X, - b_off, ctx); - emit_nop(ctx); - /* jump false */ - b_off = b_imm(i + inst->jf + 1, ctx); - emit_bcond(MIPS_COND_NE, r_A, r_X, - b_off, ctx); - emit_nop(ctx); - } - } - break; - case BPF_JMP | BPF_JSET | BPF_K: - ctx->flags |= SEEN_A; - /* pc += (A & K) ? pc -> jt : pc -> jf */ - emit_load_imm(r_s1, k, ctx); - emit_and(r_s0, r_A, r_s1, ctx); - /* jump true */ - b_off = b_imm(i + inst->jt + 1, ctx); - emit_bcond(MIPS_COND_NE, r_s0, r_zero, b_off, ctx); - emit_nop(ctx); - /* jump false */ - b_off = b_imm(i + inst->jf + 1, ctx); - emit_b(b_off, ctx); - emit_nop(ctx); - break; - case BPF_JMP | BPF_JSET | BPF_X: - ctx->flags |= SEEN_X | SEEN_A; - /* pc += (A & X) ? 
pc -> jt : pc -> jf */ - emit_and(r_s0, r_A, r_X, ctx); - /* jump true */ - b_off = b_imm(i + inst->jt + 1, ctx); - emit_bcond(MIPS_COND_NE, r_s0, r_zero, b_off, ctx); - emit_nop(ctx); - /* jump false */ - b_off = b_imm(i + inst->jf + 1, ctx); - emit_b(b_off, ctx); - emit_nop(ctx); - break; - case BPF_RET | BPF_A: - ctx->flags |= SEEN_A; - if (i != prog->len - 1) - /* - * If this is not the last instruction - * then jump to the epilogue - */ - emit_b(b_imm(prog->len, ctx), ctx); - emit_reg_move(r_ret, r_A, ctx); /* delay slot */ - break; - case BPF_RET | BPF_K: - /* - * It can emit two instructions so it does not fit on - * the delay slot. - */ - emit_load_imm(r_ret, k, ctx); - if (i != prog->len - 1) { - /* - * If this is not the last instruction - * then jump to the epilogue - */ - emit_b(b_imm(prog->len, ctx), ctx); - emit_nop(ctx); - } - break; - case BPF_MISC | BPF_TAX: - /* X = A */ - ctx->flags |= SEEN_X | SEEN_A; - emit_jit_reg_move(r_X, r_A, ctx); - break; - case BPF_MISC | BPF_TXA: - /* A = X */ - ctx->flags |= SEEN_A | SEEN_X; - emit_jit_reg_move(r_A, r_X, ctx); - break; - /* AUX */ - case BPF_ANC | SKF_AD_PROTOCOL: - /* A = ntohs(skb->protocol */ - ctx->flags |= SEEN_SKB | SEEN_OFF | SEEN_A; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - protocol) != 2); - off = offsetof(struct sk_buff, protocol); - emit_half_load(r_A, r_skb, off, ctx); -#ifdef CONFIG_CPU_LITTLE_ENDIAN - /* This needs little endian fixup */ - if (cpu_has_wsbh) { - /* R2 and later have the wsbh instruction */ - emit_wsbh(r_A, r_A, ctx); - } else { - /* Get first byte */ - emit_andi(r_tmp_imm, r_A, 0xff, ctx); - /* Shift it */ - emit_sll(r_tmp, r_tmp_imm, 8, ctx); - /* Get second byte */ - emit_srl(r_tmp_imm, r_A, 8, ctx); - emit_andi(r_tmp_imm, r_tmp_imm, 0xff, ctx); - /* Put everyting together in r_A */ - emit_or(r_A, r_tmp, r_tmp_imm, ctx); - } -#endif - break; - case BPF_ANC | SKF_AD_CPU: - ctx->flags |= SEEN_A | SEEN_OFF; - /* A = current_thread_info()->cpu */ - BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, - cpu) != 4); - off = offsetof(struct thread_info, cpu); - /* $28/gp points to the thread_info struct */ - emit_load(r_A, 28, off, ctx); - break; - case BPF_ANC | SKF_AD_IFINDEX: - /* A = skb->dev->ifindex */ - case BPF_ANC | SKF_AD_HATYPE: - /* A = skb->dev->type */ - ctx->flags |= SEEN_SKB | SEEN_A; - off = offsetof(struct sk_buff, dev); - /* Load *dev pointer */ - emit_load_ptr(r_s0, r_skb, off, ctx); - /* error (0) in the delay slot */ - emit_bcond(MIPS_COND_EQ, r_s0, r_zero, - b_imm(prog->len, ctx), ctx); - emit_reg_move(r_ret, r_zero, ctx); - if (code == (BPF_ANC | SKF_AD_IFINDEX)) { - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - off = offsetof(struct net_device, ifindex); - emit_load(r_A, r_s0, off, ctx); - } else { /* (code == (BPF_ANC | SKF_AD_HATYPE) */ - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - off = offsetof(struct net_device, type); - emit_half_load_unsigned(r_A, r_s0, off, ctx); - } - break; - case BPF_ANC | SKF_AD_MARK: - ctx->flags |= SEEN_SKB | SEEN_A; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - off = offsetof(struct sk_buff, mark); - emit_load(r_A, r_skb, off, ctx); - break; - case BPF_ANC | SKF_AD_RXHASH: - ctx->flags |= SEEN_SKB | SEEN_A; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - off = offsetof(struct sk_buff, hash); - emit_load(r_A, r_skb, off, ctx); - break; - case BPF_ANC | SKF_AD_VLAN_TAG: - ctx->flags |= SEEN_SKB | SEEN_A; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - vlan_tci) != 2); - off = offsetof(struct 
sk_buff, vlan_tci); - emit_half_load_unsigned(r_A, r_skb, off, ctx); - break; - case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: - ctx->flags |= SEEN_SKB | SEEN_A; - emit_load_byte(r_A, r_skb, PKT_VLAN_PRESENT_OFFSET(), ctx); - if (PKT_VLAN_PRESENT_BIT) - emit_srl(r_A, r_A, PKT_VLAN_PRESENT_BIT, ctx); - if (PKT_VLAN_PRESENT_BIT < 7) - emit_andi(r_A, r_A, 1, ctx); - break; - case BPF_ANC | SKF_AD_PKTTYPE: - ctx->flags |= SEEN_SKB; - - emit_load_byte(r_tmp, r_skb, PKT_TYPE_OFFSET(), ctx); - /* Keep only the last 3 bits */ - emit_andi(r_A, r_tmp, PKT_TYPE_MAX, ctx); -#ifdef __BIG_ENDIAN_BITFIELD - /* Get the actual packet type to the lower 3 bits */ - emit_srl(r_A, r_A, 5, ctx); -#endif - break; - case BPF_ANC | SKF_AD_QUEUE: - ctx->flags |= SEEN_SKB | SEEN_A; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - queue_mapping) != 2); - BUILD_BUG_ON(offsetof(struct sk_buff, - queue_mapping) > 0xff); - off = offsetof(struct sk_buff, queue_mapping); - emit_half_load_unsigned(r_A, r_skb, off, ctx); - break; - default: - pr_debug("%s: Unhandled opcode: 0x%02x\n", __FILE__, - inst->code); - return -1; - } - } - - /* compute offsets only during the first pass */ - if (ctx->target == NULL) - ctx->offsets[i] = ctx->idx * 4; - - return 0; -} - -void bpf_jit_compile(struct bpf_prog *fp) -{ - struct jit_ctx ctx; - unsigned int alloc_size, tmp_idx; - - if (!bpf_jit_enable) - return; - - memset(&ctx, 0, sizeof(ctx)); - - ctx.offsets = kcalloc(fp->len + 1, sizeof(*ctx.offsets), GFP_KERNEL); - if (ctx.offsets == NULL) - return; - - ctx.skf = fp; - - if (build_body(&ctx)) - goto out; - - tmp_idx = ctx.idx; - build_prologue(&ctx); - ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4; - /* just to complete the ctx.idx count */ - build_epilogue(&ctx); - - alloc_size = 4 * ctx.idx; - ctx.target = module_alloc(alloc_size); - if (ctx.target == NULL) - goto out; - - /* Clean it */ - memset(ctx.target, 0, alloc_size); - - ctx.idx = 0; - - /* Generate the actual JIT code */ - build_prologue(&ctx); - build_body(&ctx); - build_epilogue(&ctx); - - /* Update the icache */ - flush_icache_range((ptr)ctx.target, (ptr)(ctx.target + ctx.idx)); - - if (bpf_jit_enable > 1) - /* Dump JIT code */ - bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); - - fp->bpf_func = (void *)ctx.target; - fp->jited = 1; - -out: - kfree(ctx.offsets); -} - -void bpf_jit_free(struct bpf_prog *fp) -{ - if (fp->jited) - module_memfree(fp->bpf_func); - - bpf_prog_unlock_free(fp); -} diff --git a/arch/mips/net/bpf_jit_asm.S b/arch/mips/net/bpf_jit_asm.S deleted file mode 100644 index 57154c5883b6..000000000000 --- a/arch/mips/net/bpf_jit_asm.S +++ /dev/null @@ -1,285 +0,0 @@ -/* - * bpf_jib_asm.S: Packet/header access helper functions for MIPS/MIPS64 BPF - * compiler. - * - * Copyright (C) 2015 Imagination Technologies Ltd. - * Author: Markos Chandras - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; version 2 of the License. - */ - -#include -#include -#include -#include "bpf_jit.h" - -/* ABI - * - * r_skb_hl skb header length - * r_skb_data skb data - * r_off(a1) offset register - * r_A BPF register A - * r_X PF register X - * r_skb(a0) *skb - * r_M *scratch memory - * r_skb_le skb length - * r_s0 Scratch register 0 - * r_s1 Scratch register 1 - * - * On entry: - * a0: *skb - * a1: offset (imm or imm + X) - * - * All non-BPF-ABI registers are free for use. On return, we only - * care about r_ret. 
The BPF-ABI registers are assumed to remain - * unmodified during the entire filter operation. - */ - -#define skb a0 -#define offset a1 -#define SKF_LL_OFF (-0x200000) /* Can't include linux/filter.h in assembly */ - - /* We know better :) so prevent assembler reordering etc */ - .set noreorder - -#define is_offset_negative(TYPE) \ - /* If offset is negative we have more work to do */ \ - slti t0, offset, 0; \ - bgtz t0, bpf_slow_path_##TYPE##_neg; \ - /* Be careful what follows in DS. */ - -#define is_offset_in_header(SIZE, TYPE) \ - /* Reading from header? */ \ - addiu $r_s0, $r_skb_hl, -SIZE; \ - slt t0, $r_s0, offset; \ - bgtz t0, bpf_slow_path_##TYPE; \ - -LEAF(sk_load_word) - is_offset_negative(word) -FEXPORT(sk_load_word_positive) - is_offset_in_header(4, word) - /* Offset within header boundaries */ - PTR_ADDU t1, $r_skb_data, offset - .set reorder - lw $r_A, 0(t1) - .set noreorder -#ifdef CONFIG_CPU_LITTLE_ENDIAN -# if MIPS_ISA_REV >= 2 - wsbh t0, $r_A - rotr $r_A, t0, 16 -# else - sll t0, $r_A, 24 - srl t1, $r_A, 24 - srl t2, $r_A, 8 - or t0, t0, t1 - andi t2, t2, 0xff00 - andi t1, $r_A, 0xff00 - or t0, t0, t2 - sll t1, t1, 8 - or $r_A, t0, t1 -# endif -#endif - jr $r_ra - move $r_ret, zero - END(sk_load_word) - -LEAF(sk_load_half) - is_offset_negative(half) -FEXPORT(sk_load_half_positive) - is_offset_in_header(2, half) - /* Offset within header boundaries */ - PTR_ADDU t1, $r_skb_data, offset - lhu $r_A, 0(t1) -#ifdef CONFIG_CPU_LITTLE_ENDIAN -# if MIPS_ISA_REV >= 2 - wsbh $r_A, $r_A -# else - sll t0, $r_A, 8 - srl t1, $r_A, 8 - andi t0, t0, 0xff00 - or $r_A, t0, t1 -# endif -#endif - jr $r_ra - move $r_ret, zero - END(sk_load_half) - -LEAF(sk_load_byte) - is_offset_negative(byte) -FEXPORT(sk_load_byte_positive) - is_offset_in_header(1, byte) - /* Offset within header boundaries */ - PTR_ADDU t1, $r_skb_data, offset - lbu $r_A, 0(t1) - jr $r_ra - move $r_ret, zero - END(sk_load_byte) - -/* - * call skb_copy_bits: - * (prototype in linux/skbuff.h) - * - * int skb_copy_bits(sk_buff *skb, int offset, void *to, int len) - * - * o32 mandates we leave 4 spaces for argument registers in case - * the callee needs to use them. Even though we don't care about - * the argument registers ourselves, we need to allocate that space - * to remain ABI compliant since the callee may want to use that space. - * We also allocate 2 more spaces for $r_ra and our return register (*to). - * - * n64 is a bit different. The *caller* will allocate the space to preserve - * the arguments. So in 64-bit kernels, we allocate the 4-arg space for no - * good reason but it does not matter that much really. - * - * (void *to) is returned in r_s0 - * - */ -#ifdef CONFIG_CPU_LITTLE_ENDIAN -#define DS_OFFSET(SIZE) (4 * SZREG) -#else -#define DS_OFFSET(SIZE) ((4 * SZREG) + (4 - SIZE)) -#endif -#define bpf_slow_path_common(SIZE) \ - /* Quick check. Are we within reasonable boundaries? */ \ - LONG_ADDIU $r_s1, $r_skb_len, -SIZE; \ - sltu $r_s0, offset, $r_s1; \ - beqz $r_s0, fault; \ - /* Load 4th argument in DS */ \ - LONG_ADDIU a3, zero, SIZE; \ - PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \ - PTR_LA t0, skb_copy_bits; \ - PTR_S $r_ra, (5 * SZREG)($r_sp); \ - /* Assign low slot to a2 */ \ - PTR_ADDIU a2, $r_sp, DS_OFFSET(SIZE); \ - jalr t0; \ - /* Reset our destination slot (DS but it's ok) */ \ - INT_S zero, (4 * SZREG)($r_sp); \ - /* \ - * skb_copy_bits returns 0 on success and -EFAULT \ - * on error. Our data live in a2. Do not bother with \ - * our data if an error has been returned. 
\ - */ \ - /* Restore our frame */ \ - PTR_L $r_ra, (5 * SZREG)($r_sp); \ - INT_L $r_s0, (4 * SZREG)($r_sp); \ - bltz v0, fault; \ - PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \ - move $r_ret, zero; \ - -NESTED(bpf_slow_path_word, (6 * SZREG), $r_sp) - bpf_slow_path_common(4) -#ifdef CONFIG_CPU_LITTLE_ENDIAN -# if MIPS_ISA_REV >= 2 - wsbh t0, $r_s0 - jr $r_ra - rotr $r_A, t0, 16 -# else - sll t0, $r_s0, 24 - srl t1, $r_s0, 24 - srl t2, $r_s0, 8 - or t0, t0, t1 - andi t2, t2, 0xff00 - andi t1, $r_s0, 0xff00 - or t0, t0, t2 - sll t1, t1, 8 - jr $r_ra - or $r_A, t0, t1 -# endif -#else - jr $r_ra - move $r_A, $r_s0 -#endif - - END(bpf_slow_path_word) - -NESTED(bpf_slow_path_half, (6 * SZREG), $r_sp) - bpf_slow_path_common(2) -#ifdef CONFIG_CPU_LITTLE_ENDIAN -# if MIPS_ISA_REV >= 2 - jr $r_ra - wsbh $r_A, $r_s0 -# else - sll t0, $r_s0, 8 - andi t1, $r_s0, 0xff00 - andi t0, t0, 0xff00 - srl t1, t1, 8 - jr $r_ra - or $r_A, t0, t1 -# endif -#else - jr $r_ra - move $r_A, $r_s0 -#endif - - END(bpf_slow_path_half) - -NESTED(bpf_slow_path_byte, (6 * SZREG), $r_sp) - bpf_slow_path_common(1) - jr $r_ra - move $r_A, $r_s0 - - END(bpf_slow_path_byte) - -/* - * Negative entry points - */ - .macro bpf_is_end_of_data - li t0, SKF_LL_OFF - /* Reading link layer data? */ - slt t1, offset, t0 - bgtz t1, fault - /* Be careful what follows in DS. */ - .endm -/* - * call skb_copy_bits: - * (prototype in linux/filter.h) - * - * void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, - * int k, unsigned int size) - * - * see above (bpf_slow_path_common) for ABI restrictions - */ -#define bpf_negative_common(SIZE) \ - PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \ - PTR_LA t0, bpf_internal_load_pointer_neg_helper; \ - PTR_S $r_ra, (5 * SZREG)($r_sp); \ - jalr t0; \ - li a2, SIZE; \ - PTR_L $r_ra, (5 * SZREG)($r_sp); \ - /* Check return pointer */ \ - beqz v0, fault; \ - PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \ - /* Preserve our pointer */ \ - move $r_s0, v0; \ - /* Set return value */ \ - move $r_ret, zero; \ - -bpf_slow_path_word_neg: - bpf_is_end_of_data -NESTED(sk_load_word_negative, (6 * SZREG), $r_sp) - bpf_negative_common(4) - jr $r_ra - lw $r_A, 0($r_s0) - END(sk_load_word_negative) - -bpf_slow_path_half_neg: - bpf_is_end_of_data -NESTED(sk_load_half_negative, (6 * SZREG), $r_sp) - bpf_negative_common(2) - jr $r_ra - lhu $r_A, 0($r_s0) - END(sk_load_half_negative) - -bpf_slow_path_byte_neg: - bpf_is_end_of_data -NESTED(sk_load_byte_negative, (6 * SZREG), $r_sp) - bpf_negative_common(1) - jr $r_ra - lbu $r_A, 0($r_s0) - END(sk_load_byte_negative) - -fault: - jr $r_ra - addiu $r_ret, zero, 1 diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 26eef9ad3896..3548a69c82f7 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -126,15 +126,21 @@ static enum reg_val_type get_reg_val_type(const struct jit_ctx *ctx, } /* Simply emit the instruction if the JIT memory space has been allocated */ -#define emit_instr(ctx, func, ...) \ -do { \ - if ((ctx)->target != NULL) { \ - u32 *p = &(ctx)->target[ctx->idx]; \ - uasm_i_##func(&p, ##__VA_ARGS__); \ - } \ - (ctx)->idx++; \ +#define emit_instr_long(ctx, func64, func32, ...) \ +do { \ + if ((ctx)->target != NULL) { \ + u32 *p = &(ctx)->target[ctx->idx]; \ + if (IS_ENABLED(CONFIG_64BIT)) \ + uasm_i_##func64(&p, ##__VA_ARGS__); \ + else \ + uasm_i_##func32(&p, ##__VA_ARGS__); \ + } \ + (ctx)->idx++; \ } while (0) +#define emit_instr(ctx, func, ...) 
\ + emit_instr_long(ctx, func, func, ##__VA_ARGS__) + static unsigned int j_target(struct jit_ctx *ctx, int target_idx) { unsigned long target_va, base_va; @@ -274,17 +280,17 @@ static int gen_int_prologue(struct jit_ctx *ctx) * If RA we are doing a function call and may need * extra 8-byte tmp area. */ - stack_adjust += 16; + stack_adjust += 2 * sizeof(long); if (ctx->flags & EBPF_SAVE_S0) - stack_adjust += 8; + stack_adjust += sizeof(long); if (ctx->flags & EBPF_SAVE_S1) - stack_adjust += 8; + stack_adjust += sizeof(long); if (ctx->flags & EBPF_SAVE_S2) - stack_adjust += 8; + stack_adjust += sizeof(long); if (ctx->flags & EBPF_SAVE_S3) - stack_adjust += 8; + stack_adjust += sizeof(long); if (ctx->flags & EBPF_SAVE_S4) - stack_adjust += 8; + stack_adjust += sizeof(long); BUILD_BUG_ON(MAX_BPF_STACK & 7); locals_size = (ctx->flags & EBPF_SEEN_FP) ? MAX_BPF_STACK : 0; @@ -298,41 +304,49 @@ static int gen_int_prologue(struct jit_ctx *ctx) * On tail call we skip this instruction, and the TCC is * passed in $v1 from the caller. */ - emit_instr(ctx, daddiu, MIPS_R_V1, MIPS_R_ZERO, MAX_TAIL_CALL_CNT); + emit_instr(ctx, addiu, MIPS_R_V1, MIPS_R_ZERO, MAX_TAIL_CALL_CNT); if (stack_adjust) - emit_instr(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, -stack_adjust); + emit_instr_long(ctx, daddiu, addiu, + MIPS_R_SP, MIPS_R_SP, -stack_adjust); else return 0; - store_offset = stack_adjust - 8; + store_offset = stack_adjust - sizeof(long); if (ctx->flags & EBPF_SAVE_RA) { - emit_instr(ctx, sd, MIPS_R_RA, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, sd, sw, + MIPS_R_RA, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S0) { - emit_instr(ctx, sd, MIPS_R_S0, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, sd, sw, + MIPS_R_S0, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S1) { - emit_instr(ctx, sd, MIPS_R_S1, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, sd, sw, + MIPS_R_S1, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S2) { - emit_instr(ctx, sd, MIPS_R_S2, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, sd, sw, + MIPS_R_S2, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S3) { - emit_instr(ctx, sd, MIPS_R_S3, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, sd, sw, + MIPS_R_S3, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S4) { - emit_instr(ctx, sd, MIPS_R_S4, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, sd, sw, + MIPS_R_S4, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if ((ctx->flags & EBPF_SEEN_TC) && !(ctx->flags & EBPF_TCC_IN_V1)) - emit_instr(ctx, daddu, MIPS_R_S4, MIPS_R_V1, MIPS_R_ZERO); + emit_instr_long(ctx, daddu, addu, + MIPS_R_S4, MIPS_R_V1, MIPS_R_ZERO); return 0; } @@ -341,7 +355,7 @@ static int build_int_epilogue(struct jit_ctx *ctx, int dest_reg) { const struct bpf_prog *prog = ctx->skf; int stack_adjust = ctx->stack_size; - int store_offset = stack_adjust - 8; + int store_offset = stack_adjust - sizeof(long); enum reg_val_type td; int r0 = MIPS_R_V0; @@ -353,33 +367,40 @@ static int build_int_epilogue(struct jit_ctx *ctx, int dest_reg) } if (ctx->flags & EBPF_SAVE_RA) { - emit_instr(ctx, ld, MIPS_R_RA, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, ld, lw, + MIPS_R_RA, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } 
if (ctx->flags & EBPF_SAVE_S0) { - emit_instr(ctx, ld, MIPS_R_S0, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, ld, lw, + MIPS_R_S0, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S1) { - emit_instr(ctx, ld, MIPS_R_S1, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, ld, lw, + MIPS_R_S1, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S2) { - emit_instr(ctx, ld, MIPS_R_S2, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, ld, lw, + MIPS_R_S2, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S3) { - emit_instr(ctx, ld, MIPS_R_S3, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, ld, lw, + MIPS_R_S3, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } if (ctx->flags & EBPF_SAVE_S4) { - emit_instr(ctx, ld, MIPS_R_S4, store_offset, MIPS_R_SP); - store_offset -= 8; + emit_instr_long(ctx, ld, lw, + MIPS_R_S4, store_offset, MIPS_R_SP); + store_offset -= sizeof(long); } emit_instr(ctx, jr, dest_reg); if (stack_adjust) - emit_instr(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, stack_adjust); + emit_instr_long(ctx, daddiu, addiu, + MIPS_R_SP, MIPS_R_SP, stack_adjust); else emit_instr(ctx, nop); @@ -646,6 +667,10 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, s64 t64s; int bpf_op = BPF_OP(insn->code); + if (IS_ENABLED(CONFIG_32BIT) && ((BPF_CLASS(insn->code) == BPF_ALU64) + || (bpf_op == BPF_DW))) + return -EINVAL; + switch (insn->code) { case BPF_ALU64 | BPF_ADD | BPF_K: /* ALU64_IMM */ case BPF_ALU64 | BPF_SUB | BPF_K: /* ALU64_IMM */ @@ -1283,7 +1308,7 @@ jeq_common: case BPF_JMP | BPF_CALL: ctx->flags |= EBPF_SAVE_RA; - t64s = (s64)insn->imm + (s64)__bpf_call_base; + t64s = (s64)insn->imm + (long)__bpf_call_base; emit_const_to_reg(ctx, MIPS_R_T9, (u64)t64s); emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9); /* delay slot */ -- cgit v1.2.3-59-g8ed1b From 371a415153be953520003ffe4a4eca3fecc0e8c2 Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Mon, 11 Mar 2019 16:54:27 +0100 Subject: arch: mips: Kconfig: pedantic formatting Formatting of Kconfig files doesn't look so pretty, so let the Great White Handkerchief come around and clean it up. 
Signed-off-by: Enrico Weigelt, metux IT consult Signed-off-by: Paul Burton Cc: linux-kernel@vger.kernel.org Cc: hauke@hauke-m.de Cc: zajec5@gmail.com Cc: f.fainelli@gmail.com Cc: bcm-kernel-feedback-list@broadcom.com Cc: linux-mips@vger.kernel.org --- arch/mips/Kconfig | 61 ++++++++++++++++++++-------------------- arch/mips/bcm47xx/Kconfig | 8 +++--- arch/mips/bcm63xx/boards/Kconfig | 2 +- arch/mips/pic32/Kconfig | 8 +++--- 4 files changed, 39 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cbf16db3366c..fb8a39d53168 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -275,7 +275,7 @@ config BCM47XX select BCM47XX_SPROM select BCM47XX_SSB if !BCM47XX_BCMA help - Support for BCM47XX based boards + Support for BCM47XX based boards config BCM63XX bool "Broadcom BCM63XX based boards" @@ -294,7 +294,7 @@ config BCM63XX select MIPS_L1_CACHE_SHIFT_4 select CLKDEV_LOOKUP help - Support for BCM63XX based boards + Support for BCM63XX based boards config MIPS_COBALT bool "Cobalt Server" @@ -373,10 +373,10 @@ config MACH_JAZZ select SYS_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_100HZ help - This a family of machines based on the MIPS R4030 chipset which was - used by several vendors to build RISC/os and Windows NT workstations. - Members include the Acer PICA, MIPS Magnum 4000, MIPS Millennium and - Olivetti M700-10 workstations. + This a family of machines based on the MIPS R4030 chipset which was + used by several vendors to build RISC/os and Windows NT workstations. + Members include the Acer PICA, MIPS Magnum 4000, MIPS Millennium and + Olivetti M700-10 workstations. config MACH_INGENIC bool "Ingenic SoC based machines" @@ -572,14 +572,14 @@ config NXP_STB220 bool "NXP STB220 board" select SOC_PNX833X help - Support for NXP Semiconductors STB220 Development Board. + Support for NXP Semiconductors STB220 Development Board. config NXP_STB225 bool "NXP 225 board" select SOC_PNX833X select SOC_PNX8335 help - Support for NXP Semiconductors STB225 Development Board. + Support for NXP Semiconductors STB225 Development Board. config PMC_MSP bool "PMC-Sierra MSP chipsets" @@ -721,9 +721,9 @@ config SGI_IP28 select SYS_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN select MIPS_L1_CACHE_SHIFT_7 - help - This is the SGI Indigo2 with R10000 processor. To compile a Linux - kernel that runs on these, say Y here. + help + This is the SGI Indigo2 with R10000 processor. To compile a Linux + kernel that runs on these, say Y here. config SGI_IP32 bool "SGI IP32 (O2)" @@ -1174,9 +1174,9 @@ config HOLES_IN_ZONE config SYS_SUPPORTS_RELOCATABLE bool help - Selected if the platform supports relocating the kernel. - The platform must provide plat_get_fdt() if it selects CONFIG_USE_OF - to allow access to command line and entropy sources. + Selected if the platform supports relocating the kernel. + The platform must provide plat_get_fdt() if it selects CONFIG_USE_OF + to allow access to command line and entropy sources. config MIPS_CBPF_JIT def_bool y @@ -2119,8 +2119,8 @@ config MIPS_PGD_C0_CONTEXT # Set to y for ptrace access to watch registers. # config HARDWARE_WATCHPOINTS - bool - default y if CPU_MIPSR1 || CPU_MIPSR2 || CPU_MIPSR6 + bool + default y if CPU_MIPSR1 || CPU_MIPSR2 || CPU_MIPSR6 menu "Kernel type" @@ -2184,10 +2184,10 @@ config PAGE_SIZE_4KB bool "4kB" depends on !CPU_LOONGSON2 && !CPU_LOONGSON3 help - This option select the standard 4kB Linux page size. On some - R3000-family processors this is the only available page size. 
Using - 4kB page size will minimize memory consumption and is therefore - recommended for low memory systems. + This option select the standard 4kB Linux page size. On some + R3000-family processors this is the only available page size. Using + 4kB page size will minimize memory consumption and is therefore + recommended for low memory systems. config PAGE_SIZE_8KB bool "8kB" @@ -2480,7 +2480,6 @@ config SB1_PASS_2_1_WORKAROUNDS depends on CPU_SB1 && CPU_SB1_PASS_2 default y - choice prompt "SmartMIPS or microMIPS ASE support" @@ -2688,16 +2687,16 @@ config RANDOMIZE_BASE bool "Randomize the address of the kernel image" depends on RELOCATABLE ---help--- - Randomizes the physical and virtual address at which the - kernel image is loaded, as a security feature that - deters exploit attempts relying on knowledge of the location - of kernel internals. + Randomizes the physical and virtual address at which the + kernel image is loaded, as a security feature that + deters exploit attempts relying on knowledge of the location + of kernel internals. - Entropy is generated using any coprocessor 0 registers available. + Entropy is generated using any coprocessor 0 registers available. - The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET. + The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET. - If unsure, say N. + If unsure, say N. config RANDOMIZE_BASE_MAX_OFFSET hex "Maximum kASLR offset" if EXPERT @@ -2827,7 +2826,7 @@ choice prompt "Timer frequency" default HZ_250 help - Allows the configuration of the timer frequency. + Allows the configuration of the timer frequency. config HZ_24 bool "24 HZ" if SYS_SUPPORTS_24HZ || SYS_SUPPORTS_ARBIT_HZ @@ -3127,10 +3126,10 @@ config ARCH_MMAP_RND_BITS_MAX default 15 config ARCH_MMAP_RND_COMPAT_BITS_MIN - default 8 + default 8 config ARCH_MMAP_RND_COMPAT_BITS_MAX - default 15 + default 15 config I8253 bool diff --git a/arch/mips/bcm47xx/Kconfig b/arch/mips/bcm47xx/Kconfig index 29471038d817..6889f74e06f5 100644 --- a/arch/mips/bcm47xx/Kconfig +++ b/arch/mips/bcm47xx/Kconfig @@ -15,9 +15,9 @@ config BCM47XX_SSB select SSB_DRIVER_GPIO default y help - Add support for old Broadcom BCM47xx boards with Sonics Silicon Backplane support. + Add support for old Broadcom BCM47xx boards with Sonics Silicon Backplane support. - This will generate an image with support for SSB and MIPS32 R1 instruction set. + This will generate an image with support for SSB and MIPS32 R1 instruction set. config BCM47XX_BCMA bool "BCMA Support for Broadcom BCM47XX" @@ -31,8 +31,8 @@ config BCM47XX_BCMA select BCMA_DRIVER_GPIO default y help - Add support for new Broadcom BCM47xx boards with Broadcom specific Advanced Microcontroller Bus. + Add support for new Broadcom BCM47xx boards with Broadcom specific Advanced Microcontroller Bus. - This will generate an image with support for BCMA and MIPS32 R2 instruction set. + This will generate an image with support for BCMA and MIPS32 R2 instruction set. endif diff --git a/arch/mips/bcm63xx/boards/Kconfig b/arch/mips/bcm63xx/boards/Kconfig index f60d96610ace..492c3bd005d5 100644 --- a/arch/mips/bcm63xx/boards/Kconfig +++ b/arch/mips/bcm63xx/boards/Kconfig @@ -5,7 +5,7 @@ choice default BOARD_BCM963XX config BOARD_BCM963XX - bool "Generic Broadcom 963xx boards" + bool "Generic Broadcom 963xx boards" select SSB endchoice diff --git a/arch/mips/pic32/Kconfig b/arch/mips/pic32/Kconfig index e284e89183cc..7acbb50c1dcd 100644 --- a/arch/mips/pic32/Kconfig +++ b/arch/mips/pic32/Kconfig @@ -39,12 +39,12 @@ choice Select the devicetree. 
config DTB_PIC32_NONE
-	bool "None"
+	bool "None"

config DTB_PIC32_MZDA_SK
-	bool "PIC32MZDA Starter Kit"
-	depends on PIC32MZDA
-	select BUILTIN_DTB
+	bool "PIC32MZDA Starter Kit"
+	depends on PIC32MZDA
+	select BUILTIN_DTB

endchoice
-- cgit v1.2.3-59-g8ed1b

From a07b20004793d8926f78d63eb5980559f7813404 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Mon, 5 Nov 2018 17:40:30 +0000
Subject: vfs: syscall: Add open_tree(2) to reference or clone a mount

open_tree(dfd, pathname, flags)

Returns an O_PATH-opened file descriptor or an error. dfd and pathname specify the location to open, in usual fashion (see e.g. fstatat(2)).

flags should be an OR of some of the following:

 * AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW - same meanings as usual

 * OPEN_TREE_CLOEXEC - make the resulting descriptor close-on-exec

 * OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE - instead of opening the location in question, create a detached mount tree matching the subtree rooted at location specified by dfd/pathname. With AT_RECURSIVE the entire subtree is cloned, without it - only the part within the mount containing the location in question. In other words, the same as mount --rbind or mount --bind would've taken.

The detached tree will be dissolved on the final close of the obtained file. Creation of such detached trees requires the same capabilities as doing mount --bind.

Signed-off-by: Al Viro
Signed-off-by: David Howells
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro
---
 arch/x86/entry/syscalls/syscall_32.tbl | 3 +-
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 fs/file_table.c | 9 +-
 fs/internal.h | 1 +
 fs/namespace.c | 157 ++++++++++++++++++++++++++++-----
 include/linux/fs.h | 7 +-
 include/linux/syscalls.h | 1 +
 include/uapi/linux/fcntl.h | 2 +
 include/uapi/linux/mount.h | 6 ++
 9 files changed, 159 insertions(+), 28 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 1f9607ed087c..ae2294d07ecb 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,7 +398,8 @@
 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
 385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents
 386 i386 rseq sys_rseq __ia32_sys_rseq
-# don't use numbers 387 through 392, add new calls at the end
+387 i386 open_tree sys_open_tree __ia32_sys_open_tree
+# don't use numbers 388 through 392, add new calls at the end
 393 i386 semget sys_semget __ia32_sys_semget
 394 i386 semctl sys_semctl __ia32_compat_sys_semctl
 395 i386 shmget sys_shmget __ia32_sys_shmget
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92ee0b4378d4..a6e06c35b5b1 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
 332 common statx __x64_sys_statx
 333 common io_pgetevents __x64_sys_io_pgetevents
 334 common rseq __x64_sys_rseq
+335 common open_tree __x64_sys_open_tree
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424 common pidfd_send_signal __x64_sys_pidfd_send_signal
diff --git a/fs/file_table.c b/fs/file_table.c
index 155d7514a094..3f9c1b452c1d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -255,6 +255,7 @@ static void __fput(struct file *file)
 struct dentry *dentry = file->f_path.dentry;
 struct vfsmount *mnt = file->f_path.mnt;
 struct inode *inode = file->f_inode;
+ fmode_t mode = file->f_mode;
 if (unlikely(!(file->f_mode & FMODE_OPENED)))
 goto
out; @@ -277,18 +278,20 @@ static void __fput(struct file *file) if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && - !(file->f_mode & FMODE_PATH))) { + !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); put_pid(file->f_owner.pid); - if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITER) { + if (mode & FMODE_WRITER) { put_write_access(inode); __mnt_drop_write(mnt); } dput(dentry); + if (unlikely(mode & FMODE_NEED_UNMOUNT)) + dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); diff --git a/fs/internal.h b/fs/internal.h index 6a8b71643af4..f3a027c44758 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -94,6 +94,7 @@ extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); +extern void dissolve_on_fput(struct vfsmount *); /* * fs_struct.c */ diff --git a/fs/namespace.c b/fs/namespace.c index c9cab307fa77..b804a1a497ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -20,6 +20,7 @@ #include /* init_rootfs */ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ +#include #include #include #include @@ -1832,6 +1833,21 @@ struct vfsmount *collect_mounts(const struct path *path) return &tree->mnt; } +static void free_mnt_ns(struct mnt_namespace *); +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + +void dissolve_on_fput(struct vfsmount *mnt) +{ + struct mnt_namespace *ns; + namespace_lock(); + lock_mount_hash(); + ns = real_mount(mnt)->mnt_ns; + umount_tree(real_mount(mnt), UMOUNT_CONNECTED); + unlock_mount_hash(); + namespace_unlock(); + free_mnt_ns(ns); +} + void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); @@ -2222,6 +2238,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +static struct mount *__do_loopback(struct path *old_path, int recurse) +{ + struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + + if (IS_MNT_UNBINDABLE(old)) + return mnt; + + if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) + return mnt; + + if (!recurse && has_locked_children(old, old_path->dentry)) + return mnt; + + if (recurse) + mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + else + mnt = clone_mnt(old, old_path->dentry, 0); + + if (!IS_ERR(mnt)) + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + + return mnt; +} + /* * do loopback mount. 
*/ @@ -2229,7 +2269,7 @@ static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path; - struct mount *mnt = NULL, *old, *parent; + struct mount *mnt = NULL, *parent; struct mountpoint *mp; int err; if (!old_name || !*old_name) @@ -2243,38 +2283,21 @@ static int do_loopback(struct path *path, const char *old_name, goto out; mp = lock_mount(path); - err = PTR_ERR(mp); - if (IS_ERR(mp)) + if (IS_ERR(mp)) { + err = PTR_ERR(mp); goto out; + } - old = real_mount(old_path.mnt); parent = real_mount(path->mnt); - - err = -EINVAL; - if (IS_MNT_UNBINDABLE(old)) - goto out2; - if (!check_mnt(parent)) goto out2; - if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) - goto out2; - - if (!recurse && has_locked_children(old, old_path.dentry)) - goto out2; - - if (recurse) - mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); - else - mnt = clone_mnt(old, old_path.dentry, 0); - + mnt = __do_loopback(&old_path, recurse); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); goto out2; } - mnt->mnt.mnt_flags &= ~MNT_LOCKED; - err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); @@ -2288,6 +2311,96 @@ out: return err; } +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mount *mnt, *p; + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); + + namespace_lock(); + mnt = __do_loopback(path, recursive); + if (IS_ERR(mnt)) { + namespace_unlock(); + free_mnt_ns(ns); + return ERR_CAST(mnt); + } + + lock_mount_hash(); + for (p = mnt; p; p = next_mnt(p, mnt)) { + p->mnt_ns = ns; + ns->mounts++; + } + ns->root = mnt; + list_add_tail(&ns->list, &mnt->mnt_list); + mntget(&mnt->mnt); + unlock_mount_hash(); + namespace_unlock(); + + mntput(path->mnt); + path->mnt = &mnt->mnt; + file = dentry_open(path, O_PATH, current_cred()); + if (IS_ERR(file)) + dissolve_on_fput(path->mnt); + else + file->f_mode |= FMODE_NEED_UNMOUNT; + return file; +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags) +{ + struct file *file; + struct path path; + int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + bool detached = flags & OPEN_TREE_CLONE; + int error; + int fd; + + BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); + + if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | + OPEN_TREE_CLOEXEC)) + return -EINVAL; + + if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (detached && !may_mount()) + return -EPERM; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (unlikely(error)) { + file = ERR_PTR(error); + } else { + if (detached) + file = open_detached_copy(&path, flags & AT_RECURSIVE); + else + file = dentry_open(&path, O_PATH, current_cred()); + path_put(&path); + } + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + fd_install(fd, file); + return fd; +} + /* * Don't allow locked mount flags to be cleared. 
* diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b42df09b04c..09b05ec5d059 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -162,10 +162,13 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) /* File is capable of returning -EAGAIN if I/O will block */ -#define FMODE_NOWAIT ((__force fmode_t)0x8000000) +#define FMODE_NOWAIT ((__force fmode_t)0x8000000) + +/* File represents mount that needs unmounting */ +#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000) /* File does not contribute to nr_files count */ -#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) +#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e446806a561f..6c29d586e66b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -985,6 +985,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); +asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a2f8658f1c55..1d338357df8a 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -91,5 +91,7 @@ #define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ #define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ + #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 3f9ec42510b0..fd7ae2e7eccf 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -55,4 +55,10 @@ #define MS_MGC_VAL 0xC0ED0000 #define MS_MGC_MSK 0xffff0000 +/* + * open_tree() flags. + */ +#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ + #endif /* _UAPI_LINUX_MOUNT_H */ -- cgit v1.2.3-59-g8ed1b From 2db154b3ea8e14b04fee23e3fdfd5e9d17fbc6ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Nov 2018 17:40:30 +0000 Subject: vfs: syscall: Add move_mount(2) to move mounts around Add a move_mount() system call that will move a mount from one place to another and, in the next commit, allow to attach an unattached mount tree. 
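The prototype is spelled out just below; as a purely illustrative sketch (not part of this patch), user space could invoke the new call directly via syscall(2) until a libc wrapper exists, assuming the x86-64 syscall number 336 wired up in this patch and made-up example paths:

	/* Hypothetical example: move the mount at /mnt/src onto /mnt/dst. */
	#include <fcntl.h>		/* AT_FDCWD */
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define __NR_move_mount 336	/* x86-64 table entry added below */

	int main(void)
	{
		/*
		 * flags == 0: neither symlinks nor automounts are followed
		 * on either path, per the MOVE_MOUNT_* flags added here.
		 */
		if (syscall(__NR_move_mount, AT_FDCWD, "/mnt/src",
			    AT_FDCWD, "/mnt/dst", 0) == -1) {
			perror("move_mount");
			return 1;
		}
		return 0;
	}

Moving a detached tree created with open_tree(OPEN_TREE_CLONE) would instead pass the O_PATH descriptor with an empty from_path and MOVE_MOUNT_F_EMPTY_PATH, which is what the attach support added in the next commit is for.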
The new system call looks like the following: int move_mount(int from_dfd, const char *from_path, int to_dfd, const char *to_path, unsigned int flags); Signed-off-by: David Howells cc: linux-api@vger.kernel.org Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 3 +- arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/namespace.c | 126 +++++++++++++++++++++++++-------- include/linux/lsm_hooks.h | 6 ++ include/linux/security.h | 7 ++ include/linux/syscalls.h | 3 + include/uapi/linux/mount.h | 11 +++ security/security.c | 5 ++ 8 files changed, 130 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ae2294d07ecb..0db9effb18d9 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -399,7 +399,8 @@ 385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq 387 i386 open_tree sys_open_tree __ia32_sys_open_tree -# don't use numbers 388 through 392, add new calls at the end +388 i386 move_mount sys_move_mount __ia32_sys_move_mount +# don't use numbers 389 through 392, add new calls at the end 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index a6e06c35b5b1..0440f0eefa02 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -344,6 +344,7 @@ 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq 335 common open_tree __x64_sys_open_tree +336 common move_mount __x64_sys_move_mount # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/namespace.c b/fs/namespace.c index b804a1a497ee..dc600f53de9d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2539,72 +2539,81 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -static int do_move_mount(struct path *path, const char *old_name) +static int do_move_mount(struct path *old_path, struct path *new_path) { - struct path old_path, parent_path; + struct path parent_path = {.mnt = NULL, .dentry = NULL}; struct mount *p; struct mount *old; struct mountpoint *mp; int err; - if (!old_name || !*old_name) - return -EINVAL; - err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); - if (err) - return err; - mp = lock_mount(path); - err = PTR_ERR(mp); + mp = lock_mount(new_path); if (IS_ERR(mp)) - goto out; + return PTR_ERR(mp); - old = real_mount(old_path.mnt); - p = real_mount(path->mnt); + old = real_mount(old_path->mnt); + p = real_mount(new_path->mnt); err = -EINVAL; if (!check_mnt(p) || !check_mnt(old)) - goto out1; + goto out; - if (old->mnt.mnt_flags & MNT_LOCKED) - goto out1; + if (!mnt_has_parent(old)) + goto out; - err = -EINVAL; - if (old_path.dentry != old_path.mnt->mnt_root) - goto out1; + if (old->mnt.mnt_flags & MNT_LOCKED) + goto out; - if (!mnt_has_parent(old)) - goto out1; + if (old_path->dentry != old_path->mnt->mnt_root) + goto out; - if (d_is_dir(path->dentry) != - d_is_dir(old_path.dentry)) - goto out1; + if (d_is_dir(new_path->dentry) != + d_is_dir(old_path->dentry)) + goto out; /* * Don't move a mount residing in a shared parent. 
*/ if (IS_MNT_SHARED(old->mnt_parent)) - goto out1; + goto out; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) - goto out1; + goto out; err = -ELOOP; for (; mnt_has_parent(p); p = p->mnt_parent) if (p == old) - goto out1; + goto out; - err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); + err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, + &parent_path); if (err) - goto out1; + goto out; /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); -out1: - unlock_mount(mp); out: + unlock_mount(mp); if (!err) path_put(&parent_path); + return err; +} + +static int do_move_mount_old(struct path *path, const char *old_name) +{ + struct path old_path; + int err; + + if (!old_name || !*old_name) + return -EINVAL; + + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + if (err) + return err; + + err = do_move_mount(&old_path, path); path_put(&old_path); return err; } @@ -3050,7 +3059,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&path, flags); else if (flags & MS_MOVE) - retval = do_move_mount(&path, dev_name); + retval = do_move_mount_old(&path, dev_name); else retval = do_new_mount(&path, type_page, sb_flags, mnt_flags, dev_name, data_page); @@ -3278,6 +3287,61 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, return ksys_mount(dev_name, dir_name, type, flags, data); } +/* + * Move a mount from one place to another. + * + * Note the flags value is a combination of MOVE_MOUNT_* flags. + */ +SYSCALL_DEFINE5(move_mount, + int, from_dfd, const char *, from_pathname, + int, to_dfd, const char *, to_pathname, + unsigned int, flags) +{ + struct path from_path, to_path; + unsigned int lflags; + int ret = 0; + + if (!may_mount()) + return -EPERM; + + if (flags & ~MOVE_MOUNT__MASK) + return -EINVAL; + + /* If someone gives a pathname, they aren't permitted to move + * from an fd that requires unmount as we can't get at the flag + * to clear it afterwards. + */ + lflags = 0; + if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + + ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); + if (ret < 0) + return ret; + + lflags = 0; + if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + + ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); + if (ret < 0) + goto out_from; + + ret = security_move_mount(&from_path, &to_path); + if (ret < 0) + goto out_to; + + ret = do_move_mount(&from_path, &to_path); + +out_to: + path_put(&to_path); +out_from: + path_put(&from_path); + return ret; +} + /* * Return true if path is reachable from root * diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index a9b8ff578b6b..cb33f81cf5a1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -160,6 +160,10 @@ * Parse a string of security data filling in the opts structure * @options string containing all mount options known by the LSM * @opts binary data structure usable by the LSM + * @move_mount: + * Check permission before a mount is moved. 
+ * @from_path indicates the mount that is going to be moved. + * @to_path indicates the mountpoint that will be mounted upon. * @dentry_init_security: * Compute a context for a dentry as the inode is not yet available * since NFSv4 has no label backed by an EA anyway. @@ -1501,6 +1505,7 @@ union security_list_options { unsigned long *set_kern_flags); int (*sb_add_mnt_opt)(const char *option, const char *val, int len, void **mnt_opts); + int (*move_mount)(const struct path *from_path, const struct path *to_path); int (*dentry_init_security)(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen); @@ -1835,6 +1840,7 @@ struct security_hook_heads { struct hlist_head sb_set_mnt_opts; struct hlist_head sb_clone_mnt_opts; struct hlist_head sb_add_mnt_opt; + struct hlist_head move_mount; struct hlist_head dentry_init_security; struct hlist_head dentry_create_files_as; #ifdef CONFIG_SECURITY_PATH diff --git a/include/linux/security.h b/include/linux/security.h index 49f2685324b0..1f2e06afc28f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -250,6 +250,7 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb, unsigned long *set_kern_flags); int security_add_mnt_opt(const char *option, const char *val, int len, void **mnt_opts); +int security_move_mount(const struct path *from_path, const struct path *to_path); int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, void **ctx, u32 *ctxlen); @@ -611,6 +612,12 @@ static inline int security_add_mnt_opt(const char *option, const char *val, return 0; } +static inline int security_move_mount(const struct path *from_path, + const struct path *to_path) +{ + return 0; +} + static inline int security_inode_alloc(struct inode *inode) { return 0; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6c29d586e66b..84347fc0a1a7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -986,6 +986,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); +asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, + int to_dfd, const char __user *to_path, + unsigned int ms_flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index fd7ae2e7eccf..3634e065836c 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -61,4 +61,15 @@ #define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ +/* + * move_mount() flags. 
+ */ +#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ +#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ +#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#define MOVE_MOUNT__MASK 0x00000077 + #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/security/security.c b/security/security.c index 23cbb1a295a3..5b3d23e427b3 100644 --- a/security/security.c +++ b/security/security.c @@ -866,6 +866,11 @@ int security_add_mnt_opt(const char *option, const char *val, int len, } EXPORT_SYMBOL(security_add_mnt_opt); +int security_move_mount(const struct path *from_path, const struct path *to_path) +{ + return call_int_hook(move_mount, 0, from_path, to_path); +} + int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode); -- cgit v1.2.3-59-g8ed1b From dadd2299ab61fc2b55b95b7b3a8f674cdd3b69c9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Nov 2018 17:40:31 +0000 Subject: Make anon_inodes unconditional Make the anon_inodes facility unconditional so that it can be used by core VFS code. Signed-off-by: David Howells Signed-off-by: Al Viro --- arch/arm/kvm/Kconfig | 1 - arch/arm64/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/kvm/Kconfig | 1 - drivers/base/Kconfig | 1 - drivers/char/tpm/Kconfig | 1 - drivers/dma-buf/Kconfig | 1 - drivers/gpio/Kconfig | 1 - drivers/iio/Kconfig | 1 - drivers/infiniband/Kconfig | 1 - drivers/vfio/Kconfig | 1 - fs/Makefile | 2 +- fs/notify/fanotify/Kconfig | 1 - fs/notify/inotify/Kconfig | 1 - init/Kconfig | 10 ---------- 18 files changed, 1 insertion(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3f5320f46de2..f591026347a5 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on MMU && OF select PREEMPT_NOTIFIERS - select ANON_INODES select ARM_GIC select ARM_GIC_V3 select ARM_GIC_V3_ITS diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a3f85624313e..a67121d419a2 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM depends on OF select MMU_NOTIFIER select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 4528bc9c3cb1..eac25aef21e0 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM depends on MIPS_FP_SUPPORT select EXPORT_UASM select PREEMPT_NOTIFIERS - select ANON_INODES select KVM_GENERIC_DIRTYLOG_READ_PROTECT select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index bfdde04e4905..f53997a8ca62 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select SRCU diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 767453faacfc..1816ee48eadd 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM 
prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_EVENTFD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1f9b3cf437c..18f2c954464e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -44,7 +44,6 @@ config X86 # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ANON_INODES select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 72fa955f4a15..fc042419e670 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,7 +27,6 @@ config KVM depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER - select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select IRQ_BYPASS_MANAGER diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 059700ea3521..03f067da12ee 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -174,7 +174,6 @@ source "drivers/base/regmap/Kconfig" config DMA_SHARED_BUFFER bool default n - select ANON_INODES select IRQ_WORK help This option enables the framework for buffer-sharing between diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 536e55d3919f..f3e4bc490cf0 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -157,7 +157,6 @@ config TCG_CRB config TCG_VTPM_PROXY tristate "VTPM Proxy Interface" depends on TCG_TPM - select ANON_INODES ---help--- This driver proxies for an emulated TPM (vTPM) running in userspace. A device /dev/vtpmx is provided that creates a device pair diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig index 2e5a0faa2cb1..3fc9c2efc583 100644 --- a/drivers/dma-buf/Kconfig +++ b/drivers/dma-buf/Kconfig @@ -3,7 +3,6 @@ menu "DMABUF options" config SYNC_FILE bool "Explicit Synchronization Framework" default n - select ANON_INODES select DMA_SHARED_BUFFER ---help--- The Sync File Framework adds explicit syncronization via diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3f50526a771f..0f91600c27ae 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -12,7 +12,6 @@ config ARCH_HAVE_CUSTOM_GPIO_H menuconfig GPIOLIB bool "GPIO Support" - select ANON_INODES help This enables GPIO support through the generic GPIO library. You only need to enable this, if you also want to enable diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig index d08aeb41cd07..1dec0fecb6ef 100644 --- a/drivers/iio/Kconfig +++ b/drivers/iio/Kconfig @@ -4,7 +4,6 @@ menuconfig IIO tristate "Industrial I/O support" - select ANON_INODES help The industrial I/O subsystem provides a unified framework for drivers for many different types of embedded sensors using a diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a1fb840de45d..d318bab25860 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -25,7 +25,6 @@ config INFINIBAND_USER_MAD config INFINIBAND_USER_ACCESS tristate "InfiniBand userspace access (verbs and CM)" - select ANON_INODES depends on MMU ---help--- Userspace InfiniBand access support. 
This enables the diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 9de5ed38da83..3798d77d131c 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -22,7 +22,6 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64) - select ANON_INODES help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/fs/Makefile b/fs/Makefile index 427fec226fae..35945f8139e6 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 735bfb2e9190..521dc91d2cb5 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -1,7 +1,6 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY - select ANON_INODES select EXPORTFS default n ---help--- diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index b981fc0c8379..0161c74e76e2 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,6 +1,5 @@ config INOTIFY_USER bool "Inotify support for userspace" - select ANON_INODES select FSNOTIFY default y ---help--- diff --git a/init/Kconfig b/init/Kconfig index 4592bf7997c0..be8f97e37a76 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1171,9 +1171,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION config SYSCTL bool -config ANON_INODES - bool - config HAVE_UID16 bool @@ -1378,14 +1375,12 @@ config HAVE_FUTEX_CMPXCHG config EPOLL bool "Enable eventpoll support" if EXPERT default y - select ANON_INODES help Disabling this option will cause the kernel to be built without support for epoll family of system calls. config SIGNALFD bool "Enable signalfd() system call" if EXPERT - select ANON_INODES default y help Enable the signalfd() system call that allows to receive signals @@ -1395,7 +1390,6 @@ config SIGNALFD config TIMERFD bool "Enable timerfd() system call" if EXPERT - select ANON_INODES default y help Enable the timerfd() system call that allows to receive timer @@ -1405,7 +1399,6 @@ config TIMERFD config EVENTFD bool "Enable eventfd() system call" if EXPERT - select ANON_INODES default y help Enable the eventfd() system call that allows to receive both @@ -1516,7 +1509,6 @@ config KALLSYMS_BASE_RELATIVE # syscall, maps, verifier config BPF_SYSCALL bool "Enable bpf() system call" - select ANON_INODES select BPF select IRQ_WORK default n @@ -1533,7 +1525,6 @@ config BPF_JIT_ALWAYS_ON config USERFAULTFD bool "Enable userfaultfd() system call" - select ANON_INODES depends on MMU help Enable the userfaultfd() system call that allows to intercept and @@ -1600,7 +1591,6 @@ config PERF_EVENTS bool "Kernel performance events and counters" default y if PROFILING depends on HAVE_PERF_EVENTS - select ANON_INODES select IRQ_WORK select SRCU help -- cgit v1.2.3-59-g8ed1b From 24dcb3d90a1f67fe08c68a004af37df059d74005 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 23:33:31 +0000 Subject: vfs: syscall: Add fsopen() to prepare for superblock creation Provide an fsopen() system call that starts the process of preparing to create a superblock that will then be mountable, using an fd as a context handle. 
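As a purely illustrative sketch (not part of this patch), a raw user-space invocation could look like the following, assuming the x86-64 syscall number 337 wired up in this patch, the FSOPEN_CLOEXEC flag from the uapi header, and the error read-back convention described further down:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define __NR_fsopen 337			/* x86-64 table entry added below */
	#define FSOPEN_CLOEXEC 0x00000001	/* from the uapi header below */

	int main(void)
	{
		char err[256];
		ssize_t n;
		int sfd = syscall(__NR_fsopen, "ext4", FSOPEN_CLOEXEC);

		if (sfd == -1) {
			perror("fsopen");
			return 1;
		}
		/*
		 * After a failed step, an "e <subsys>:<problem>" message
		 * may be available to read() back from the context fd.
		 */
		n = read(sfd, err, sizeof(err) - 1);
		if (n > 0) {
			err[n] = '\0';
			fprintf(stderr, "%s\n", err);
		}
		close(sfd);
		return 0;
	}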
fsopen() is given the name of the filesystem that will be used:

	int mfd = fsopen(const char *fsname, unsigned int flags);

where flags can be 0 or FSOPEN_CLOEXEC.

For example:

	sfd = fsopen("ext4", FSOPEN_CLOEXEC);
	fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD);
	fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0);
	fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0);
	fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
	fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0);
	fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	fsinfo(sfd, NULL, ...); // query new superblock attributes
	mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
	move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);

	sfd = fsopen("afs", -1);
	fsconfig(fd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell", 0);
	fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mfd = fsmount(sfd, 0, MS_NODEV);
	move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);

If an error is reported at any step, an error message may be available to be read() back (ENODATA will be reported if there isn't an error available) in the form:

	"e <subsys>:<problem>"
	"e SELinux:Mount on mountpoint not permitted"

Once fsmount() has been called, further fsconfig() calls will incur EBUSY, even if the fsmount() fails. read() is still possible to retrieve error information.

The fsopen() syscall creates a mount context and hangs it off the fd that it returns.

Netlink is not used because it is optional and would make the core VFS dependent on the networking layer and also potentially add network namespace issues.

Note that, for the moment, the caller must have CAP_SYS_ADMIN to use fsopen().

Signed-off-by: David Howells
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro
---
 arch/x86/entry/syscalls/syscall_32.tbl | 3 +-
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 fs/Makefile | 2 +-
 fs/fs_context.c | 4 ++
 fs/fsopen.c | 88 ++++++++++++++++++++++++++++++++++
 include/linux/fs_context.h | 16 +++++++
 include/linux/syscalls.h | 1 +
 include/uapi/linux/mount.h | 5 ++
 8 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 fs/fsopen.c

(limited to 'arch')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 0db9effb18d9..37fd1fc5396e 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -400,7 +400,8 @@
 386 i386 rseq sys_rseq __ia32_sys_rseq
 387 i386 open_tree sys_open_tree __ia32_sys_open_tree
 388 i386 move_mount sys_move_mount __ia32_sys_move_mount
-# don't use numbers 389 through 392, add new calls at the end
+389 i386 fsopen sys_fsopen __ia32_sys_fsopen
+# don't use numbers 390 through 392, add new calls at the end
 393 i386 semget sys_semget __ia32_sys_semget
 394 i386 semctl sys_semctl __ia32_compat_sys_semctl
 395 i386 shmget sys_shmget __ia32_sys_shmget
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 0440f0eefa02..511608a21611 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
 334 common rseq __x64_sys_rseq
 335 common open_tree __x64_sys_open_tree
 336 common move_mount __x64_sys_move_mount
+337 common fsopen __x64_sys_fsopen
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424 common pidfd_send_signal __x64_sys_pidfd_send_signal
diff --git a/fs/Makefile b/fs/Makefile
index 35945f8139e6..5a51bc2489ba 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,7 @@ obj-y := open.o read_write.o
file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o + fs_types.o fs_context.o fs_parser.o fsopen.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/fs_context.c b/fs/fs_context.c index 87e3546b9a52..eb806fae3117 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -271,6 +271,8 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, fc->cred = get_current_cred(); fc->net_ns = get_net(current->nsproxy->net_ns); + mutex_init(&fc->uapi_mutex); + switch (purpose) { case FS_CONTEXT_FOR_MOUNT: fc->user_ns = get_user_ns(fc->cred->user_ns); @@ -353,6 +355,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) if (!fc) return ERR_PTR(-ENOMEM); + mutex_init(&fc->uapi_mutex); + fc->fs_private = NULL; fc->s_fs_info = NULL; fc->source = NULL; diff --git a/fs/fsopen.c b/fs/fsopen.c new file mode 100644 index 000000000000..d256f1ac9ff1 --- /dev/null +++ b/fs/fsopen.c @@ -0,0 +1,88 @@ +/* Filesystem access-by-fd. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mount.h" + +static int fscontext_release(struct inode *inode, struct file *file) +{ + struct fs_context *fc = file->private_data; + + if (fc) { + file->private_data = NULL; + put_fs_context(fc); + } + return 0; +} + +const struct file_operations fscontext_fops = { + .release = fscontext_release, + .llseek = no_llseek, +}; + +/* + * Attach a filesystem context to a file and an fd. + */ +static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags) +{ + int fd; + + fd = anon_inode_getfd("fscontext", &fscontext_fops, fc, + O_RDWR | o_flags); + if (fd < 0) + put_fs_context(fc); + return fd; +} + +/* + * Open a filesystem by name so that it can be configured for mounting. + * + * We are allowed to specify a container in which the filesystem will be + * opened, thereby indicating which namespaces will be used (notably, which + * network namespace will be used for network filesystems). + */ +SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) +{ + struct file_system_type *fs_type; + struct fs_context *fc; + const char *fs_name; + + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if (flags & ~FSOPEN_CLOEXEC) + return -EINVAL; + + fs_name = strndup_user(_fs_name, PAGE_SIZE); + if (IS_ERR(fs_name)) + return PTR_ERR(fs_name); + + fs_type = get_fs_type(fs_name); + kfree(fs_name); + if (!fs_type) + return -ENODEV; + + fc = fs_context_for_mount(fs_type, 0); + put_filesystem(fs_type); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + fc->phase = FS_CONTEXT_CREATE_PARAMS; + return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? 
O_CLOEXEC : 0); +} diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index eaca452088fa..7ab8b44fab3e 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -15,6 +15,7 @@ #include #include #include +#include struct cred; struct dentry; @@ -34,6 +35,19 @@ enum fs_context_purpose { FS_CONTEXT_FOR_RECONFIGURE, /* Superblock reconfiguration (remount) */ }; +/* + * Userspace usage phase for fsopen/fspick. + */ +enum fs_context_phase { + FS_CONTEXT_CREATE_PARAMS, /* Loading params for sb creation */ + FS_CONTEXT_CREATING, /* A superblock is being created */ + FS_CONTEXT_AWAITING_MOUNT, /* Superblock created, awaiting fsmount() */ + FS_CONTEXT_AWAITING_RECONF, /* Awaiting initialisation for reconfiguration */ + FS_CONTEXT_RECONF_PARAMS, /* Loading params for reconfiguration */ + FS_CONTEXT_RECONFIGURING, /* Reconfiguring the superblock */ + FS_CONTEXT_FAILED, /* Failed to correctly transition a context */ +}; + /* * Type of parameter value. */ @@ -74,6 +88,7 @@ struct fs_parameter { */ struct fs_context { const struct fs_context_operations *ops; + struct mutex uapi_mutex; /* Userspace access mutex */ struct file_system_type *fs_type; void *fs_private; /* The filesystem's context */ struct dentry *root; /* The root and superblock */ @@ -88,6 +103,7 @@ struct fs_context { unsigned int sb_flags_mask; /* Superblock flags that were changed */ unsigned int lsm_flags; /* Information flags from the fs to the LSM */ enum fs_context_purpose purpose:8; + enum fs_context_phase phase:8; /* The phase the context is in */ bool need_free:1; /* Need to call ops->free() */ bool global:1; /* Goes into &init_user_ns */ }; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 84347fc0a1a7..0c9bd5427e8f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -989,6 +989,7 @@ asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); +asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 3634e065836c..7570df43d08f 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -72,4 +72,9 @@ #define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ #define MOVE_MOUNT__MASK 0x00000077 +/* + * fsopen() flags. + */ +#define FSOPEN_CLOEXEC 0x00000001 + #endif /* _UAPI_LINUX_MOUNT_H */ -- cgit v1.2.3-59-g8ed1b From ecdab150fddb42fe6a739335257949220033b782 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 23:36:09 +0000 Subject: vfs: syscall: Add fsconfig() for configuring and managing a context Add a syscall for configuring a filesystem creation context and triggering actions upon it, to be used in conjunction with fsopen, fspick and fsmount. long fsconfig(int fs_fd, unsigned int cmd, const char *key, const void *value, int aux); Where fs_fd indicates the context, cmd indicates the action to take, key indicates the parameter name for parameter-setting actions and, if needed, value points to a buffer containing the value and aux can give more information for the value. The following command IDs are proposed: (*) FSCONFIG_SET_FLAG: No value is specified. The parameter must be boolean in nature. The key may be prefixed with "no" to invert the setting. 
value must be NULL and aux must be 0.

(*) FSCONFIG_SET_STRING: A string value is specified. The parameter can be expecting boolean, integer, string or take a path. A conversion to an appropriate type will be attempted (which may include looking up as a path). value points to a NUL-terminated string and aux must be 0.

(*) FSCONFIG_SET_BINARY: A binary blob is specified. value points to the blob and aux indicates its size. The parameter must be expecting a blob.

(*) FSCONFIG_SET_PATH: A non-empty path is specified. The parameter must be expecting a path object. value points to a NUL-terminated string that is the path and aux is a file descriptor at which to start a relative lookup or AT_FDCWD.

(*) FSCONFIG_SET_PATH_EMPTY: As fsconfig_set_path, but with AT_EMPTY_PATH implied.

(*) FSCONFIG_SET_FD: An open file descriptor is specified. value must be NULL and aux indicates the file descriptor.

(*) FSCONFIG_CMD_CREATE: Trigger superblock creation.

(*) FSCONFIG_CMD_RECONFIGURE: Trigger superblock reconfiguration.

For the "set" command IDs, the idea is that the file_system_type will point to a list of parameters and the types of value that those parameters expect to take. The core code can then do the parse and argument conversion and then give the LSM and FS a cooked option or array of options to use.

Source specification is also done the same way, using special keys "source", "source1", "source2", etc..

[!] Note that, for the moment, the key and value are just glued back together and handed to the filesystem. Every filesystem that uses options uses match_token() and co. to do this, and this will need to be changed - but not all at once.

Example usage:

	fd = fsopen("ext4", FSOPEN_CLOEXEC);
	fsconfig(fd, fsconfig_set_path, "source", "/dev/sda1", AT_FDCWD);
	fsconfig(fd, fsconfig_set_path_empty, "journal_path", "", journal_fd);
	fsconfig(fd, fsconfig_set_fd, "journal_fd", "", journal_fd);
	fsconfig(fd, fsconfig_set_flag, "user_xattr", NULL, 0);
	fsconfig(fd, fsconfig_set_flag, "noacl", NULL, 0);
	fsconfig(fd, fsconfig_set_string, "sb", "1", 0);
	fsconfig(fd, fsconfig_set_string, "errors", "continue", 0);
	fsconfig(fd, fsconfig_set_string, "data", "journal", 0);
	fsconfig(fd, fsconfig_set_string, "context", "unconfined_u:...", 0);
	fsconfig(fd, fsconfig_cmd_create, NULL, NULL, 0);
	mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC);

or:

	fd = fsopen("ext4", FSOPEN_CLOEXEC);
	fsconfig(fd, fsconfig_set_string, "source", "/dev/sda1", 0);
	fsconfig(fd, fsconfig_cmd_create, NULL, NULL, 0);
	mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC);

or:

	fd = fsopen("afs", FSOPEN_CLOEXEC);
	fsconfig(fd, fsconfig_set_string, "source", "#grand.central.org:root.cell", 0);
	fsconfig(fd, fsconfig_cmd_create, NULL, NULL, 0);
	mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC);

or:

	fd = fsopen("jffs2", FSOPEN_CLOEXEC);
	fsconfig(fd, fsconfig_set_string, "source", "mtd0", 0);
	fsconfig(fd, fsconfig_cmd_create, NULL, NULL, 0);
	mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC);

Signed-off-by: David Howells
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro
---
 arch/x86/entry/syscalls/syscall_32.tbl | 3 +-
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 fs/fs_context.c | 51 +++++++
 fs/fsopen.c | 265 +++++++++++++++++++++++++++++++++
 fs/internal.h | 3 +
 include/linux/syscalls.h | 2 +
 include/uapi/linux/mount.h | 14 ++
 7 files changed, 338 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 37fd1fc5396e..786728143205 100644
---
a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -401,7 +401,8 @@ 387 i386 open_tree sys_open_tree __ia32_sys_open_tree 388 i386 move_mount sys_move_mount __ia32_sys_move_mount 389 i386 fsopen sys_fsopen __ia32_sys_fsopen -# don't use numbers 390 through 392, add new calls at the end +390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig +# don't use numbers 391 through 392, add new calls at the end 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 511608a21611..7039a809d37d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -346,6 +346,7 @@ 335 common open_tree __x64_sys_open_tree 336 common move_mount __x64_sys_move_mount 337 common fsopen __x64_sys_fsopen +338 common fsconfig __x64_sys_fsconfig # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/fs_context.c b/fs/fs_context.c index dcf3786f90f9..a47ccd5a4a78 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -721,3 +721,54 @@ int parse_monolithic_mount_data(struct fs_context *fc, void *data) return monolithic_mount_data(fc, data); } + +/* + * Clean up a context after performing an action on it and put it into a state + * from where it can be used to reconfigure a superblock. + * + * Note that here we do only the parts that can't fail; the rest is in + * finish_clean_context() below and in between those fs_context is marked + * FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after + * successful mount or remount we need to report success to userland. + * Trying to do full reinit (for the sake of possible subsequent remount) + * and failing to allocate memory would've put us into a nasty situation. + * So here we only discard the old state and reinitialization is left + * until we actually try to reconfigure. + */ +void vfs_clean_context(struct fs_context *fc) +{ + if (fc->need_free && fc->ops && fc->ops->free) + fc->ops->free(fc); + fc->need_free = false; + fc->fs_private = NULL; + fc->s_fs_info = NULL; + fc->sb_flags = 0; + security_free_mnt_opts(&fc->security); + kfree(fc->subtype); + fc->subtype = NULL; + kfree(fc->source); + fc->source = NULL; + + fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; + fc->phase = FS_CONTEXT_AWAITING_RECONF; +} + +int finish_clean_context(struct fs_context *fc) +{ + int error; + + if (fc->phase != FS_CONTEXT_AWAITING_RECONF) + return 0; + + if (fc->fs_type->init_fs_context) + error = fc->fs_type->init_fs_context(fc); + else + error = legacy_init_fs_context(fc); + if (unlikely(error)) { + fc->phase = FS_CONTEXT_FAILED; + return error; + } + fc->need_free = true; + fc->phase = FS_CONTEXT_RECONF_PARAMS; + return 0; +} diff --git a/fs/fsopen.c b/fs/fsopen.c index 5fce6347de7a..65cc2f68f994 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include "internal.h" #include "mount.h" /* @@ -153,3 +155,266 @@ err_fc: put_fs_context(fc); return ret; } + +/* + * Check the state and apply the configuration. Note that this function is + * allowed to 'steal' the value by setting param->xxx to NULL before returning. 
+ */ +static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, + struct fs_parameter *param) +{ + struct super_block *sb; + int ret; + + ret = finish_clean_context(fc); + if (ret) + return ret; + switch (cmd) { + case FSCONFIG_CMD_CREATE: + if (fc->phase != FS_CONTEXT_CREATE_PARAMS) + return -EBUSY; + fc->phase = FS_CONTEXT_CREATING; + ret = vfs_get_tree(fc); + if (ret) + break; + sb = fc->root->d_sb; + ret = security_sb_kern_mount(sb); + if (unlikely(ret)) { + fc_drop_locked(fc); + break; + } + up_write(&sb->s_umount); + fc->phase = FS_CONTEXT_AWAITING_MOUNT; + return 0; + case FSCONFIG_CMD_RECONFIGURE: + if (fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + fc->phase = FS_CONTEXT_RECONFIGURING; + sb = fc->root->d_sb; + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + ret = -EPERM; + break; + } + down_write(&sb->s_umount); + ret = reconfigure_super(fc); + up_write(&sb->s_umount); + if (ret) + break; + vfs_clean_context(fc); + return 0; + default: + if (fc->phase != FS_CONTEXT_CREATE_PARAMS && + fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + + return vfs_parse_fs_param(fc, param); + } + fc->phase = FS_CONTEXT_FAILED; + return ret; +} + +/** + * sys_fsconfig - Set parameters and trigger actions on a context + * @fd: The filesystem context to act upon + * @cmd: The action to take + * @_key: Where appropriate, the parameter key to set + * @_value: Where appropriate, the parameter value to set + * @aux: Additional information for the value + * + * This system call is used to set parameters on a context, including + * superblock settings, data source and security labelling. + * + * Actions include triggering the creation of a superblock and the + * reconfiguration of the superblock attached to the specified context. + * + * When setting a parameter, @cmd indicates the type of value being proposed + * and @_key indicates the parameter to be altered. + * + * @_value and @aux are used to specify the value, should a value be required: + * + * (*) fsconfig_set_flag: No value is specified. The parameter must be boolean + * in nature. The key may be prefixed with "no" to invert the + * setting. @_value must be NULL and @aux must be 0. + * + * (*) fsconfig_set_string: A string value is specified. The parameter can be + * expecting boolean, integer, string or take a path. A conversion to an + * appropriate type will be attempted (which may include looking up as a + * path). @_value points to a NUL-terminated string and @aux must be 0. + * + * (*) fsconfig_set_binary: A binary blob is specified. @_value points to the + * blob and @aux indicates its size. The parameter must be expecting a + * blob. + * + * (*) fsconfig_set_path: A non-empty path is specified. The parameter must be + * expecting a path object. @_value points to a NUL-terminated string that + * is the path and @aux is a file descriptor at which to start a relative + * lookup or AT_FDCWD. + * + * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH + * implied. + * + * (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be + * NULL and @aux indicates the file descriptor. 
+ */ +SYSCALL_DEFINE5(fsconfig, + int, fd, + unsigned int, cmd, + const char __user *, _key, + const void __user *, _value, + int, aux) +{ + struct fs_context *fc; + struct fd f; + int ret; + + struct fs_parameter param = { + .type = fs_value_is_undefined, + }; + + if (fd < 0) + return -EINVAL; + + switch (cmd) { + case FSCONFIG_SET_FLAG: + if (!_key || _value || aux) + return -EINVAL; + break; + case FSCONFIG_SET_STRING: + if (!_key || !_value || aux) + return -EINVAL; + break; + case FSCONFIG_SET_BINARY: + if (!_key || !_value || aux <= 0 || aux > 1024 * 1024) + return -EINVAL; + break; + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + if (!_key || !_value || (aux != AT_FDCWD && aux < 0)) + return -EINVAL; + break; + case FSCONFIG_SET_FD: + if (!_key || _value || aux < 0) + return -EINVAL; + break; + case FSCONFIG_CMD_CREATE: + case FSCONFIG_CMD_RECONFIGURE: + if (_key || _value || aux) + return -EINVAL; + break; + default: + return -EOPNOTSUPP; + } + + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -EINVAL; + if (f.file->f_op != &fscontext_fops) + goto out_f; + + fc = f.file->private_data; + if (fc->ops == &legacy_fs_context_ops) { + switch (cmd) { + case FSCONFIG_SET_BINARY: + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + case FSCONFIG_SET_FD: + ret = -EOPNOTSUPP; + goto out_f; + } + } + + if (_key) { + param.key = strndup_user(_key, 256); + if (IS_ERR(param.key)) { + ret = PTR_ERR(param.key); + goto out_f; + } + } + + switch (cmd) { + case FSCONFIG_SET_FLAG: + param.type = fs_value_is_flag; + break; + case FSCONFIG_SET_STRING: + param.type = fs_value_is_string; + param.string = strndup_user(_value, 256); + if (IS_ERR(param.string)) { + ret = PTR_ERR(param.string); + goto out_key; + } + param.size = strlen(param.string); + break; + case FSCONFIG_SET_BINARY: + param.type = fs_value_is_blob; + param.size = aux; + param.blob = memdup_user_nul(_value, aux); + if (IS_ERR(param.blob)) { + ret = PTR_ERR(param.blob); + goto out_key; + } + break; + case FSCONFIG_SET_PATH: + param.type = fs_value_is_filename; + param.name = getname_flags(_value, 0, NULL); + if (IS_ERR(param.name)) { + ret = PTR_ERR(param.name); + goto out_key; + } + param.dirfd = aux; + param.size = strlen(param.name->name); + break; + case FSCONFIG_SET_PATH_EMPTY: + param.type = fs_value_is_filename_empty; + param.name = getname_flags(_value, LOOKUP_EMPTY, NULL); + if (IS_ERR(param.name)) { + ret = PTR_ERR(param.name); + goto out_key; + } + param.dirfd = aux; + param.size = strlen(param.name->name); + break; + case FSCONFIG_SET_FD: + param.type = fs_value_is_file; + ret = -EBADF; + param.file = fget(aux); + if (!param.file) + goto out_key; + break; + default: + break; + } + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret == 0) { + ret = vfs_fsconfig_locked(fc, cmd, ¶m); + mutex_unlock(&fc->uapi_mutex); + } + + /* Clean up the our record of any value that we obtained from + * userspace. Note that the value may have been stolen by the LSM or + * filesystem, in which case the value pointer will have been cleared. 
+ */ + switch (cmd) { + case FSCONFIG_SET_STRING: + case FSCONFIG_SET_BINARY: + kfree(param.string); + break; + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + if (param.name) + putname(param.name); + break; + case FSCONFIG_SET_FD: + if (param.file) + fput(param.file); + break; + default: + break; + } +out_key: + kfree(param.key); +out_f: + fdput(f); + return ret; +} diff --git a/fs/internal.h b/fs/internal.h index f3a027c44758..95cf7b0af21f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,8 +55,11 @@ extern void __init chrdev_init(void); /* * fs_context.c */ +extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void fc_drop_locked(struct fs_context *); +extern void vfs_clean_context(struct fs_context *fc); +extern int finish_clean_context(struct fs_context *fc); /* * namei.c diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0c9bd5427e8f..925f9dfc356b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -990,6 +990,8 @@ asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); +asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key, + const void __user *value, int aux); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 7570df43d08f..4b90ba9d1770 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -77,4 +77,18 @@ */ #define FSOPEN_CLOEXEC 0x00000001 +/* + * The type of fsconfig() call made. + */ +enum fsconfig_command { + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +}; + #endif /* _UAPI_LINUX_MOUNT_H */ -- cgit v1.2.3-59-g8ed1b From 93766fbd2696c2c4453dd8e1070977e9cd4e6b6d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 23:36:14 +0000 Subject: vfs: syscall: Add fsmount() to create a mount for a superblock Provide a system call by which a filesystem opened with fsopen() and configured by a series of fsconfig() calls can have a detached mount object created for it. This mount object can then be attached to the VFS mount hierarchy using move_mount() by passing the returned file descriptor as the from directory fd. The system call looks like: int mfd = fsmount(int fsfd, unsigned int flags, unsigned int attr_flags); where fsfd is the file descriptor returned by fsopen(). flags can be 0 or FSMOUNT_CLOEXEC. 
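To make the intended call sequence concrete before the attr_flags details
below, here is a minimal, illustrative userspace sketch. It is not part of
this patch: it assumes the x86_64 syscall numbers assigned in this series
(fsopen=337, fsconfig=338, fsmount=339) and hand-copies the flag and command
values from the uapi additions above, since no libc wrappers exist yet.

	/* Hedged sketch: create and mount an ext4 superblock via the new API. */
	#include <stdio.h>
	#include <unistd.h>

	#define FSOPEN_CLOEXEC		0x00000001	/* from uapi/linux/mount.h */
	#define FSMOUNT_CLOEXEC		0x00000001
	#define FSCONFIG_SET_STRING	1
	#define FSCONFIG_CMD_CREATE	6

	int main(void)
	{
		int fsfd, mfd;

		fsfd = syscall(337, "ext4", FSOPEN_CLOEXEC);	/* fsopen */
		if (fsfd < 0) { perror("fsopen"); return 1; }
		if (syscall(338, fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0) < 0 ||
		    syscall(338, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
			perror("fsconfig");	/* read(fsfd, ...) may yield details */
			return 1;
		}
		mfd = syscall(339, fsfd, FSMOUNT_CLOEXEC, 0);	/* attr_flags: see below */
		if (mfd < 0) { perror("fsmount"); return 1; }
		/* mfd can now be passed to move_mount() as the from-directory fd. */
		return 0;
	}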
attr_flags is a bitwise-OR of the following flags:

	MOUNT_ATTR_RDONLY	Mount read-only
	MOUNT_ATTR_NOSUID	Ignore suid and sgid bits
	MOUNT_ATTR_NODEV	Disallow access to device special files
	MOUNT_ATTR_NOEXEC	Disallow program execution
	MOUNT_ATTR__ATIME	Setting on how atime should be updated
	MOUNT_ATTR_RELATIME	- Update atime relative to mtime/ctime
	MOUNT_ATTR_NOATIME	- Do not update access times
	MOUNT_ATTR_STRICTATIME	- Always perform atime updates
	MOUNT_ATTR_NODIRATIME	Do not update directory access times

In the event that fsmount() fails, it may be possible to get an error message
by calling read() on fsfd. If no message is available, ENODATA will be
reported.

Signed-off-by: David Howells
cc: linux-api@vger.kernel.org
Signed-off-by: Al Viro
---
 arch/x86/entry/syscalls/syscall_32.tbl |   3 +-
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/namespace.c                         | 146 ++++++++++++++++++++++-
 include/linux/syscalls.h               |   1 +
 include/uapi/linux/mount.h             |  18 ++++
 5 files changed, 165 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 786728143205..5b5c9189c507 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -402,7 +402,8 @@
 388	i386	move_mount		sys_move_mount			__ia32_sys_move_mount
 389	i386	fsopen			sys_fsopen			__ia32_sys_fsopen
 390	i386	fsconfig		sys_fsconfig			__ia32_sys_fsconfig
-# don't use numbers 391 through 392, add new calls at the end
+391	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
+# don't use number 392, add new calls at the end
 393	i386	semget			sys_semget			__ia32_sys_semget
 394	i386	semctl			sys_semctl			__ia32_compat_sys_semctl
 395	i386	shmget			sys_shmget			__ia32_sys_shmget
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7039a809d37d..984ad594bb2b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -347,6 +347,7 @@
 336	common	move_mount		__x64_sys_move_mount
 337	common	fsopen			__x64_sys_fsopen
 338	common	fsconfig		__x64_sys_fsconfig
+339	common	fsmount			__x64_sys_fsmount
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	__x64_sys_pidfd_send_signal
diff --git a/fs/namespace.c b/fs/namespace.c
index 1e72d19fa4f8..3357c3d65475 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3334,9 +3334,149 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 }
 /*
- * Move a mount from one place to another.
- * In combination with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be
- * used to copy a mount subtree.
+ * Create a kernel mount representation for a new, prepared superblock
+ * (specified by fs_fd) and attach to an open_tree-like file descriptor.
+ */ +SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, + unsigned int, attr_flags) +{ + struct mnt_namespace *ns; + struct fs_context *fc; + struct file *file; + struct path newmount; + struct mount *mnt; + struct fd f; + unsigned int mnt_flags = 0; + long ret; + + if (!may_mount()) + return -EPERM; + + if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) + return -EINVAL; + + if (attr_flags & ~(MOUNT_ATTR_RDONLY | + MOUNT_ATTR_NOSUID | + MOUNT_ATTR_NODEV | + MOUNT_ATTR_NOEXEC | + MOUNT_ATTR__ATIME | + MOUNT_ATTR_NODIRATIME)) + return -EINVAL; + + if (attr_flags & MOUNT_ATTR_RDONLY) + mnt_flags |= MNT_READONLY; + if (attr_flags & MOUNT_ATTR_NOSUID) + mnt_flags |= MNT_NOSUID; + if (attr_flags & MOUNT_ATTR_NODEV) + mnt_flags |= MNT_NODEV; + if (attr_flags & MOUNT_ATTR_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (attr_flags & MOUNT_ATTR_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + switch (attr_flags & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_STRICTATIME: + break; + case MOUNT_ATTR_NOATIME: + mnt_flags |= MNT_NOATIME; + break; + case MOUNT_ATTR_RELATIME: + mnt_flags |= MNT_RELATIME; + break; + default: + return -EINVAL; + } + + f = fdget(fs_fd); + if (!f.file) + return -EBADF; + + ret = -EINVAL; + if (f.file->f_op != &fscontext_fops) + goto err_fsfd; + + fc = f.file->private_data; + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret < 0) + goto err_fsfd; + + /* There must be a valid superblock or we can't mount it */ + ret = -EINVAL; + if (!fc->root) + goto err_unlock; + + ret = -EPERM; + if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { + pr_warn("VFS: Mount too revealing\n"); + goto err_unlock; + } + + ret = -EBUSY; + if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) + goto err_unlock; + + ret = -EPERM; + if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) + goto err_unlock; + + newmount.mnt = vfs_create_mount(fc); + if (IS_ERR(newmount.mnt)) { + ret = PTR_ERR(newmount.mnt); + goto err_unlock; + } + newmount.dentry = dget(fc->root); + newmount.mnt->mnt_flags = mnt_flags; + + /* We've done the mount bit - now move the file context into more or + * less the same state as if we'd done an fspick(). We don't want to + * do any memory allocation or anything like that at this point as we + * don't want to have to handle any errors incurred. + */ + vfs_clean_context(fc); + + ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); + if (IS_ERR(ns)) { + ret = PTR_ERR(ns); + goto err_path; + } + mnt = real_mount(newmount.mnt); + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts = 1; + list_add(&mnt->mnt_list, &ns->list); + + /* Attach to an apparent O_PATH fd with a note that we need to unmount + * it, not just simply put it. + */ + file = dentry_open(&newmount, O_PATH, fc->cred); + if (IS_ERR(file)) { + dissolve_on_fput(newmount.mnt); + ret = PTR_ERR(file); + goto err_path; + } + file->f_mode |= FMODE_NEED_UNMOUNT; + + ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + +err_path: + path_put(&newmount); +err_unlock: + mutex_unlock(&fc->uapi_mutex); +err_fsfd: + fdput(f); + return ret; +} + +/* + * Move a mount from one place to another. In combination with + * fsopen()/fsmount() this is used to install a new mount and in combination + * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy + * a mount subtree. * * Note the flags value is a combination of MOVE_MOUNT_* flags. 
*/ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 925f9dfc356b..0e697f595278 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -992,6 +992,7 @@ asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key, const void __user *value, int aux); +asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 4b90ba9d1770..3888d3b91dc5 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -91,4 +91,22 @@ enum fsconfig_command { FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ }; +/* + * fsmount() flags. + */ +#define FSMOUNT_CLOEXEC 0x00000001 + +/* + * Mount attributes. + */ +#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ +#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */ +#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */ +#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */ +#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ +#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ +#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ +#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ +#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ + #endif /* _UAPI_LINUX_MOUNT_H */ -- cgit v1.2.3-59-g8ed1b From cf3cba4a429be43e5527a3f78859b1bfd9ebc5fb Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Nov 2018 23:36:23 +0000 Subject: vfs: syscall: Add fspick() to select a superblock for reconfiguration Provide an fspick() system call that can be used to pick an existing mountpoint into an fs_context which can thereafter be used to reconfigure a superblock (equivalent of the superblock side of -o remount). This looks like: int fd = fspick(AT_FDCWD, "/mnt", FSPICK_CLOEXEC | FSPICK_NO_AUTOMOUNT); fsconfig(fd, FSCONFIG_SET_FLAG, "intr", NULL, 0); fsconfig(fd, FSCONFIG_SET_FLAG, "noac", NULL, 0); fsconfig(fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0); At the point of fspick being called, the file descriptor referring to the filesystem context is in exactly the same state as the one that was created by fsopen() after fsmount() has been successfully called. 
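Both for fspick() and for the fsopen()-based creation path, a failed
fsconfig() command may leave an explanatory message that can be retrieved by
calling read() on the context fd, with ENODATA reported when no message is
queued (as described for fsmount() above). A small, hedged helper for
draining that log might look like the sketch below; the buffer size and the
one-message-per-read() assumption are illustrative, not mandated by this
series.

	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Print any queued messages from a filesystem context fd. */
	static void drain_fs_log(int fsfd)
	{
		char buf[4096];
		ssize_t n;

		while ((n = read(fsfd, buf, sizeof(buf) - 1)) > 0) {
			buf[n] = '\0';
			fprintf(stderr, "fscontext: %s\n", buf);
		}
		if (n < 0 && errno != ENODATA)
			perror("read fscontext");
	}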
Signed-off-by: David Howells cc: linux-api@vger.kernel.org Signed-off-by: Al Viro --- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/fsopen.c | 57 ++++++++++++++++++++++++++++++++++ include/linux/syscalls.h | 1 + include/uapi/linux/mount.h | 8 +++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 5b5c9189c507..4cd5f982b1e5 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -403,7 +403,7 @@ 389 i386 fsopen sys_fsopen __ia32_sys_fsopen 390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig 391 i386 fsmount sys_fsmount __ia32_sys_fsmount -# don't use number 392, add new calls at the end +392 i386 fspick sys_fspick __ia32_sys_fspick 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 984ad594bb2b..64ca0d06259a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -348,6 +348,7 @@ 337 common fsopen __x64_sys_fsopen 338 common fsconfig __x64_sys_fsconfig 339 common fsmount __x64_sys_fsmount +340 common fspick __x64_sys_fspick # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/fsopen.c b/fs/fsopen.c index 65cc2f68f994..3bb9c0c8cbcc 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -156,6 +156,63 @@ err_fc: return ret; } +/* + * Pick a superblock into a context for reconfiguration. + */ +SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags) +{ + struct fs_context *fc; + struct path target; + unsigned int lookup_flags; + int ret; + + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if ((flags & ~(FSPICK_CLOEXEC | + FSPICK_SYMLINK_NOFOLLOW | + FSPICK_NO_AUTOMOUNT | + FSPICK_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + if (flags & FSPICK_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & FSPICK_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & FSPICK_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + ret = user_path_at(dfd, path, lookup_flags, &target); + if (ret < 0) + goto err; + + ret = -EINVAL; + if (target.mnt->mnt_root != target.dentry) + goto err_path; + + fc = fs_context_for_reconfigure(target.dentry, 0, 0); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + goto err_path; + } + + fc->phase = FS_CONTEXT_RECONF_PARAMS; + + ret = fscontext_alloc_log(fc); + if (ret < 0) + goto err_fc; + + path_put(&target); + return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0); + +err_fc: + put_fs_context(fc); +err_path: + path_put(&target); +err: + return ret; +} + /* * Check the state and apply the configuration. Note that this function is * allowed to 'steal' the value by setting param->xxx to NULL before returning. 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0e697f595278..e2870fe1be5b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -993,6 +993,7 @@ asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key, const void __user *value, int aux); asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags); +asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags); asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 3888d3b91dc5..96a0240f23fe 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -77,6 +77,14 @@ */ #define FSOPEN_CLOEXEC 0x00000001 +/* + * fspick() flags. + */ +#define FSPICK_CLOEXEC 0x00000001 +#define FSPICK_SYMLINK_NOFOLLOW 0x00000002 +#define FSPICK_NO_AUTOMOUNT 0x00000004 +#define FSPICK_EMPTY_PATH 0x00000008 + /* * The type of fsconfig() call made. */ -- cgit v1.2.3-59-g8ed1b From 162f33dd45a7ed606ae07c1e3f22db4ac43a584c Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:28:39 +0300 Subject: Move EM_ARCOMPACT and EM_ARCV2 to uapi/linux/elf-em.h These should never have been defined in the arch tree to begin with, and now uapi/linux/audit.h header is going to use EM_ARCOMPACT and EM_ARCV2 in order to define AUDIT_ARCH_ARCOMPACT and AUDIT_ARCH_ARCV2 which are needed to implement syscall_get_arch() which in turn is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Vineet Gupta Acked-by: Paul Moore Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Alexey Brodkin Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-snps-arc@lists.infradead.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/arc/include/asm/elf.h | 6 +----- include/uapi/linux/elf-em.h | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/asm/elf.h b/arch/arc/include/asm/elf.h index aa2d6da9d187..2b80c184c9c8 100644 --- a/arch/arc/include/asm/elf.h +++ b/arch/arc/include/asm/elf.h @@ -10,13 +10,9 @@ #define __ASM_ARC_ELF_H #include +#include #include -/* These ELF defines belong to uapi but libc elf.h already defines them */ -#define EM_ARCOMPACT 93 - -#define EM_ARCV2 195 /* ARCv2 Cores */ - #define EM_ARC_INUSE (IS_ENABLED(CONFIG_ISA_ARCOMPACT) ? 
\ EM_ARCOMPACT : EM_ARCV2) diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index 0c3000faedba..081675ed89cb 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -34,6 +34,7 @@ #define EM_M32R 88 /* Renesas M32R */ #define EM_MN10300 89 /* Panasonic/MEI MN10300, AM33 */ #define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */ +#define EM_ARCOMPACT 93 /* ARCompact processor */ #define EM_XTENSA 94 /* Tensilica Xtensa Architecture */ #define EM_BLACKFIN 106 /* ADI Blackfin Processor */ #define EM_ALTERA_NIOS2 113 /* Altera Nios II soft-core processor */ @@ -42,6 +43,7 @@ #define EM_TILEPRO 188 /* Tilera TILEPro */ #define EM_MICROBLAZE 189 /* Xilinx MicroBlaze */ #define EM_TILEGX 191 /* Tilera TILE-Gx */ +#define EM_ARCV2 195 /* ARCv2 Cores */ #define EM_RISCV 243 /* RISC-V */ #define EM_BPF 247 /* Linux BPF - in-kernel virtual machine */ #define EM_CSKY 252 /* C-SKY */ -- cgit v1.2.3-59-g8ed1b From 67f2a8a29311841ba6ab9b0e2d1b8f1e9978cd84 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:28:47 +0300 Subject: arc: define syscall_get_arch() syscall_get_arch() is required to be implemented on all architectures in addition to already implemented syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Vineet Gupta Acked-by: Paul Moore Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Alexey Brodkin Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-snps-arc@lists.infradead.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/arc/include/asm/syscall.h | 11 +++++++++++ include/uapi/linux/audit.h | 4 ++++ 2 files changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h index 29de09804306..c7fc4c0c3bcb 100644 --- a/arch/arc/include/asm/syscall.h +++ b/arch/arc/include/asm/syscall.h @@ -9,6 +9,7 @@ #ifndef _ASM_ARC_SYSCALL_H #define _ASM_ARC_SYSCALL_H 1 +#include #include #include #include @@ -68,4 +69,14 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, } } +static inline int +syscall_get_arch(void) +{ + return IS_ENABLED(CONFIG_ISA_ARCOMPACT) + ? (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) + ? AUDIT_ARCH_ARCOMPACTBE : AUDIT_ARCH_ARCOMPACT) + : (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) + ? AUDIT_ARCH_ARCV2BE : AUDIT_ARCH_ARCV2); +} + #endif diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index f28acd952d03..1626727bb921 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -375,6 +375,10 @@ enum { #define AUDIT_ARCH_AARCH64 (EM_AARCH64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_ALPHA (EM_ALPHA|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_ARCOMPACT (EM_ARCOMPACT|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_ARCOMPACTBE (EM_ARCOMPACT) +#define AUDIT_ARCH_ARCV2 (EM_ARCV2|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_ARCV2BE (EM_ARCV2) #define AUDIT_ARCH_ARM (EM_ARM|__AUDIT_ARCH_LE) #define AUDIT_ARCH_ARMEB (EM_ARM) #define AUDIT_ARCH_CRIS (EM_CRIS|__AUDIT_ARCH_LE) -- cgit v1.2.3-59-g8ed1b From a43e66478ef7a2f8a7b2823b97cdae6605d34a02 Mon Sep 17 00:00:00 2001 From: "Dmitry V. 
Levin" Date: Mon, 18 Mar 2019 02:28:53 +0300 Subject: c6x: define syscall_get_arch() syscall_get_arch() is required to be implemented on all architectures in addition to already implemented syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Mark Salter Acked-by: Paul Moore Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Aurelien Jacquiot Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-c6x-dev@linux-c6x.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/c6x/include/asm/syscall.h | 7 +++++++ include/uapi/linux/audit.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/c6x/include/asm/syscall.h b/arch/c6x/include/asm/syscall.h index ae2be315ee9c..39dbd1ef994c 100644 --- a/arch/c6x/include/asm/syscall.h +++ b/arch/c6x/include/asm/syscall.h @@ -11,6 +11,7 @@ #ifndef __ASM_C6X_SYSCALL_H #define __ASM_C6X_SYSCALL_H +#include #include #include @@ -120,4 +121,10 @@ static inline void syscall_set_arguments(struct task_struct *task, } } +static inline int syscall_get_arch(void) +{ + return IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) + ? AUDIT_ARCH_C6XBE : AUDIT_ARCH_C6X; +} + #endif /* __ASM_C6X_SYSCALLS_H */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 1626727bb921..35ccf269ca3d 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -381,6 +381,8 @@ enum { #define AUDIT_ARCH_ARCV2BE (EM_ARCV2) #define AUDIT_ARCH_ARM (EM_ARM|__AUDIT_ARCH_LE) #define AUDIT_ARCH_ARMEB (EM_ARM) +#define AUDIT_ARCH_C6X (EM_TI_C6000|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_C6XBE (EM_TI_C6000) #define AUDIT_ARCH_CRIS (EM_CRIS|__AUDIT_ARCH_LE) #define AUDIT_ARCH_CSKY (EM_CSKY|__AUDIT_ARCH_LE) #define AUDIT_ARCH_FRV (EM_FRV) -- cgit v1.2.3-59-g8ed1b From 122a43b107420fec4c69d1bf99706cbb0da40ad9 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:29:01 +0300 Subject: h8300: define syscall_get_arch() syscall_get_arch() is required to be implemented on all architectures in addition to already implemented syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Paul Moore Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Yoshinori Sato Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. 
Levin Signed-off-by: Paul Moore --- arch/h8300/include/asm/syscall.h | 6 ++++++ include/uapi/linux/audit.h | 1 + 2 files changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/h8300/include/asm/syscall.h b/arch/h8300/include/asm/syscall.h index 924990401237..5135910616e2 100644 --- a/arch/h8300/include/asm/syscall.h +++ b/arch/h8300/include/asm/syscall.h @@ -8,6 +8,7 @@ #include #include #include +#include static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) @@ -47,6 +48,11 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, } } +static inline int +syscall_get_arch(void) +{ + return AUDIT_ARCH_H8300; +} /* Misc syscall related bits */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 35ccf269ca3d..09994181b2d5 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -386,6 +386,7 @@ enum { #define AUDIT_ARCH_CRIS (EM_CRIS|__AUDIT_ARCH_LE) #define AUDIT_ARCH_CSKY (EM_CSKY|__AUDIT_ARCH_LE) #define AUDIT_ARCH_FRV (EM_FRV) +#define AUDIT_ARCH_H8300 (EM_H8_300) #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) #define AUDIT_ARCH_IA64 (EM_IA_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_M32R (EM_M32R) -- cgit v1.2.3-59-g8ed1b From f4780e2db06df05c7349718baf1cd26767aa90f8 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:29:08 +0300 Subject: Move EM_HEXAGON to uapi/linux/elf-em.h This should never have been defined in the arch tree to begin with, and now uapi/linux/audit.h header is going to use EM_HEXAGON in order to define AUDIT_ARCH_HEXAGON which is needed to implement syscall_get_arch() which in turn is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Paul Moore Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: Richard Kuo Cc: linux-hexagon@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/hexagon/include/asm/elf.h | 6 +----- include/uapi/linux/elf-em.h | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/hexagon/include/asm/elf.h b/arch/hexagon/include/asm/elf.h index 80311e7b8ca6..d10fbd54ae51 100644 --- a/arch/hexagon/include/asm/elf.h +++ b/arch/hexagon/include/asm/elf.h @@ -23,11 +23,7 @@ #include #include - -/* - * This should really be in linux/elf-em.h. - */ -#define EM_HEXAGON 164 /* QUALCOMM Hexagon */ +#include struct elf32_hdr; diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index 081675ed89cb..bd02325028d8 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -39,6 +39,7 @@ #define EM_BLACKFIN 106 /* ADI Blackfin Processor */ #define EM_ALTERA_NIOS2 113 /* Altera Nios II soft-core processor */ #define EM_TI_C6000 140 /* TI C6X DSPs */ +#define EM_HEXAGON 164 /* QUALCOMM Hexagon */ #define EM_AARCH64 183 /* ARM 64 bit */ #define EM_TILEPRO 188 /* Tilera TILEPro */ #define EM_MICROBLAZE 189 /* Xilinx MicroBlaze */ -- cgit v1.2.3-59-g8ed1b From d093153431dc6e5982ec77aabe31fa38d2041ac0 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:29:32 +0300 Subject: hexagon: define syscall_get_arch() syscall_get_arch() is required to be implemented on all architectures in addition to already implemented syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. 
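One consumer that makes this requirement visible is seccomp: kernel/seccomp.c
(touched by the final patch in this series) fills the arch field of struct
seccomp_data from syscall_get_arch(), which is what lets userspace filters
pin a specific ABI. The sketch below is illustrative rather than part of this
patch, and hard-codes AUDIT_ARCH_X86_64 purely as an example; on hexagon the
same filter would compare against the AUDIT_ARCH_HEXAGON value added here.

	#include <linux/audit.h>
	#include <linux/filter.h>
	#include <linux/seccomp.h>
	#include <stddef.h>
	#include <sys/prctl.h>

	/* Kill the process if a syscall arrives via an unexpected ABI. */
	static struct sock_filter filter[] = {
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, arch)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};

	static struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	static int install_arch_check(void)
	{
		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
			return -1;
		return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	}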
Acked-by: Paul Moore Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Richard Kuo Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-hexagon@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/hexagon/include/asm/syscall.h | 8 ++++++++ include/uapi/linux/audit.h | 1 + 2 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index 4af9c7b6f13a..de3917aad3fd 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -21,6 +21,8 @@ #ifndef _ASM_HEXAGON_SYSCALL_H #define _ASM_HEXAGON_SYSCALL_H +#include + typedef long (*syscall_fn)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); @@ -43,4 +45,10 @@ static inline void syscall_get_arguments(struct task_struct *task, BUG_ON(i + n > 6); memcpy(args, &(®s->r00)[i], n * sizeof(args[0])); } + +static inline int syscall_get_arch(void) +{ + return AUDIT_ARCH_HEXAGON; +} + #endif diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 09994181b2d5..fc94e74c8b77 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -387,6 +387,7 @@ enum { #define AUDIT_ARCH_CSKY (EM_CSKY|__AUDIT_ARCH_LE) #define AUDIT_ARCH_FRV (EM_FRV) #define AUDIT_ARCH_H8300 (EM_H8_300) +#define AUDIT_ARCH_HEXAGON (EM_HEXAGON) #define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) #define AUDIT_ARCH_IA64 (EM_IA_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_M32R (EM_M32R) -- cgit v1.2.3-59-g8ed1b From 92f922f35078245e2acbafcf74fb1947dbbac398 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:29:38 +0300 Subject: m68k: define syscall_get_arch() syscall_get_arch() is required to be implemented on all architectures in addition to already implemented syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Reviewed-by: Geert Uytterhoeven Acked-by: Paul Moore Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-m68k@lists.linux-m68k.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/m68k/include/asm/syscall.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 arch/m68k/include/asm/syscall.h (limited to 'arch') diff --git a/arch/m68k/include/asm/syscall.h b/arch/m68k/include/asm/syscall.h new file mode 100644 index 000000000000..d4d7deda8d50 --- /dev/null +++ b/arch/m68k/include/asm/syscall.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_M68K_SYSCALL_H +#define _ASM_M68K_SYSCALL_H + +#include + +static inline int syscall_get_arch(void) +{ + return AUDIT_ARCH_M68K; +} + +#endif /* _ASM_M68K_SYSCALL_H */ -- cgit v1.2.3-59-g8ed1b From 530ff23a8e46814a1be4a516c707d3b0fb292186 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:29:48 +0300 Subject: Move EM_NDS32 to uapi/linux/elf-em.h This should never have been defined in the arch tree to begin with, and now uapi/linux/audit.h header is going to use EM_NDS32 in order to define AUDIT_ARCH_NDS32 which is needed to implement syscall_get_arch() which in turn is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. 
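For readers following the pattern across these patches: each AUDIT_ARCH_*
token is just the ELF machine number with optional word-size and endianness
flags OR'd in. A compressed restatement, using the flag values from
include/uapi/linux/audit.h (shown here for illustration only):

	#define __AUDIT_ARCH_64BIT	0x80000000
	#define __AUDIT_ARCH_LE		0x40000000

	/* e.g. the pair the next patch derives from EM_NDS32: */
	#define AUDIT_ARCH_NDS32	(EM_NDS32|__AUDIT_ARCH_LE)	/* little-endian */
	#define AUDIT_ARCH_NDS32BE	(EM_NDS32)			/* big-endian */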
Acked-by: Paul Moore Acked-by: Vincent Chen Acked-by: Greentime Hu Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/nds32/include/asm/elf.h | 3 +-- include/uapi/linux/elf-em.h | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/nds32/include/asm/elf.h b/arch/nds32/include/asm/elf.h index 95f3ea253e4c..02250626b9f0 100644 --- a/arch/nds32/include/asm/elf.h +++ b/arch/nds32/include/asm/elf.h @@ -10,14 +10,13 @@ #include #include +#include typedef unsigned long elf_greg_t; typedef unsigned long elf_freg_t[3]; extern unsigned int elf_hwcap; -#define EM_NDS32 167 - #define R_NDS32_NONE 0 #define R_NDS32_16_RELA 19 #define R_NDS32_32_RELA 20 diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index bd02325028d8..4b8df722330e 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -40,6 +40,8 @@ #define EM_ALTERA_NIOS2 113 /* Altera Nios II soft-core processor */ #define EM_TI_C6000 140 /* TI C6X DSPs */ #define EM_HEXAGON 164 /* QUALCOMM Hexagon */ +#define EM_NDS32 167 /* Andes Technology compact code size + embedded RISC processor family */ #define EM_AARCH64 183 /* ARM 64 bit */ #define EM_TILEPRO 188 /* Tilera TILEPro */ #define EM_MICROBLAZE 189 /* Xilinx MicroBlaze */ -- cgit v1.2.3-59-g8ed1b From fa562447e154334523daa44c0b60625d71a345f5 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:29:57 +0300 Subject: nds32: define syscall_get_arch() syscall_get_arch() is required to be implemented on all architectures in addition to already implemented syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Paul Moore Acked-by: Vincent Chen Acked-by: Greentime Hu Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/nds32/include/asm/syscall.h | 9 +++++++++ include/uapi/linux/audit.h | 2 ++ 2 files changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/nds32/include/asm/syscall.h b/arch/nds32/include/asm/syscall.h index f7e5e86765fe..cc56a3962f8b 100644 --- a/arch/nds32/include/asm/syscall.h +++ b/arch/nds32/include/asm/syscall.h @@ -5,6 +5,7 @@ #ifndef _ASM_NDS32_SYSCALL_H #define _ASM_NDS32_SYSCALL_H 1 +#include #include struct task_struct; struct pt_regs; @@ -185,4 +186,12 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, memcpy(®s->uregs[0] + i, args, n * sizeof(args[0])); } + +static inline int +syscall_get_arch(void) +{ + return IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) + ? 
AUDIT_ARCH_NDS32BE : AUDIT_ARCH_NDS32; +} + #endif /* _ASM_NDS32_SYSCALL_H */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index fc94e74c8b77..fb0529da4d49 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -401,6 +401,8 @@ enum { #define AUDIT_ARCH_MIPSEL64 (EM_MIPS|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_MIPSEL64N32 (EM_MIPS|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE|\ __AUDIT_ARCH_CONVENTION_MIPS64_N32) +#define AUDIT_ARCH_NDS32 (EM_NDS32|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_NDS32BE (EM_NDS32) #define AUDIT_ARCH_OPENRISC (EM_OPENRISC) #define AUDIT_ARCH_PARISC (EM_PARISC) #define AUDIT_ARCH_PARISC64 (EM_PARISC|__AUDIT_ARCH_64BIT) -- cgit v1.2.3-59-g8ed1b From 1660aac45e5b49a5ace29fb5b73254617533fcbd Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:30:02 +0300 Subject: nios2: define syscall_get_arch() syscall_get_arch() is required to be implemented on all architectures in addition to already implemented syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Paul Moore Acked-by: Ley Foon Tan Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Ley Foon Tan Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: nios2-dev@lists.rocketboards.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/nios2/include/asm/syscall.h | 6 ++++++ include/uapi/linux/audit.h | 1 + 2 files changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/nios2/include/asm/syscall.h b/arch/nios2/include/asm/syscall.h index 9de220854c4a..cf35e210fc4d 100644 --- a/arch/nios2/include/asm/syscall.h +++ b/arch/nios2/include/asm/syscall.h @@ -17,6 +17,7 @@ #ifndef __ASM_NIOS2_SYSCALL_H__ #define __ASM_NIOS2_SYSCALL_H__ +#include #include #include @@ -135,4 +136,9 @@ static inline void syscall_set_arguments(struct task_struct *task, } } +static inline int syscall_get_arch(void) +{ + return AUDIT_ARCH_NIOS2; +} + #endif diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index fb0529da4d49..bcc0619b046f 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -403,6 +403,7 @@ enum { __AUDIT_ARCH_CONVENTION_MIPS64_N32) #define AUDIT_ARCH_NDS32 (EM_NDS32|__AUDIT_ARCH_LE) #define AUDIT_ARCH_NDS32BE (EM_NDS32) +#define AUDIT_ARCH_NIOS2 (EM_ALTERA_NIOS2|__AUDIT_ARCH_LE) #define AUDIT_ARCH_OPENRISC (EM_OPENRISC) #define AUDIT_ARCH_PARISC (EM_PARISC) #define AUDIT_ARCH_PARISC64 (EM_PARISC|__AUDIT_ARCH_64BIT) -- cgit v1.2.3-59-g8ed1b From 03f7e6adfbd02d026817d2b0b21c8420fe58e0b3 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:30:06 +0300 Subject: Move EM_UNICORE to uapi/linux/elf-em.h This should never have been defined in the arch tree to begin with, and now uapi/linux/audit.h header is going to use EM_UNICORE in order to define AUDIT_ARCH_UNICORE which is needed to implement syscall_get_arch() which in turn is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Paul Moore Cc: Guan Xuetao Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. 
Levin Signed-off-by: Paul Moore --- arch/unicore32/include/asm/elf.h | 3 +-- include/uapi/linux/elf-em.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h index 829042d07722..ae66dc1be49e 100644 --- a/arch/unicore32/include/asm/elf.h +++ b/arch/unicore32/include/asm/elf.h @@ -19,6 +19,7 @@ * ELF register definitions.. */ #include +#include typedef unsigned long elf_greg_t; typedef unsigned long elf_freg_t[3]; @@ -28,8 +29,6 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct fp_state elf_fpregset_t; -#define EM_UNICORE 110 - #define R_UNICORE_NONE 0 #define R_UNICORE_PC24 1 #define R_UNICORE_ABS32 2 diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index 4b8df722330e..f47e853546fa 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -37,6 +37,7 @@ #define EM_ARCOMPACT 93 /* ARCompact processor */ #define EM_XTENSA 94 /* Tensilica Xtensa Architecture */ #define EM_BLACKFIN 106 /* ADI Blackfin Processor */ +#define EM_UNICORE 110 /* UniCore-32 */ #define EM_ALTERA_NIOS2 113 /* Altera Nios II soft-core processor */ #define EM_TI_C6000 140 /* TI C6X DSPs */ #define EM_HEXAGON 164 /* QUALCOMM Hexagon */ -- cgit v1.2.3-59-g8ed1b From b15fe94acece954feda32706e3ca7cc024999aee Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:30:11 +0300 Subject: unicore32: define syscall_get_arch() syscall_get_arch() is required to be implemented on all architectures in addition to already implemented syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions in order to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request. Acked-by: Paul Moore Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Guan Xuetao Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/unicore32/include/asm/syscall.h | 12 ++++++++++++ include/uapi/linux/audit.h | 1 + 2 files changed, 13 insertions(+) create mode 100644 arch/unicore32/include/asm/syscall.h (limited to 'arch') diff --git a/arch/unicore32/include/asm/syscall.h b/arch/unicore32/include/asm/syscall.h new file mode 100644 index 000000000000..3a6b885476b4 --- /dev/null +++ b/arch/unicore32/include/asm/syscall.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_UNICORE_SYSCALL_H +#define _ASM_UNICORE_SYSCALL_H + +#include + +static inline int syscall_get_arch(void) +{ + return AUDIT_ARCH_UNICORE; +} + +#endif /* _ASM_UNICORE_SYSCALL_H */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index bcc0619b046f..3901c51c0b93 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -424,6 +424,7 @@ enum { #define AUDIT_ARCH_TILEGX (EM_TILEGX|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_TILEGX32 (EM_TILEGX|__AUDIT_ARCH_LE) #define AUDIT_ARCH_TILEPRO (EM_TILEPRO|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_UNICORE (EM_UNICORE|__AUDIT_ARCH_LE) #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_XTENSA (EM_XTENSA) -- cgit v1.2.3-59-g8ed1b From 16add411645cff83360086e102daa67b25f1e39a Mon Sep 17 00:00:00 2001 From: "Dmitry V. 
Levin" Date: Mon, 18 Mar 2019 02:30:18 +0300 Subject: syscall_get_arch: add "struct task_struct *" argument This argument is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request: syscall_get_arch() is going to be called from ptrace_request() along with syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions with a tracee as their argument. The primary intent is that the triple (audit_arch, syscall_nr, arg1..arg6) should describe what system call is being called and what its arguments are. Reverts: 5e937a9ae913 ("syscall_get_arch: remove useless function arguments") Reverts: 1002d94d3076 ("syscall.h: fix doc text for syscall_get_arch()") Reviewed-by: Andy Lutomirski # for x86 Reviewed-by: Palmer Dabbelt Acked-by: Paul Moore Acked-by: Paul Burton # MIPS parts Acked-by: Michael Ellerman (powerpc) Acked-by: Kees Cook # seccomp parts Acked-by: Mark Salter # for the c6x bit Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: x86@kernel.org Cc: linux-alpha@vger.kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: linux-hexagon@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-mips@vger.kernel.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-riscv@lists.infradead.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- arch/alpha/include/asm/syscall.h | 2 +- arch/arc/include/asm/syscall.h | 2 +- arch/arm/include/asm/syscall.h | 2 +- arch/arm64/include/asm/syscall.h | 4 ++-- arch/c6x/include/asm/syscall.h | 2 +- arch/csky/include/asm/syscall.h | 2 +- arch/h8300/include/asm/syscall.h | 2 +- arch/hexagon/include/asm/syscall.h | 2 +- arch/ia64/include/asm/syscall.h | 2 +- arch/m68k/include/asm/syscall.h | 2 +- arch/microblaze/include/asm/syscall.h | 2 +- arch/mips/include/asm/syscall.h | 6 +++--- arch/mips/kernel/ptrace.c | 2 +- arch/nds32/include/asm/syscall.h | 2 +- arch/nios2/include/asm/syscall.h | 2 +- arch/openrisc/include/asm/syscall.h | 2 +- arch/parisc/include/asm/syscall.h | 4 ++-- arch/powerpc/include/asm/syscall.h | 10 ++++++++-- arch/riscv/include/asm/syscall.h | 2 +- arch/s390/include/asm/syscall.h | 4 ++-- arch/sh/include/asm/syscall_32.h | 2 +- arch/sh/include/asm/syscall_64.h | 2 +- arch/sparc/include/asm/syscall.h | 5 +++-- arch/unicore32/include/asm/syscall.h | 2 +- arch/x86/include/asm/syscall.h | 8 +++++--- arch/x86/um/asm/syscall.h | 2 +- arch/xtensa/include/asm/syscall.h | 2 +- include/asm-generic/syscall.h | 5 +++-- kernel/auditsc.c | 4 ++-- kernel/seccomp.c | 4 ++-- 30 files changed, 52 insertions(+), 42 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/asm/syscall.h b/arch/alpha/include/asm/syscall.h index d73a6fcb519c..11c688c1d7ec 100644 --- a/arch/alpha/include/asm/syscall.h +++ b/arch/alpha/include/asm/syscall.h @@ -4,7 +4,7 @@ #include -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_ALPHA; } diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h index c7fc4c0c3bcb..caf2697ef5b7 100644 --- 
a/arch/arc/include/asm/syscall.h +++ b/arch/arc/include/asm/syscall.h @@ -70,7 +70,7 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, } static inline int -syscall_get_arch(void) +syscall_get_arch(struct task_struct *task) { return IS_ENABLED(CONFIG_ISA_ARCOMPACT) ? (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index 06dea6bce293..3940ceac0bdc 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -104,7 +104,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->ARM_r0 + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { /* ARM tasks don't change audit architectures on the fly. */ return AUDIT_ARCH_ARM; diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index ad8be16a39c9..1870df03f774 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -117,9 +117,9 @@ static inline void syscall_set_arguments(struct task_struct *task, * We don't care about endianness (__AUDIT_ARCH_LE bit) here because * AArch64 has the same system calls both on little- and big- endian. */ -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { - if (is_compat_task()) + if (is_compat_thread(task_thread_info(task))) return AUDIT_ARCH_ARM; return AUDIT_ARCH_AARCH64; diff --git a/arch/c6x/include/asm/syscall.h b/arch/c6x/include/asm/syscall.h index 39dbd1ef994c..595057191c9c 100644 --- a/arch/c6x/include/asm/syscall.h +++ b/arch/c6x/include/asm/syscall.h @@ -121,7 +121,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) ? 
AUDIT_ARCH_C6XBE : AUDIT_ARCH_C6X; diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h index d637445737b7..150ffb894fa2 100644 --- a/arch/csky/include/asm/syscall.h +++ b/arch/csky/include/asm/syscall.h @@ -70,7 +70,7 @@ syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, } static inline int -syscall_get_arch(void) +syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_CSKY; } diff --git a/arch/h8300/include/asm/syscall.h b/arch/h8300/include/asm/syscall.h index 5135910616e2..d316c3d40d4e 100644 --- a/arch/h8300/include/asm/syscall.h +++ b/arch/h8300/include/asm/syscall.h @@ -49,7 +49,7 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, } static inline int -syscall_get_arch(void) +syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_H8300; } diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index de3917aad3fd..47b0bc3f16be 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -46,7 +46,7 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, &(®s->r00)[i], n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_HEXAGON; } diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h index 1d0b875fec44..47ab33f5448a 100644 --- a/arch/ia64/include/asm/syscall.h +++ b/arch/ia64/include/asm/syscall.h @@ -81,7 +81,7 @@ static inline void syscall_set_arguments(struct task_struct *task, ia64_syscall_get_set_arguments(task, regs, i, n, args, 1); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_IA64; } diff --git a/arch/m68k/include/asm/syscall.h b/arch/m68k/include/asm/syscall.h index d4d7deda8d50..465ac039be09 100644 --- a/arch/m68k/include/asm/syscall.h +++ b/arch/m68k/include/asm/syscall.h @@ -4,7 +4,7 @@ #include -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_M68K; } diff --git a/arch/microblaze/include/asm/syscall.h b/arch/microblaze/include/asm/syscall.h index 220decd605a4..77a86fafa974 100644 --- a/arch/microblaze/include/asm/syscall.h +++ b/arch/microblaze/include/asm/syscall.h @@ -101,7 +101,7 @@ static inline void syscall_set_arguments(struct task_struct *task, asmlinkage unsigned long do_syscall_trace_enter(struct pt_regs *regs); asmlinkage void do_syscall_trace_leave(struct pt_regs *regs); -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_MICROBLAZE; } diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 6cf8ffb5367e..6a22c9352ef6 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -140,14 +140,14 @@ extern const unsigned long sys_call_table[]; extern const unsigned long sys32_call_table[]; extern const unsigned long sysn32_call_table[]; -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_MIPS; #ifdef CONFIG_64BIT - if (!test_thread_flag(TIF_32BIT_REGS)) { + if (!test_tsk_thread_flag(task, TIF_32BIT_REGS)) { arch |= __AUDIT_ARCH_64BIT; /* N32 sets only TIF_32BIT_ADDR */ - if (test_thread_flag(TIF_32BIT_ADDR)) + if (test_tsk_thread_flag(task, TIF_32BIT_ADDR)) arch |= __AUDIT_ARCH_CONVENTION_MIPS64_N32; } #endif diff --git 
a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 0057c910bc2f..2ead6ff919b7 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -1418,7 +1418,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) unsigned long args[6]; sd.nr = syscall; - sd.arch = syscall_get_arch(); + sd.arch = syscall_get_arch(current); syscall_get_arguments(current, regs, 0, 6, args); for (i = 0; i < 6; i++) sd.args[i] = args[i]; diff --git a/arch/nds32/include/asm/syscall.h b/arch/nds32/include/asm/syscall.h index cc56a3962f8b..7501e376a6b1 100644 --- a/arch/nds32/include/asm/syscall.h +++ b/arch/nds32/include/asm/syscall.h @@ -188,7 +188,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, } static inline int -syscall_get_arch(void) +syscall_get_arch(struct task_struct *task) { return IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) ? AUDIT_ARCH_NDS32BE : AUDIT_ARCH_NDS32; diff --git a/arch/nios2/include/asm/syscall.h b/arch/nios2/include/asm/syscall.h index cf35e210fc4d..f0f6ae208e78 100644 --- a/arch/nios2/include/asm/syscall.h +++ b/arch/nios2/include/asm/syscall.h @@ -136,7 +136,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_NIOS2; } diff --git a/arch/openrisc/include/asm/syscall.h b/arch/openrisc/include/asm/syscall.h index 2db9f1cf0694..46b10c674bd2 100644 --- a/arch/openrisc/include/asm/syscall.h +++ b/arch/openrisc/include/asm/syscall.h @@ -72,7 +72,7 @@ syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, memcpy(&regs->gpr[3 + i], args, n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_OPENRISC; } diff --git a/arch/parisc/include/asm/syscall.h b/arch/parisc/include/asm/syscall.h index 8bff1a58c97f..c04ffc6ac928 100644 --- a/arch/parisc/include/asm/syscall.h +++ b/arch/parisc/include/asm/syscall.h @@ -62,11 +62,11 @@ static inline void syscall_rollback(struct task_struct *task, /* do nothing */ } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_PARISC; #ifdef CONFIG_64BIT - if (!is_compat_task()) + if (!__is_compat_task(task)) arch = AUDIT_ARCH_PARISC64; #endif return arch; diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 1a0e7a8b1c81..efb50429c9f4 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -99,9 +99,15 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->orig_gpr3 = args[0]; } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { - int arch = is_32bit_task() ?
AUDIT_ARCH_PPC : AUDIT_ARCH_PPC64; + int arch; + + if (IS_ENABLED(CONFIG_PPC64) && !test_tsk_thread_flag(task, TIF_32BIT)) + arch = AUDIT_ARCH_PPC64; + else + arch = AUDIT_ARCH_PPC; + #ifdef __LITTLE_ENDIAN__ arch |= __AUDIT_ARCH_LE; #endif diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index bba3da6ef157..ca120a36a037 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -100,7 +100,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(&regs->a1 + i * sizeof(regs->a1), args, n * sizeof(regs->a0)); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_64BIT return AUDIT_ARCH_RISCV64; diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 96f9a9151fde..5a40ea8b90ea 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -92,10 +92,10 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->orig_gpr2 = args[0]; } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(current, TIF_31BIT)) + if (test_tsk_thread_flag(task, TIF_31BIT)) return AUDIT_ARCH_S390; #endif return AUDIT_ARCH_S390X; diff --git a/arch/sh/include/asm/syscall_32.h b/arch/sh/include/asm/syscall_32.h index 6e118799831c..08de429eccd4 100644 --- a/arch/sh/include/asm/syscall_32.h +++ b/arch/sh/include/asm/syscall_32.h @@ -95,7 +95,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_SH; diff --git a/arch/sh/include/asm/syscall_64.h b/arch/sh/include/asm/syscall_64.h index 43882580c7f9..9b62a2404531 100644 --- a/arch/sh/include/asm/syscall_64.h +++ b/arch/sh/include/asm/syscall_64.h @@ -63,7 +63,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(&regs->regs[2 + i], args, n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_SH; diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 053989e3f6a6..9ffb367c17fd 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -128,10 +128,11 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->u_regs[UREG_I0 + i + j] = args[j]; } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { #if defined(CONFIG_SPARC64) && defined(CONFIG_COMPAT) - return in_compat_syscall() ? AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64; + return test_tsk_thread_flag(task, TIF_32BIT) + ?
AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64; #elif defined(CONFIG_SPARC64) return AUDIT_ARCH_SPARC64; #else diff --git a/arch/unicore32/include/asm/syscall.h b/arch/unicore32/include/asm/syscall.h index 3a6b885476b4..607961797fff 100644 --- a/arch/unicore32/include/asm/syscall.h +++ b/arch/unicore32/include/asm/syscall.h @@ -4,7 +4,7 @@ #include -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_UNICORE; } diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d653139857af..435f3f09279c 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -107,7 +107,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(&regs->bx + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; } @@ -236,10 +236,12 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ - return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + return (IS_ENABLED(CONFIG_IA32_EMULATION) && + task->thread_info.status & TS_COMPAT) + ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index ef898af102d1..56a2f0913e3c 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -9,7 +9,7 @@ typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_X86_32 return AUDIT_ARCH_I386; diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h index a168bf81c7f4..0681ca656809 100644 --- a/arch/xtensa/include/asm/syscall.h +++ b/arch/xtensa/include/asm/syscall.h @@ -14,7 +14,7 @@ #include #include -static inline int syscall_get_arch(void) +static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_XTENSA; } diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 0c938a4354f6..e0d060b43321 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -144,14 +144,15 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, /** * syscall_get_arch - return the AUDIT_ARCH for the current system call + * @task: task of interest, must be blocked * * Returns the AUDIT_ARCH_* based on the system call convention in use. * - * It's only valid to call this when current is stopped on entry to a system + * It's only valid to call this when @task is stopped on entry to a system * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %TIF_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this.
*/ -int syscall_get_arch(void); +int syscall_get_arch(struct task_struct *task); #endif /* _ASM_SYSCALL_H */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 17b0007fafc2..98a98e6dca05 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1636,7 +1636,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; } - context->arch = syscall_get_arch(); + context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -2590,7 +2590,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, + signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..36f36ab00f48 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,7 +148,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(); + sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, 0, 6, args); sd->args[0] = args[0]; sd->args[1] = args[1]; @@ -591,7 +591,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); info->si_errno = reason; - info->si_arch = syscall_get_arch(); + info->si_arch = syscall_get_arch(current); info->si_syscall = syscall; } -- cgit v1.2.3-59-g8ed1b From 8b56d3488d87555c77494b1fb90dbea84ff0e9fc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:51 -0700 Subject: crypto: x86/aesni - convert to use skcipher SIMD bulk registration Convert the AES-NI glue code to use simd_register_skciphers_compat() to create SIMD wrappers for all the internal skcipher algorithms at once, rather than wrapping each one individually. This simplifies the code. 
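For illustration, the registration pattern being converted to boils down to the sketch below. The "my_" names are hypothetical placeholders, not identifiers from this patch, and the alg array is assumed to be filled in elsewhere with internal "__"-prefixed algorithms.

#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>

/* Internal "__xxx" skcipher algorithms; entries assumed filled in elsewhere. */
static struct skcipher_alg my_skciphers[2];
/* One wrapper slot per internal algorithm, populated by the helper. */
static struct simd_skcipher_alg *my_simd_skciphers[ARRAY_SIZE(my_skciphers)];

static int __init my_mod_init(void)
{
	/* Registers the internal algs and creates a SIMD wrapper for each. */
	return simd_register_skciphers_compat(my_skciphers,
					      ARRAY_SIZE(my_skciphers),
					      my_simd_skciphers);
}

static void __exit my_mod_exit(void)
{
	/* Tears down both the wrappers and the internal algorithms. */
	simd_unregister_skciphers(my_skciphers, ARRAY_SIZE(my_skciphers),
				  my_simd_skciphers);
}

module_init(my_mod_init);
module_exit(my_mod_exit);
MODULE_LICENSE("GPL");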
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 43 +++++++------------------------------- 1 file changed, 7 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1e3d2102033a..451918cc5f73 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1253,23 +1253,9 @@ static const struct x86_cpu_id aesni_cpu_id[] = { }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); -static void aesni_free_simds(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) && - aesni_simd_skciphers[i]; i++) - simd_skcipher_free(aesni_simd_skciphers[i]); -} - static int __init aesni_init(void) { - struct simd_skcipher_alg *simd; - const char *basename; - const char *algname; - const char *drvname; int err; - int i; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; @@ -1304,8 +1290,9 @@ static int __init aesni_init(void) if (err) return err; - err = crypto_register_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + err = simd_register_skciphers_compat(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); if (err) goto unregister_algs; @@ -1314,26 +1301,11 @@ static int __init aesni_init(void) if (err) goto unregister_skciphers; - for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) { - algname = aesni_skciphers[i].base.cra_name + 2; - drvname = aesni_skciphers[i].base.cra_driver_name + 2; - basename = aesni_skciphers[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(algname, drvname, basename); - err = PTR_ERR(simd); - if (IS_ERR(simd)) - goto unregister_simds; - - aesni_simd_skciphers[i] = simd; - } - return 0; -unregister_simds: - aesni_free_simds(); - crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); unregister_skciphers: - crypto_unregister_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); unregister_algs: crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); return err; @@ -1341,10 +1313,9 @@ unregister_algs: static void __exit aesni_exit(void) { - aesni_free_simds(); crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); - crypto_unregister_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); } -- cgit v1.2.3-59-g8ed1b From 149e12252fb38937d3efa4419be1a87884ab1427 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:52 -0700 Subject: crypto: x86/aesni - convert to use AEAD SIMD helpers Convert the AES-NI implementations of "gcm(aes)" and "rfc4106(gcm(aes))" to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. 
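The SIMD helpers avoid the request-modification bug because the wrapper works on a private subrequest instead of retargeting the caller's aead_request. A paraphrased sketch of the idea follows; it is not the exact crypto/simd.c source, and my_simd_aead_ctx is a stand-in for the wrapper's private per-tfm context.

#include <crypto/cryptd.h>
#include <crypto/internal/aead.h>
#include <asm/simd.h>

/* Private wrapper context; mirrors what the helper keeps per tfm. */
struct my_simd_aead_ctx {
	struct cryptd_aead *cryptd_tfm;
};

static int my_simd_aead_encrypt(struct aead_request *req)
{
	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
	struct my_simd_aead_ctx *ctx = crypto_aead_ctx(tfm);
	/* The subrequest lives in the request context; the caller's
	 * request is copied, never modified in place. */
	struct aead_request *subreq = aead_request_ctx(req);
	struct crypto_aead *child;

	*subreq = *req;

	if (!may_use_simd() ||
	    (in_atomic() && cryptd_aead_queued(ctx->cryptd_tfm)))
		child = &ctx->cryptd_tfm->base;		/* defer to cryptd */
	else
		child = cryptd_aead_child(ctx->cryptd_tfm);	/* direct call */

	aead_request_set_tfm(subreq, child);
	return crypto_aead_encrypt(subreq);
}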
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 161 ++++--------------------------------- 1 file changed, 15 insertions(+), 146 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 451918cc5f73..1466d59384ac 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -643,29 +642,6 @@ static int xts_decrypt(struct skcipher_request *req) aes_ctx(ctx->raw_crypt_ctx)); } -static int rfc4106_init(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void rfc4106_exit(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { @@ -710,15 +686,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } -static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key, - unsigned int key_len) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(parent); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, key_len); -} - +/* This is the Integrity Check Value (aka the authentication tag) length and can + * be 8, 12 or 16 bytes long. */ static int common_rfc4106_set_authsize(struct crypto_aead *aead, unsigned int authsize) { @@ -734,17 +703,6 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, return 0; } -/* This is the Integrity Check Value (aka the authentication tag length and can - * be 8, 12 or 16 bytes long. 
*/ -static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(parent); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, unsigned int authsize) { @@ -964,38 +922,6 @@ static int helper_rfc4106_decrypt(struct aead_request *req) return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, aes_ctx); } - -static int gcmaes_wrapper_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(tfm); - struct cryptd_aead *cryptd_tfm = *ctx; - - tfm = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - tfm = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, tfm); - - return crypto_aead_encrypt(req); -} - -static int gcmaes_wrapper_decrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(tfm); - struct cryptd_aead *cryptd_tfm = *ctx; - - tfm = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - tfm = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, tfm); - - return crypto_aead_decrypt(req); -} #endif static struct crypto_alg aesni_algs[] = { { @@ -1148,31 +1074,7 @@ static int generic_gcmaes_decrypt(struct aead_request *req) aes_ctx); } -static int generic_gcmaes_init(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - - return 0; -} - -static void generic_gcmaes_exit(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg aesni_aead_algs[] = { { +static struct aead_alg aesni_aeads[] = { { .setkey = common_rfc4106_set_key, .setauthsize = common_rfc4106_set_authsize, .encrypt = helper_rfc4106_encrypt, @@ -1180,32 +1082,15 @@ static struct aead_alg aesni_aead_algs[] = { { .ivsize = GCM_RFC4106_IV_SIZE, .maxauthsize = 16, .base = { - .cra_name = "__gcm-aes-aesni", - .cra_driver_name = "__driver-gcm-aes-aesni", + .cra_name = "__rfc4106(gcm(aes))", + .cra_driver_name = "__rfc4106-gcm-aesni", + .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), .cra_alignmask = AESNI_ALIGN - 1, .cra_module = THIS_MODULE, }, -}, { - .init = rfc4106_init, - .exit = rfc4106_exit, - .setkey = gcmaes_wrapper_set_key, - .setauthsize = gcmaes_wrapper_set_authsize, - .encrypt = gcmaes_wrapper_encrypt, - .decrypt = gcmaes_wrapper_decrypt, - .ivsize = GCM_RFC4106_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "rfc4106(gcm(aes))", - .cra_driver_name = "rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_module = THIS_MODULE, - }, }, { .setkey = generic_gcmaes_set_key, .setauthsize = generic_gcmaes_set_authsize, @@ -1214,38 +1099,21 @@ static struct aead_alg aesni_aead_algs[] = { { .ivsize = GCM_AES_IV_SIZE, .maxauthsize = 16, .base = { - .cra_name = 
"__generic-gcm-aes-aesni", - .cra_driver_name = "__driver-generic-gcm-aes-aesni", - .cra_priority = 0, + .cra_name = "__gcm(aes)", + .cra_driver_name = "__generic-gcm-aesni", + .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), .cra_alignmask = AESNI_ALIGN - 1, .cra_module = THIS_MODULE, }, -}, { - .init = generic_gcmaes_init, - .exit = generic_gcmaes_exit, - .setkey = gcmaes_wrapper_set_key, - .setauthsize = gcmaes_wrapper_set_authsize, - .encrypt = gcmaes_wrapper_encrypt, - .decrypt = gcmaes_wrapper_decrypt, - .ivsize = GCM_AES_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "gcm(aes)", - .cra_driver_name = "generic-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_module = THIS_MODULE, - }, } }; #else -static struct aead_alg aesni_aead_algs[0]; +static struct aead_alg aesni_aeads[0]; #endif +static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; static const struct x86_cpu_id aesni_cpu_id[] = { X86_FEATURE_MATCH(X86_FEATURE_AES), @@ -1296,8 +1164,8 @@ static int __init aesni_init(void) if (err) goto unregister_algs; - err = crypto_register_aeads(aesni_aead_algs, - ARRAY_SIZE(aesni_aead_algs)); + err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); if (err) goto unregister_skciphers; @@ -1313,7 +1181,8 @@ unregister_algs: static void __exit aesni_exit(void) { - crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); + simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); -- cgit v1.2.3-59-g8ed1b From de272ca72c6152e26b9799d21eb511aac03b6e2d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:53 -0700 Subject: crypto: x86/aegis128 - convert to use AEAD SIMD helpers Convert the x86 implementation of AEGIS-128 to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-glue.c | 157 +++++++--------------------------- crypto/Kconfig | 2 +- 2 files changed, 31 insertions(+), 128 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 3ea71b871813..bdeee1b830be 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. 
*/ -#include #include +#include #include #include #include @@ -242,131 +242,35 @@ static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis128_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis128_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis128_aesni_alg[] = { - { - .setkey = crypto_aegis128_aesni_setkey, - .setauthsize = crypto_aegis128_aesni_setauthsize, - .encrypt = crypto_aegis128_aesni_encrypt, - .decrypt = crypto_aegis128_aesni_decrypt, - .init = crypto_aegis128_aesni_init_tfm, - .exit = crypto_aegis128_aesni_exit_tfm, - - .ivsize = AEGIS128_NONCE_SIZE, - .maxauthsize = AEGIS128_MAX_AUTH_SIZE, - .chunksize = AEGIS128_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis128", - .cra_driver_name = "__aegis128-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis128_aesni_setkey, - .setauthsize = cryptd_aegis128_aesni_setauthsize, - .encrypt = cryptd_aegis128_aesni_encrypt, - .decrypt = cryptd_aegis128_aesni_decrypt, - .init = cryptd_aegis128_aesni_init_tfm, - .exit = cryptd_aegis128_aesni_exit_tfm, - - .ivsize = AEGIS128_NONCE_SIZE, - .maxauthsize = AEGIS128_MAX_AUTH_SIZE, - .chunksize = AEGIS128_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - .cra_priority = 400, - - .cra_name = "aegis128", - .cra_driver_name = "aegis128-aesni", - - .cra_module = THIS_MODULE, - } +static struct aead_alg 
crypto_aegis128_aesni_alg = { + .setkey = crypto_aegis128_aesni_setkey, + .setauthsize = crypto_aegis128_aesni_setauthsize, + .encrypt = crypto_aegis128_aesni_encrypt, + .decrypt = crypto_aegis128_aesni_decrypt, + .init = crypto_aegis128_aesni_init_tfm, + .exit = crypto_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, + .chunksize = AEGIS128_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis128", + .cra_driver_name = "__aegis128-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis128_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis128_aesni_alg, - ARRAY_SIZE(crypto_aegis128_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis128_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis128_aesni_alg, - ARRAY_SIZE(crypto_aegis128_aesni_alg)); + simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis128_aesni_module_init); diff --git a/crypto/Kconfig b/crypto/Kconfig index bbab6bf33519..5b2c4cd7923f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -310,7 +310,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help AESNI+SSE2 implementation of the AEGSI-128 dedicated AEAD algorithm. -- cgit v1.2.3-59-g8ed1b From d628132a5e3d0183ac59a3ebbd3a9dff8b20ac7e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:54 -0700 Subject: crypto: x86/aegis128l - convert to use AEAD SIMD helpers Convert the x86 implementation of AEGIS-128L to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128l-aesni-glue.c | 157 +++++++-------------------------- crypto/Kconfig | 2 +- 2 files changed, 31 insertions(+), 128 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 1b1b39c66c5e..80d917f7e467 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. 
*/ -#include #include +#include #include #include #include @@ -242,131 +242,35 @@ static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis128l_aesni_alg[] = { - { - .setkey = crypto_aegis128l_aesni_setkey, - .setauthsize = crypto_aegis128l_aesni_setauthsize, - .encrypt = crypto_aegis128l_aesni_encrypt, - .decrypt = crypto_aegis128l_aesni_decrypt, - .init = crypto_aegis128l_aesni_init_tfm, - .exit = crypto_aegis128l_aesni_exit_tfm, - - .ivsize = AEGIS128L_NONCE_SIZE, - .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, - .chunksize = AEGIS128L_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis128l", - .cra_driver_name = "__aegis128l-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis128l_aesni_setkey, - .setauthsize = cryptd_aegis128l_aesni_setauthsize, - .encrypt = cryptd_aegis128l_aesni_encrypt, - .decrypt = cryptd_aegis128l_aesni_decrypt, - .init = cryptd_aegis128l_aesni_init_tfm, - .exit = cryptd_aegis128l_aesni_exit_tfm, - - .ivsize = AEGIS128L_NONCE_SIZE, - .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, - .chunksize = AEGIS128L_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - .cra_priority = 400, - - .cra_name = "aegis128l", - .cra_driver_name = "aegis128l-aesni", - - .cra_module = 
THIS_MODULE, - } +static struct aead_alg crypto_aegis128l_aesni_alg = { + .setkey = crypto_aegis128l_aesni_setkey, + .setauthsize = crypto_aegis128l_aesni_setauthsize, + .encrypt = crypto_aegis128l_aesni_encrypt, + .decrypt = crypto_aegis128l_aesni_decrypt, + .init = crypto_aegis128l_aesni_init_tfm, + .exit = crypto_aegis128l_aesni_exit_tfm, + + .ivsize = AEGIS128L_NONCE_SIZE, + .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, + .chunksize = AEGIS128L_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis128l", + .cra_driver_name = "__aegis128l-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis128l_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis128l_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis128l_aesni_alg, - ARRAY_SIZE(crypto_aegis128l_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis128l_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis128l_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis128l_aesni_alg, - ARRAY_SIZE(crypto_aegis128l_aesni_alg)); + simd_unregister_aeads(&crypto_aegis128l_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis128l_aesni_module_init); diff --git a/crypto/Kconfig b/crypto/Kconfig index 5b2c4cd7923f..ff05a87cf9e0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -318,7 +318,7 @@ config CRYPTO_AEGIS128L_AESNI_SSE2 tristate "AEGIS-128L AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help AESNI+SSE2 implementation of the AEGSI-128L dedicated AEAD algorithm. -- cgit v1.2.3-59-g8ed1b From b6708c2d8fbdeecfaf8380329d4e136f68deef05 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:55 -0700 Subject: crypto: x86/aegis256 - convert to use AEAD SIMD helpers Convert the x86 implementation of AEGIS-256 to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis256-aesni-glue.c | 157 +++++++--------------------------- crypto/Kconfig | 2 +- 2 files changed, 31 insertions(+), 128 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index 6227ca3220a0..716eecb66bd5 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. 
*/ -#include #include +#include #include #include #include @@ -242,131 +242,35 @@ static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis256_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis256_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis256_aesni_alg[] = { - { - .setkey = crypto_aegis256_aesni_setkey, - .setauthsize = crypto_aegis256_aesni_setauthsize, - .encrypt = crypto_aegis256_aesni_encrypt, - .decrypt = crypto_aegis256_aesni_decrypt, - .init = crypto_aegis256_aesni_init_tfm, - .exit = crypto_aegis256_aesni_exit_tfm, - - .ivsize = AEGIS256_NONCE_SIZE, - .maxauthsize = AEGIS256_MAX_AUTH_SIZE, - .chunksize = AEGIS256_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis256", - .cra_driver_name = "__aegis256-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis256_aesni_setkey, - .setauthsize = cryptd_aegis256_aesni_setauthsize, - .encrypt = cryptd_aegis256_aesni_encrypt, - .decrypt = cryptd_aegis256_aesni_decrypt, - .init = cryptd_aegis256_aesni_init_tfm, - .exit = cryptd_aegis256_aesni_exit_tfm, - - .ivsize = AEGIS256_NONCE_SIZE, - .maxauthsize = AEGIS256_MAX_AUTH_SIZE, - .chunksize = AEGIS256_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - .cra_priority = 400, - - .cra_name = "aegis256", - .cra_driver_name = "aegis256-aesni", - - .cra_module = THIS_MODULE, - } +static struct aead_alg 
crypto_aegis256_aesni_alg = { + .setkey = crypto_aegis256_aesni_setkey, + .setauthsize = crypto_aegis256_aesni_setauthsize, + .encrypt = crypto_aegis256_aesni_encrypt, + .decrypt = crypto_aegis256_aesni_decrypt, + .init = crypto_aegis256_aesni_init_tfm, + .exit = crypto_aegis256_aesni_exit_tfm, + + .ivsize = AEGIS256_NONCE_SIZE, + .maxauthsize = AEGIS256_MAX_AUTH_SIZE, + .chunksize = AEGIS256_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis256", + .cra_driver_name = "__aegis256-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis256_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis256_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis256_aesni_alg, - ARRAY_SIZE(crypto_aegis256_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis256_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis256_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis256_aesni_alg, - ARRAY_SIZE(crypto_aegis256_aesni_alg)); + simd_unregister_aeads(&crypto_aegis256_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis256_aesni_module_init); diff --git a/crypto/Kconfig b/crypto/Kconfig index ff05a87cf9e0..1b7238e05cf1 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -326,7 +326,7 @@ config CRYPTO_AEGIS256_AESNI_SSE2 tristate "AEGIS-256 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help AESNI+SSE2 implementation of the AEGSI-256 dedicated AEAD algorithm. -- cgit v1.2.3-59-g8ed1b From 477309580dcc5791a76302c53e50d0b8693b13bc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:56 -0700 Subject: crypto: x86/morus640 - convert to use AEAD SIMD helpers Convert the x86 implementation of MORUS-640 to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. 
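As a usage note, the compat helper derives the user-visible names by stripping the "__" prefix from the internal algorithm names, so a single algorithm could equally be wrapped by hand, roughly as in this hedged sketch (hypothetical function name, error handling reduced to the minimum):

#include <crypto/internal/simd.h>
#include <linux/err.h>

static struct simd_aead_alg *simd_alg;

static int my_register_one(void)
{
	/* "morus640" / "morus640-sse2" are the names userspace sees;
	 * "__morus640-sse2" is the internal, SIMD-only implementation. */
	simd_alg = simd_aead_create_compat("morus640", "morus640-sse2",
					   "__morus640-sse2");
	return PTR_ERR_OR_ZERO(simd_alg);
}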
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/morus640-sse2-glue.c | 12 ++--- arch/x86/crypto/morus640_glue.c | 85 ------------------------------------ crypto/Kconfig | 2 +- include/crypto/morus640_glue.h | 79 ++++++++++----------------------- 4 files changed, 30 insertions(+), 148 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 9afaf8f8565a..32da56b3bdad 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -35,7 +36,9 @@ asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); +MORUS640_DECLARE_ALG(sse2, "morus640-sse2", 400); + +static struct simd_aead_alg *simd_alg; static int __init crypto_morus640_sse2_module_init(void) { @@ -43,14 +46,13 @@ static int __init crypto_morus640_sse2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus640_sse2_algs, - ARRAY_SIZE(crypto_morus640_sse2_algs)); + return simd_register_aeads_compat(&crypto_morus640_sse2_alg, 1, + &simd_alg); } static void __exit crypto_morus640_sse2_module_exit(void) { - crypto_unregister_aeads(crypto_morus640_sse2_algs, - ARRAY_SIZE(crypto_morus640_sse2_algs)); + simd_unregister_aeads(&crypto_morus640_sse2_alg, 1, &simd_alg); } module_init(crypto_morus640_sse2_module_init); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c index cb3a81732016..1dea33d84426 100644 --- a/arch/x86/crypto/morus640_glue.c +++ b/arch/x86/crypto/morus640_glue.c @@ -11,7 +11,6 @@ * any later version. 
*/ -#include #include #include #include @@ -200,90 +199,6 @@ void crypto_morus640_glue_init_ops(struct crypto_aead *aead, } EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); -int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey); - -int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize); - -int cryptd_morus640_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt); - -int cryptd_morus640_glue_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt); - -int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - const char *name = crypto_aead_alg(aead)->base.cra_driver_name; - char internal_name[CRYPTO_MAX_ALG_NAME]; - - if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) - >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm); - -void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek "); MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 1b7238e05cf1..498ec4d98ce1 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -340,7 +340,7 @@ config CRYPTO_MORUS640_GLUE tristate depends on X86 select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help Common glue for SIMD optimizations of the MORUS-640 dedicated AEAD algorithm. 
diff --git a/include/crypto/morus640_glue.h b/include/crypto/morus640_glue.h index df8e1103ff94..0ee6266cb26c 100644 --- a/include/crypto/morus640_glue.h +++ b/include/crypto/morus640_glue.h @@ -47,16 +47,7 @@ int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, int crypto_morus640_glue_encrypt(struct aead_request *req); int crypto_morus640_glue_decrypt(struct aead_request *req); -int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen); -int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize); -int cryptd_morus640_glue_encrypt(struct aead_request *req); -int cryptd_morus640_glue_decrypt(struct aead_request *req); -int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead); -void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead); - -#define MORUS640_DECLARE_ALGS(id, driver_name, priority) \ +#define MORUS640_DECLARE_ALG(id, driver_name, priority) \ static const struct morus640_glue_ops crypto_morus640_##id##_ops = {\ .init = crypto_morus640_##id##_init, \ .ad = crypto_morus640_##id##_ad, \ @@ -77,55 +68,29 @@ void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead); { \ } \ \ - static struct aead_alg crypto_morus640_##id##_algs[] = {\ - { \ - .setkey = crypto_morus640_glue_setkey, \ - .setauthsize = crypto_morus640_glue_setauthsize, \ - .encrypt = crypto_morus640_glue_encrypt, \ - .decrypt = crypto_morus640_glue_decrypt, \ - .init = crypto_morus640_##id##_init_tfm, \ - .exit = crypto_morus640_##id##_exit_tfm, \ - \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS640_BLOCK_SIZE, \ - \ - .base = { \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct morus640_ctx), \ - .cra_alignmask = 0, \ - \ - .cra_name = "__morus640", \ - .cra_driver_name = "__"driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ - }, { \ - .setkey = cryptd_morus640_glue_setkey, \ - .setauthsize = cryptd_morus640_glue_setauthsize, \ - .encrypt = cryptd_morus640_glue_encrypt, \ - .decrypt = cryptd_morus640_glue_decrypt, \ - .init = cryptd_morus640_glue_init_tfm, \ - .exit = cryptd_morus640_glue_exit_tfm, \ + static struct aead_alg crypto_morus640_##id##_alg = {\ + .setkey = crypto_morus640_glue_setkey, \ + .setauthsize = crypto_morus640_glue_setauthsize, \ + .encrypt = crypto_morus640_glue_encrypt, \ + .decrypt = crypto_morus640_glue_decrypt, \ + .init = crypto_morus640_##id##_init_tfm, \ + .exit = crypto_morus640_##id##_exit_tfm, \ + \ + .ivsize = MORUS_NONCE_SIZE, \ + .maxauthsize = MORUS_MAX_AUTH_SIZE, \ + .chunksize = MORUS640_BLOCK_SIZE, \ + \ + .base = { \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = sizeof(struct morus640_ctx), \ + .cra_alignmask = 0, \ + .cra_priority = priority, \ \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS640_BLOCK_SIZE, \ + .cra_name = "__morus640", \ + .cra_driver_name = "__"driver_name, \ \ - .base = { \ - .cra_flags = CRYPTO_ALG_ASYNC, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct crypto_aead *), \ - .cra_alignmask = 0, \ - \ - .cra_priority = priority, \ - \ - .cra_name = "morus640", \ - .cra_driver_name = driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ + .cra_module = THIS_MODULE, \ } \ } -- cgit v1.2.3-59-g8ed1b From e151a8d28c2c99c8a9a9bfbe2bd612e692c33efd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Mar 2019 12:00:57 -0700 Subject: crypto: x86/morus1280 - convert to use AEAD SIMD helpers Convert 
the x86 implementations of MORUS-1280 to use the AEAD SIMD helpers, rather than hand-rolling the same functionality. This simplifies the code and also fixes the bug where the user-provided aead_request is modified. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/morus1280-avx2-glue.c | 12 ++--- arch/x86/crypto/morus1280-sse2-glue.c | 12 ++--- arch/x86/crypto/morus1280_glue.c | 85 ----------------------------------- crypto/Kconfig | 2 +- include/crypto/morus1280_glue.h | 79 +++++++++----------------------- 5 files changed, 37 insertions(+), 153 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c index 6634907d6ccd..679627a2a824 100644 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -35,7 +36,9 @@ asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); +MORUS1280_DECLARE_ALG(avx2, "morus1280-avx2", 400); + +static struct simd_aead_alg *simd_alg; static int __init crypto_morus1280_avx2_module_init(void) { @@ -44,14 +47,13 @@ static int __init crypto_morus1280_avx2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus1280_avx2_algs, - ARRAY_SIZE(crypto_morus1280_avx2_algs)); + return simd_register_aeads_compat(&crypto_morus1280_avx2_alg, 1, + &simd_alg); } static void __exit crypto_morus1280_avx2_module_exit(void) { - crypto_unregister_aeads(crypto_morus1280_avx2_algs, - ARRAY_SIZE(crypto_morus1280_avx2_algs)); + simd_unregister_aeads(&crypto_morus1280_avx2_alg, 1, &simd_alg); } module_init(crypto_morus1280_avx2_module_init); diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index f40244eaf14d..c35c0638d0bb 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -35,7 +36,9 @@ asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); +MORUS1280_DECLARE_ALG(sse2, "morus1280-sse2", 350); + +static struct simd_aead_alg *simd_alg; static int __init crypto_morus1280_sse2_module_init(void) { @@ -43,14 +46,13 @@ static int __init crypto_morus1280_sse2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus1280_sse2_algs, - ARRAY_SIZE(crypto_morus1280_sse2_algs)); + return simd_register_aeads_compat(&crypto_morus1280_sse2_alg, 1, + &simd_alg); } static void __exit crypto_morus1280_sse2_module_exit(void) { - crypto_unregister_aeads(crypto_morus1280_sse2_algs, - ARRAY_SIZE(crypto_morus1280_sse2_algs)); + simd_unregister_aeads(&crypto_morus1280_sse2_alg, 1, &simd_alg); } module_init(crypto_morus1280_sse2_module_init); diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c index 7e600f8bcdad..30fc1bd98ec3 100644 --- a/arch/x86/crypto/morus1280_glue.c +++ b/arch/x86/crypto/morus1280_glue.c @@ -11,7 +11,6 @@ * any later version. 
*/ -#include #include #include #include @@ -205,90 +204,6 @@ void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, } EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); -int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey); - -int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize); - -int cryptd_morus1280_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt); - -int cryptd_morus1280_glue_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt); - -int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - const char *name = crypto_aead_alg(aead)->base.cra_driver_name; - char internal_name[CRYPTO_MAX_ALG_NAME]; - - if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) - >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm); - -void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek "); MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/crypto/Kconfig b/crypto/Kconfig index 498ec4d98ce1..6ad6d11c990b 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -363,7 +363,7 @@ config CRYPTO_MORUS1280_GLUE tristate depends on X86 select CRYPTO_AEAD - select CRYPTO_CRYPTD + select CRYPTO_SIMD help Common glue for SIMD optimizations of the MORUS-1280 dedicated AEAD algorithm. 
diff --git a/include/crypto/morus1280_glue.h b/include/crypto/morus1280_glue.h index ad2aa743dd99..5cefddb1991f 100644 --- a/include/crypto/morus1280_glue.h +++ b/include/crypto/morus1280_glue.h @@ -47,16 +47,7 @@ int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, int crypto_morus1280_glue_encrypt(struct aead_request *req); int crypto_morus1280_glue_decrypt(struct aead_request *req); -int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen); -int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize); -int cryptd_morus1280_glue_encrypt(struct aead_request *req); -int cryptd_morus1280_glue_decrypt(struct aead_request *req); -int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead); -void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead); - -#define MORUS1280_DECLARE_ALGS(id, driver_name, priority) \ +#define MORUS1280_DECLARE_ALG(id, driver_name, priority) \ static const struct morus1280_glue_ops crypto_morus1280_##id##_ops = {\ .init = crypto_morus1280_##id##_init, \ .ad = crypto_morus1280_##id##_ad, \ @@ -77,55 +68,29 @@ void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead); { \ } \ \ - static struct aead_alg crypto_morus1280_##id##_algs[] = {\ - { \ - .setkey = crypto_morus1280_glue_setkey, \ - .setauthsize = crypto_morus1280_glue_setauthsize, \ - .encrypt = crypto_morus1280_glue_encrypt, \ - .decrypt = crypto_morus1280_glue_decrypt, \ - .init = crypto_morus1280_##id##_init_tfm, \ - .exit = crypto_morus1280_##id##_exit_tfm, \ - \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS1280_BLOCK_SIZE, \ - \ - .base = { \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct morus1280_ctx), \ - .cra_alignmask = 0, \ - \ - .cra_name = "__morus1280", \ - .cra_driver_name = "__"driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ - }, { \ - .setkey = cryptd_morus1280_glue_setkey, \ - .setauthsize = cryptd_morus1280_glue_setauthsize, \ - .encrypt = cryptd_morus1280_glue_encrypt, \ - .decrypt = cryptd_morus1280_glue_decrypt, \ - .init = cryptd_morus1280_glue_init_tfm, \ - .exit = cryptd_morus1280_glue_exit_tfm, \ + static struct aead_alg crypto_morus1280_##id##_alg = { \ + .setkey = crypto_morus1280_glue_setkey, \ + .setauthsize = crypto_morus1280_glue_setauthsize, \ + .encrypt = crypto_morus1280_glue_encrypt, \ + .decrypt = crypto_morus1280_glue_decrypt, \ + .init = crypto_morus1280_##id##_init_tfm, \ + .exit = crypto_morus1280_##id##_exit_tfm, \ + \ + .ivsize = MORUS_NONCE_SIZE, \ + .maxauthsize = MORUS_MAX_AUTH_SIZE, \ + .chunksize = MORUS1280_BLOCK_SIZE, \ + \ + .base = { \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = sizeof(struct morus1280_ctx), \ + .cra_alignmask = 0, \ + .cra_priority = priority, \ \ - .ivsize = MORUS_NONCE_SIZE, \ - .maxauthsize = MORUS_MAX_AUTH_SIZE, \ - .chunksize = MORUS1280_BLOCK_SIZE, \ + .cra_name = "__morus1280", \ + .cra_driver_name = "__"driver_name, \ \ - .base = { \ - .cra_flags = CRYPTO_ALG_ASYNC, \ - .cra_blocksize = 1, \ - .cra_ctxsize = sizeof(struct crypto_aead *), \ - .cra_alignmask = 0, \ - \ - .cra_priority = priority, \ - \ - .cra_name = "morus1280", \ - .cra_driver_name = driver_name, \ - \ - .cra_module = THIS_MODULE, \ - } \ + .cra_module = THIS_MODULE, \ } \ } -- cgit v1.2.3-59-g8ed1b From 580e295178402d14bbf598a5702f8e01fc59dbaa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 Mar 2019 22:12:46 -0700 Subject: crypto: 
arm64/gcm-aes-ce - fix no-NEON fallback code The arm64 gcm-aes-ce algorithm is failing the extra crypto self-tests following my patches to test the !may_use_simd() code paths, which previously were untested. The problem is that in the !may_use_simd() case, an odd number of AES blocks can be processed within each step of the skcipher_walk. However, the skcipher_walk is being done with a "stride" of 2 blocks and is advanced by an even number of blocks after each step. This causes the encryption to produce the wrong ciphertext and authentication tag, and causes the decryption to incorrectly fail. Fix it by only processing an even number of blocks per step. Fixes: c2b24c36e0a3 ("crypto: arm64/aes-gcm-ce - fix scatterwalk API violation") Fixes: 71e52c278c54 ("crypto: arm64/aes-ce-gcm - operate on two input blocks at a time") Cc: # v4.19+ Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-glue.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 791ad422c427..089b09286da7 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -473,9 +473,11 @@ static int gcm_encrypt(struct aead_request *req) put_unaligned_be32(2, iv + GCM_IV_SIZE); while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + const int blocks = + walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; u8 *dst = walk.dst.virt.addr; u8 *src = walk.src.virt.addr; + int remaining = blocks; do { __aes_arm64_encrypt(ctx->aes_key.key_enc, @@ -485,9 +487,9 @@ static int gcm_encrypt(struct aead_request *req) dst += AES_BLOCK_SIZE; src += AES_BLOCK_SIZE; - } while (--blocks > 0); + } while (--remaining > 0); - ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg, + ghash_do_update(blocks, dg, walk.dst.virt.addr, &ctx->ghash_key, NULL, pmull_ghash_update_p64); @@ -609,7 +611,7 @@ static int gcm_decrypt(struct aead_request *req) put_unaligned_be32(2, iv + GCM_IV_SIZE); while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; u8 *dst = walk.dst.virt.addr; u8 *src = walk.src.virt.addr; -- cgit v1.2.3-59-g8ed1b From f2abe0d72b21671ad19db075262411e3d4a828dd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 Mar 2019 22:12:48 -0700 Subject: crypto: x86 - convert to use crypto_simd_usable() Replace all calls to irq_fpu_usable() in the x86 crypto code with crypto_simd_usable(), in order to allow testing the no-SIMD code paths. 
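[ Note: crypto_simd_usable() is declared in crypto/internal/simd.h. As a
  rough sketch (modulo config guards), it is may_use_simd() -- which on
  x86 is irq_fpu_usable() -- plus a per-CPU override that the extra
  self-tests can set to force the no-SIMD paths:

	static __must_check inline bool crypto_simd_usable(void)
	{
		return may_use_simd() &&
		       !this_cpu_read(crypto_simd_disabled_for_test);
	}

  Without CONFIG_CRYPTO_MANAGER_EXTRA_TESTS the override does not exist
  and the helper collapses to plain may_use_simd(). ]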
Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 8 ++++---- arch/x86/crypto/chacha_glue.c | 6 +++--- arch/x86/crypto/crc32-pclmul_glue.c | 5 +++-- arch/x86/crypto/crc32c-intel_glue.c | 7 ++++--- arch/x86/crypto/crct10dif-pclmul_glue.c | 7 ++++--- arch/x86/crypto/ghash-clmulni-intel_glue.c | 9 +++++---- arch/x86/crypto/nhpoly1305-avx2-glue.c | 5 +++-- arch/x86/crypto/nhpoly1305-sse2-glue.c | 5 +++-- arch/x86/crypto/poly1305_glue.c | 4 ++-- arch/x86/crypto/sha1_ssse3_glue.c | 7 ++++--- arch/x86/crypto/sha256_ssse3_glue.c | 7 ++++--- arch/x86/crypto/sha512_ssse3_glue.c | 10 +++++----- 12 files changed, 44 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1466d59384ac..21c246799aa5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -30,8 +30,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -332,7 +332,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, return -EINVAL; } - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) err = crypto_aes_expand_key(ctx, in_key, key_len); else { kernel_fpu_begin(); @@ -353,7 +353,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) crypto_aes_encrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -366,7 +366,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) crypto_aes_decrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 45c1c4143176..4967ad620775 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -12,10 +12,10 @@ #include #include +#include #include #include #include -#include #include #define CHACHA_STATE_ALIGN 16 @@ -170,7 +170,7 @@ static int chacha_simd(struct skcipher_request *req) struct skcipher_walk walk; int err; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_chacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); @@ -193,7 +193,7 @@ static int xchacha_simd(struct skcipher_request *req) u8 real_iv[16]; int err; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_xchacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index c8d9cdacbf10..cb4ab6645106 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -32,10 +32,11 @@ #include #include #include +#include #include #include -#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -54,7 +55,7 @@ static u32 __attribute__((pure)) unsigned int iremainder; unsigned int prealign; - if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable()) + if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable()) return crc32_le(crc, p, len); if ((long)p & SCALE_F_MASK) { diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 5773e1161072..a58fe217c856 100644 --- 
a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -29,10 +29,11 @@ #include #include #include +#include #include #include -#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -177,7 +178,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -189,7 +190,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 0e785c0b2354..64f17d2d8b67 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -26,12 +26,13 @@ #include #include #include +#include #include #include #include -#include #include #include +#include asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); @@ -53,7 +54,7 @@ static int chksum_update(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - if (length >= 16 && irq_fpu_usable()) { + if (length >= 16 && crypto_simd_usable()) { kernel_fpu_begin(); ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); kernel_fpu_end(); @@ -73,7 +74,7 @@ static int chksum_final(struct shash_desc *desc, u8 *out) static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= 16 && irq_fpu_usable()) { + if (len >= 16 && crypto_simd_usable()) { kernel_fpu_begin(); *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); kernel_fpu_end(); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 3582ae885ee1..4099a0ae17dd 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,8 +19,9 @@ #include #include #include -#include +#include #include +#include #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -182,7 +183,7 @@ static int ghash_async_update(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -200,7 +201,7 @@ static int ghash_async_final(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -241,7 +242,7 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { 
memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index 20d815ea4b6a..f7567cbd35b6 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -7,9 +7,10 @@ */ #include +#include #include #include -#include +#include asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, u8 hash[NH_HASH_BYTES]); @@ -24,7 +25,7 @@ static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_avx2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !irq_fpu_usable()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index ed68d164ce14..a661ede3b5cf 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -7,9 +7,10 @@ */ #include +#include #include #include -#include +#include asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, u8 hash[NH_HASH_BYTES]); @@ -24,7 +25,7 @@ static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_sse2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !irq_fpu_usable()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 88cc01506c84..6eb65b237b3c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -11,11 +11,11 @@ #include #include +#include #include #include #include #include -#include #include struct poly1305_simd_desc_ctx { @@ -126,7 +126,7 @@ static int poly1305_simd_update(struct shash_desc *desc, unsigned int bytes; /* kernel_fpu_begin/end is costly, use fallback for small updates */ - if (srclen <= 288 || !may_use_simd()) + if (srclen <= 288 || !crypto_simd_usable()) return crypto_poly1305_update(desc, src, srclen); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 7391c7de72c7..42f177afc33a 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -22,6 +22,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -29,7 +30,7 @@ #include #include #include -#include +#include typedef void (sha1_transform_fn)(u32 *digest, const char *data, unsigned int rounds); @@ -39,7 +40,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, { struct sha1_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return crypto_sha1_update(desc, data, len); @@ -57,7 +58,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, static int sha1_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha1_finup(desc, data, len, out); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 773a873d2b28..73867da3cbee 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -30,6 +30,7 @@ #define pr_fmt(fmt) 
KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -37,8 +38,8 @@ #include #include #include -#include #include +#include asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 rounds); @@ -49,7 +50,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_update(desc, data, len); @@ -67,7 +68,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, static int sha256_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha256_finup(desc, data, len, out); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index f1b811b60ba6..458356a3f124 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -28,16 +28,16 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include #include +#include #include #include #include -#include - -#include +#include asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, u64 rounds); @@ -49,7 +49,7 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, { struct sha512_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); @@ -67,7 +67,7 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); -- cgit v1.2.3-59-g8ed1b From 99680c5e91829453e001ab5f7e6d717e6d2dcb21 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 Mar 2019 22:12:49 -0700 Subject: crypto: arm - convert to use crypto_simd_usable() Replace all calls to may_use_simd() in the arm crypto code with crypto_simd_usable(), in order to allow testing the no-SIMD code paths. 
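[ Note: every converted call site keeps the same shape -- fall back to
  the scalar/generic implementation when SIMD is unusable, otherwise
  bracket the NEON body in kernel_neon_begin()/kernel_neon_end(). An
  illustrative sketch, where scalar_fallback() is a stand-in name rather
  than a real function:

	if (!crypto_simd_usable())
		return scalar_fallback(desc, data, len);

	kernel_neon_begin();
	/* NEON-accelerated core runs here */
	kernel_neon_end();
]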
Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/chacha-neon-glue.c | 5 +++-- arch/arm/crypto/crc32-ce-glue.c | 5 +++-- arch/arm/crypto/crct10dif-ce-glue.c | 3 ++- arch/arm/crypto/ghash-ce-glue.c | 7 ++++--- arch/arm/crypto/nhpoly1305-neon-glue.c | 3 ++- arch/arm/crypto/sha1-ce-glue.c | 5 +++-- arch/arm/crypto/sha1_neon_glue.c | 5 +++-- arch/arm/crypto/sha2-ce-glue.c | 5 +++-- arch/arm/crypto/sha256_neon_glue.c | 5 +++-- arch/arm/crypto/sha512-neon-glue.c | 5 +++-- 10 files changed, 29 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c index 9d6fda81986d..48a89537b828 100644 --- a/arch/arm/crypto/chacha-neon-glue.c +++ b/arch/arm/crypto/chacha-neon-glue.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -93,7 +94,7 @@ static int chacha_neon(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_chacha_crypt(req); return chacha_neon_stream_xor(req, ctx, req->iv); @@ -107,7 +108,7 @@ static int xchacha_neon(struct skcipher_request *req) u32 state[16]; u8 real_iv[16]; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_xchacha_crypt(req); crypto_chacha_init(state, ctx, req->iv); diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c index cd9e93b46c2d..e712c2a7d387 100644 --- a/arch/arm/crypto/crc32-ce-glue.c +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -113,7 +114,7 @@ static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, u32 *crc = shash_desc_ctx(desc); unsigned int l; - if (may_use_simd()) { + if (crypto_simd_usable()) { if ((u32)data % SCALE_F) { l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F)); @@ -147,7 +148,7 @@ static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data, u32 *crc = shash_desc_ctx(desc); unsigned int l; - if (may_use_simd()) { + if (crypto_simd_usable()) { if ((u32)data % SCALE_F) { l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F)); diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c index 3d6b800b8396..3b24f2872592 100644 --- a/arch/arm/crypto/crct10dif-ce-glue.c +++ b/arch/arm/crypto/crct10dif-ce-glue.c @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -36,7 +37,7 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data, { u16 *crc = shash_desc_ctx(desc); - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) { + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); *crc = crc_t10dif_pmull(*crc, data, length); kernel_neon_end(); diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index b7d30b6cf49c..60123e9ea9d8 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -196,7 +197,7 @@ static int ghash_async_update(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd() || + if (!crypto_simd_usable() || (in_atomic() && 
cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -214,7 +215,7 @@ static int ghash_async_final(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -232,7 +233,7 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c index 49aae87cb2bc..ae5aefc44a4d 100644 --- a/arch/arm/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm/crypto/nhpoly1305-neon-glue.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,7 @@ static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_neon_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !may_use_simd()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c index b732522e20f8..4c6c6900853c 100644 --- a/arch/arm/crypto/sha1-ce-glue.c +++ b/arch/arm/crypto/sha1-ce-glue.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -33,7 +34,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return sha1_update_arm(desc, data, len); @@ -47,7 +48,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha1_finup_arm(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c index d15e0ea2c95e..d6c95c213d42 100644 --- a/arch/arm/crypto/sha1_neon_glue.c +++ b/arch/arm/crypto/sha1_neon_glue.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include @@ -39,7 +40,7 @@ static int sha1_neon_update(struct shash_desc *desc, const u8 *data, { struct sha1_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return sha1_update_arm(desc, data, len); @@ -54,7 +55,7 @@ static int sha1_neon_update(struct shash_desc *desc, const u8 *data, static int sha1_neon_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha1_finup_arm(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c index 1211a5c129fc..a47a9d4b663e 100644 --- a/arch/arm/crypto/sha2-ce-glue.c +++ b/arch/arm/crypto/sha2-ce-glue.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -34,7 +35,7 @@ static int sha2_ce_update(struct shash_desc *desc, const 
u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_arm_update(desc, data, len); @@ -49,7 +50,7 @@ static int sha2_ce_update(struct shash_desc *desc, const u8 *data, static int sha2_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha256_arm_finup(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c index 1d82c6cd31a4..f3f6b1624fc3 100644 --- a/arch/arm/crypto/sha256_neon_glue.c +++ b/arch/arm/crypto/sha256_neon_glue.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -34,7 +35,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_arm_update(desc, data, len); @@ -49,7 +50,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, static int sha256_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha256_arm_finup(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c index 8a5642b41fd6..d33ab59c26c0 100644 --- a/arch/arm/crypto/sha512-neon-glue.c +++ b/arch/arm/crypto/sha512-neon-glue.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -30,7 +31,7 @@ static int sha512_neon_update(struct shash_desc *desc, const u8 *data, { struct sha512_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return sha512_arm_update(desc, data, len); @@ -45,7 +46,7 @@ static int sha512_neon_update(struct shash_desc *desc, const u8 *data, static int sha512_neon_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha512_arm_finup(desc, data, len, out); kernel_neon_begin(); -- cgit v1.2.3-59-g8ed1b From e52b7023cdad005756cd91d7c54fa90ef6b43d32 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 Mar 2019 22:12:50 -0700 Subject: crypto: arm64 - convert to use crypto_simd_usable() Replace all calls to may_use_simd() in the arm64 crypto code with crypto_simd_usable(), in order to allow testing the no-SIMD code paths. 
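[ Note: the payoff is testability. With CONFIG_CRYPTO_MANAGER_EXTRA_TESTS
  enabled, the test manager can flip the per-CPU flag behind
  crypto_simd_usable() around a test vector, roughly as below (a sketch;
  the real helpers also disable preemption while the flag is set):

	__this_cpu_write(crypto_simd_disabled_for_test, true);
	/* run the vector: the drivers above take their no-SIMD paths */
	__this_cpu_write(crypto_simd_disabled_for_test, false);
]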
Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-ce-ccm-glue.c | 7 ++++--- arch/arm64/crypto/aes-ce-glue.c | 5 +++-- arch/arm64/crypto/aes-glue.c | 4 ++-- arch/arm64/crypto/aes-neonbs-glue.c | 2 +- arch/arm64/crypto/chacha-neon-glue.c | 5 +++-- arch/arm64/crypto/crct10dif-ce-glue.c | 5 +++-- arch/arm64/crypto/ghash-ce-glue.c | 7 ++++--- arch/arm64/crypto/nhpoly1305-neon-glue.c | 3 ++- arch/arm64/crypto/sha1-ce-glue.c | 7 ++++--- arch/arm64/crypto/sha2-ce-glue.c | 7 ++++--- arch/arm64/crypto/sha256-glue.c | 5 +++-- arch/arm64/crypto/sha3-ce-glue.c | 5 +++-- arch/arm64/crypto/sha512-ce-glue.c | 7 ++++--- arch/arm64/crypto/sm3-ce-glue.c | 7 ++++--- arch/arm64/crypto/sm4-ce-glue.c | 5 +++-- 15 files changed, 47 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 5fc6f51908fd..9dc4110a2e61 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -109,7 +110,7 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[], u32 abytes, u32 *macp) { - if (may_use_simd()) { + if (crypto_simd_usable()) { kernel_neon_begin(); ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc, num_rounds(key)); @@ -255,7 +256,7 @@ static int ccm_encrypt(struct aead_request *req) err = skcipher_walk_aead_encrypt(&walk, req, false); - if (may_use_simd()) { + if (crypto_simd_usable()) { while (walk.nbytes) { u32 tail = walk.nbytes % AES_BLOCK_SIZE; @@ -313,7 +314,7 @@ static int ccm_decrypt(struct aead_request *req) err = skcipher_walk_aead_decrypt(&walk, req, false); - if (may_use_simd()) { + if (crypto_simd_usable()) { while (walk.nbytes) { u32 tail = walk.nbytes % AES_BLOCK_SIZE; diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c index e6b3227bbf57..3213843fcb46 100644 --- a/arch/arm64/crypto/aes-ce-glue.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +53,7 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); return; } @@ -66,7 +67,7 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); return; } diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 1e676625ef33..692cb75f2ca2 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -405,7 +405,7 @@ static int ctr_encrypt_sync(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); - if (!may_use_simd()) + if (!crypto_simd_usable()) return aes_ctr_encrypt_fallback(ctx, req); return ctr_encrypt(req); @@ -642,7 +642,7 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, { int rounds = 6 + ctx->key_length / 4; - if (may_use_simd()) { + if (crypto_simd_usable()) { kernel_neon_begin(); aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before, 
enc_after); diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index e7a95a566462..4737b6c6c5cf 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -288,7 +288,7 @@ static int ctr_encrypt_sync(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); - if (!may_use_simd()) + if (!crypto_simd_usable()) return aes_ctr_encrypt_fallback(&ctx->fallback, req); return ctr_encrypt(req); diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index bece1d85bd81..3a26a98a7e17 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -90,7 +91,7 @@ static int chacha_neon(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_chacha_crypt(req); return chacha_neon_stream_xor(req, ctx, req->iv); @@ -104,7 +105,7 @@ static int xchacha_neon(struct skcipher_request *req) u32 state[16]; u8 real_iv[16]; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_xchacha_crypt(req); crypto_chacha_init(state, ctx, req->iv); diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index dd325829ee44..64e92ab70269 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -38,7 +39,7 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, { u16 *crc = shash_desc_ctx(desc); - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) { + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); *crc = crc_t10dif_pmull_p8(*crc, data, length); kernel_neon_end(); @@ -54,7 +55,7 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, { u16 *crc = shash_desc_ctx(desc); - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) { + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); *crc = crc_t10dif_pmull_p64(*crc, data, length); kernel_neon_end(); diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 089b09286da7..fcd458c83bc1 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -89,7 +90,7 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, struct ghash_key const *k, const char *head)) { - if (likely(may_use_simd())) { + if (likely(crypto_simd_usable())) { kernel_neon_begin(); simd_update(blocks, dg, src, key, head); kernel_neon_end(); @@ -441,7 +442,7 @@ static int gcm_encrypt(struct aead_request *req) err = skcipher_walk_aead_encrypt(&walk, req, false); - if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) { u32 const *rk = NULL; kernel_neon_begin(); @@ -565,7 +566,7 @@ static int gcm_decrypt(struct aead_request *req) err = skcipher_walk_aead_decrypt(&walk, req, false); - if (likely(may_use_simd() 
&& walk.total >= 2 * AES_BLOCK_SIZE)) { + if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) { u32 const *rk = NULL; kernel_neon_begin(); diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c index 22cc32ac9448..d15e872fa3f5 100644 --- a/arch/arm64/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,7 @@ static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_neon_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !may_use_simd()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index 17fac2889f56..eaa7a8258f1c 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha1_update(desc, data, len); sctx->finalize = 0; @@ -56,7 +57,7 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha1_finup(desc, data, len, out); /* @@ -78,7 +79,7 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha1_finup(desc, NULL, 0, out); sctx->finalize = 0; diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 261f5195cab7..a725997e55f2 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -42,7 +43,7 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data, { struct sha256_ce_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha256_block_data_order); @@ -61,7 +62,7 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { if (len) sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha256_block_data_order); @@ -90,7 +91,7 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_block_data_order); return sha256_base_finish(desc, out); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index 4aedeaefd61f..54586e0be9fd 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -89,7 +90,7 @@ static int sha256_update_neon(struct shash_desc *desc, const u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) + if 
(!crypto_simd_usable()) return sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha256_block_data_order); @@ -119,7 +120,7 @@ static int sha256_update_neon(struct shash_desc *desc, const u8 *data, static int sha256_finup_neon(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) { + if (!crypto_simd_usable()) { if (len) sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha256_block_data_order); diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index a336feac0f59..9a4bbfc45f40 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ static int sha3_update(struct shash_desc *desc, const u8 *data, struct sha3_state *sctx = shash_desc_ctx(desc); unsigned int digest_size = crypto_shash_digestsize(desc->tfm); - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha3_update(desc, data, len); if ((sctx->partial + len) >= sctx->rsiz) { @@ -76,7 +77,7 @@ static int sha3_final(struct shash_desc *desc, u8 *out) __le64 *digest = (__le64 *)out; int i; - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha3_final(desc, out); sctx->buf[sctx->partial++] = 0x06; diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index f2c5f28c622a..2369540040aa 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -31,7 +32,7 @@ asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks); static int sha512_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha512_base_do_update(desc, data, len, (sha512_block_fn *)sha512_block_data_order); @@ -46,7 +47,7 @@ static int sha512_ce_update(struct shash_desc *desc, const u8 *data, static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) { + if (!crypto_simd_usable()) { if (len) sha512_base_do_update(desc, data, len, (sha512_block_fn *)sha512_block_data_order); @@ -65,7 +66,7 @@ static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, static int sha512_ce_final(struct shash_desc *desc, u8 *out) { - if (!may_use_simd()) { + if (!crypto_simd_usable()) { sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_block_data_order); return sha512_base_finish(desc, out); diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 88938a20d9b2..5d15533799a2 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,7 @@ asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src, static int sm3_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sm3_update(desc, data, len); kernel_neon_begin(); @@ -40,7 +41,7 @@ static int sm3_ce_update(struct shash_desc *desc, const u8 *data, static int sm3_ce_final(struct shash_desc *desc, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sm3_finup(desc, NULL, 0, out); kernel_neon_begin(); @@ -53,7 +54,7 @@ static int sm3_ce_final(struct shash_desc *desc, u8 *out) static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, unsigned 
int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sm3_finup(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 0c4fc223f225..2754c875d39c 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -20,7 +21,7 @@ static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { crypto_sm4_encrypt(tfm, out, in); } else { kernel_neon_begin(); @@ -33,7 +34,7 @@ static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { crypto_sm4_decrypt(tfm, out, in); } else { kernel_neon_begin(); -- cgit v1.2.3-59-g8ed1b From e6331a321aafcc291a60ceedf4d6b0051a6117ca Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 22 Mar 2019 18:04:04 +0000 Subject: MIPS: KVM: Use prandom_u32_max() to generate tlbwr index Emulation of the tlbwr instruction, which writes a TLB entry to a random index in the TLB, currently uses get_random_bytes() to generate a 4 byte random number which we then mask to form the index. This is overkill in a couple of ways: - We don't need 4 bytes here since we mask the value to form a 6 bit number anyway, so we waste /dev/random entropy generating 3 random bytes that are unused. - We don't need crypto-grade randomness here - the architecture spec allows implementations to use any algorithm & merely encourages that some pseudo-randomness be used rather than a simple counter. The fast prandom_u32() function fits that criteria well. So rather than using get_random_bytes() & consuming /dev/random entropy, switch to using the faster prandom_u32_max() which provides what we need here whilst also performing the masking/modulo for us. Signed-off-by: Paul Burton Reported-by: George Spelvin Cc: James Hogan Cc: linux-mips@vger.kernel.org --- arch/mips/kvm/emulate.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 0074427b04fb..e5de6bac8197 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1141,9 +1141,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu) unsigned long pc = vcpu->arch.pc; int index; - get_random_bytes(&index, sizeof(index)); - index &= (KVM_MIPS_GUEST_TLB_SIZE - 1); - + index = prandom_u32_max(KVM_MIPS_GUEST_TLB_SIZE); tlb = &vcpu->arch.guest_tlb[index]; kvm_mips_invalidate_guest_tlb(vcpu, tlb); -- cgit v1.2.3-59-g8ed1b From 8db5da0b8618df79eceea99672e205d4a2a6309e Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 27 Jan 2019 19:03:45 -0500 Subject: x86/ima: require signed kernel modules Have the IMA architecture specific policy require signed kernel modules on systems with secure boot mode enabled; and coordinate the different signature verification methods, so only one signature is required. Requiring appended kernel module signatures may be configured, enabled on the boot command line, or with this patch enabled in secure boot mode. This patch defines set_module_sig_enforced(). To coordinate between appended kernel module signatures and IMA signatures, only define an IMA MODULE_CHECK policy rule if CONFIG_MODULE_SIG is not enabled. 
A custom IMA policy may still define and require an IMA signature. Signed-off-by: Mimi Zohar Reviewed-by: Luis Chamberlain Acked-by: Jessica Yu --- arch/x86/kernel/ima_arch.c | 9 ++++++++- include/linux/module.h | 5 +++++ kernel/module.c | 5 +++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index e47cd9390ab4..3fb9847f1cad 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -64,12 +64,19 @@ static const char * const sb_arch_rules[] = { "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", #endif /* CONFIG_KEXEC_VERIFY_SIG */ "measure func=KEXEC_KERNEL_CHECK", +#if !IS_ENABLED(CONFIG_MODULE_SIG) + "appraise func=MODULE_CHECK appraise_type=imasig", +#endif + "measure func=MODULE_CHECK", NULL }; const char * const *arch_get_ima_policy(void) { - if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) + if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { + if (IS_ENABLED(CONFIG_MODULE_SIG)) + set_module_sig_enforced(); return sb_arch_rules; + } return NULL; } diff --git a/include/linux/module.h b/include/linux/module.h index 5bf5dcd91009..73ee2b10e816 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -676,6 +676,7 @@ static inline bool is_livepatch_module(struct module *mod) #endif /* CONFIG_LIVEPATCH */ bool is_module_sig_enforced(void); +void set_module_sig_enforced(void); #else /* !CONFIG_MODULES... */ @@ -796,6 +797,10 @@ static inline bool is_module_sig_enforced(void) return false; } +static inline void set_module_sig_enforced(void) +{ +} + /* Dereference module function descriptor */ static inline void *dereference_module_function_descriptor(struct module *mod, void *ptr) diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..985caa467aef 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -286,6 +286,11 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); -- cgit v1.2.3-59-g8ed1b From 86d35d4e7625f7c056d81316da107bd3a7564fb3 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 26 Mar 2019 07:40:54 +0000 Subject: drm/i915: Split Pineview device info into desktop and mobile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows the IS_PINEVIEW_G and IS_PINEVIEW_M macros to be removed and avoid duplication of device ids already defined in i915_pciids.h. !IS_MOBILE check can be used in place of existing IS_PINEVIEW_G call sites.
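[ Note: after the split, the mobile variant is the only Pineview with
  .is_mobile set, so the substitution relied on above is:

	/* before */ IS_PINEVIEW_G(dev_priv)
	/* after  */ IS_PINEVIEW(dev_priv) && !IS_MOBILE(dev_priv)

  which reduces to !IS_MOBILE(dev_priv) at both call sites because they
  already sit inside an IS_PINEVIEW() branch. ]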
Signed-off-by: Tvrtko Ursulin Suggested-by: Ville Syrjälä Cc: Ville Syrjälä Cc: Chris Wilson Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20190326074057.27833-2-tvrtko.ursulin@linux.intel.com --- arch/x86/kernel/early-quirks.c | 3 ++- drivers/gpu/drm/i915/i915_drv.h | 2 -- drivers/gpu/drm/i915/i915_pci.c | 12 ++++++++++-- drivers/gpu/drm/i915/intel_pm.c | 4 ++-- include/drm/i915_pciids.h | 6 ++++-- 5 files changed, 18 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 50d5848bf22e..f91d3ed2df62 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -525,7 +525,8 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_I945G_IDS(&gen3_early_ops), INTEL_I945GM_IDS(&gen3_early_ops), INTEL_VLV_IDS(&gen6_early_ops), - INTEL_PINEVIEW_IDS(&gen3_early_ops), + INTEL_PINEVIEW_G_IDS(&gen3_early_ops), + INTEL_PINEVIEW_M_IDS(&gen3_early_ops), INTEL_I965G_IDS(&gen3_early_ops), INTEL_G33_IDS(&gen3_early_ops), INTEL_I965GM_IDS(&gen3_early_ops), diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index f75600fa77c6..d1fd8f7d6f8a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2317,8 +2317,6 @@ static inline unsigned int i915_sg_segment_size(void) #define IS_G45(dev_priv) IS_PLATFORM(dev_priv, INTEL_G45) #define IS_GM45(dev_priv) IS_PLATFORM(dev_priv, INTEL_GM45) #define IS_G4X(dev_priv) (IS_G45(dev_priv) || IS_GM45(dev_priv)) -#define IS_PINEVIEW_G(dev_priv) (INTEL_DEVID(dev_priv) == 0xa001) -#define IS_PINEVIEW_M(dev_priv) (INTEL_DEVID(dev_priv) == 0xa011) #define IS_PINEVIEW(dev_priv) IS_PLATFORM(dev_priv, INTEL_PINEVIEW) #define IS_G33(dev_priv) IS_PLATFORM(dev_priv, INTEL_G33) #define IS_IRONLAKE_M(dev_priv) (INTEL_DEVID(dev_priv) == 0x0046) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index a7e1611af26d..716f2f95c57d 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -257,7 +257,14 @@ static const struct intel_device_info intel_g33_info = { .display.has_overlay = 1, }; -static const struct intel_device_info intel_pineview_info = { +static const struct intel_device_info intel_pineview_g_info = { + GEN3_FEATURES, + PLATFORM(INTEL_PINEVIEW), + .display.has_hotplug = 1, + .display.has_overlay = 1, +}; + +static const struct intel_device_info intel_pineview_m_info = { GEN3_FEATURES, PLATFORM(INTEL_PINEVIEW), .is_mobile = 1, @@ -761,7 +768,8 @@ static const struct pci_device_id pciidlist[] = { INTEL_I965GM_IDS(&intel_i965gm_info), INTEL_GM45_IDS(&intel_gm45_info), INTEL_G45_IDS(&intel_g45_info), - INTEL_PINEVIEW_IDS(&intel_pineview_info), + INTEL_PINEVIEW_G_IDS(&intel_pineview_g_info), + INTEL_PINEVIEW_M_IDS(&intel_pineview_m_info), INTEL_IRONLAKE_D_IDS(&intel_ironlake_d_info), INTEL_IRONLAKE_M_IDS(&intel_ironlake_m_info), INTEL_SNB_D_GT1_IDS(&intel_sandybridge_d_gt1_info), diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 9a6eb2ef5f48..0e05ee1f3ea0 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -850,7 +850,7 @@ static void pineview_update_wm(struct intel_crtc *unused_crtc) u32 reg; unsigned int wm; - latency = intel_get_cxsr_latency(IS_PINEVIEW_G(dev_priv), + latency = intel_get_cxsr_latency(!IS_MOBILE(dev_priv), dev_priv->is_ddr3, dev_priv->fsb_freq, dev_priv->mem_freq); @@ -9589,7 +9589,7 @@ void intel_init_pm(struct drm_i915_private 
*dev_priv) dev_priv->display.initial_watermarks = g4x_initial_watermarks; dev_priv->display.optimize_watermarks = g4x_optimize_watermarks; } else if (IS_PINEVIEW(dev_priv)) { - if (!intel_get_cxsr_latency(IS_PINEVIEW_G(dev_priv), + if (!intel_get_cxsr_latency(!IS_MOBILE(dev_priv), dev_priv->is_ddr3, dev_priv->fsb_freq, dev_priv->mem_freq)) { diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index c7cdbfc4d033..cb9b5b35aa2c 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -108,8 +108,10 @@ INTEL_VGA_DEVICE(0x2e42, info), /* B43_G */ \ INTEL_VGA_DEVICE(0x2e92, info) /* B43_G.1 */ -#define INTEL_PINEVIEW_IDS(info) \ - INTEL_VGA_DEVICE(0xa001, info), \ +#define INTEL_PINEVIEW_G_IDS(info) \ + INTEL_VGA_DEVICE(0xa001, info) + +#define INTEL_PINEVIEW_M_IDS(info) \ INTEL_VGA_DEVICE(0xa011, info) #define INTEL_IRONLAKE_D_IDS(info) \ -- cgit v1.2.3-59-g8ed1b From d53fef0be4a5f2f12219c231cdb6f838fbbf680c Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Fri, 15 Mar 2019 12:19:38 -0700 Subject: x86/gpu: add ElkhartLake to gen11 early quirks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's reserve EHL stolen memory for graphics. ElkhartLake is a gen11 platform which is compatible with ICL changes. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: José Roberto de Souza Signed-off-by: Rodrigo Vivi Reviewed-by: José Roberto de Souza Acked-by: Borislav Petkov Link: https://patchwork.freedesktop.org/patch/msgid/20190315191938.22211-2-rodrigo.vivi@intel.com --- arch/x86/kernel/early-quirks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index f91d3ed2df62..6c4f01540833 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -548,6 +548,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_GLK_IDS(&gen9_early_ops), INTEL_CNL_IDS(&gen9_early_ops), INTEL_ICL_11_IDS(&gen11_early_ops), + INTEL_EHL_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); -- cgit v1.2.3-59-g8ed1b From 5ab96af1ecfc5427d4f01b0b1e7b68f7bef96157 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Mon, 11 Mar 2019 17:18:05 -0500 Subject: ARM: socfpga_defconfig: Enable CONFIG_MFD_ALTERA_SYSMGR Add System Manager driver by default for SOCFPGA ARM32 platforms. Signed-off-by: Thor Thayer Reviewed-by: Arnd Bergmann Signed-off-by: Lee Jones --- arch/arm/configs/socfpga_defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/configs/socfpga_defconfig b/arch/arm/configs/socfpga_defconfig index 08d1b3e11d68..2d29992a60d2 100644 --- a/arch/arm/configs/socfpga_defconfig +++ b/arch/arm/configs/socfpga_defconfig @@ -106,6 +106,7 @@ CONFIG_SENSORS_LTC2978_REGULATOR=y CONFIG_WATCHDOG=y CONFIG_DW_WATCHDOG=y CONFIG_MFD_ALTERA_A10SR=y +CONFIG_MFD_ALTERA_SYSMGR=y CONFIG_MFD_STMPE=y CONFIG_REGULATOR=y CONFIG_REGULATOR_FIXED_VOLTAGE=y -- cgit v1.2.3-59-g8ed1b From 5dd2814b0249fc0ff43f8bc1902fab199f7de9a5 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Mon, 11 Mar 2019 17:18:06 -0500 Subject: arm64: defconfig: Enable CONFIG_MFD_ALTERA_SYSMGR Enable the Stratix10 System Manager by default.
Signed-off-by: Thor Thayer Reviewed-by: Arnd Bergmann Signed-off-by: Lee Jones --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 2d9c39033c1a..9e0b7ab639b3 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -432,6 +432,7 @@ CONFIG_MESON_WATCHDOG=m CONFIG_RENESAS_WDT=y CONFIG_UNIPHIER_WATCHDOG=y CONFIG_BCM2835_WDT=y +CONFIG_MFD_ALTERA_SYSMGR=y CONFIG_MFD_BD9571MWV=y CONFIG_MFD_AXP20X_I2C=y CONFIG_MFD_AXP20X_RSB=y -- cgit v1.2.3-59-g8ed1b From 8f4ebe9b331ef3d39c8375ff38c623f26dbc1735 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Mon, 11 Mar 2019 17:18:08 -0500 Subject: arm64: dts: stratix10: New System Manager compatible Use the new compatible string defined for the Stratix10 System Manager. Remove syscon since it is not correct on this platform. Signed-off-by: Thor Thayer Reviewed-by: Arnd Bergmann Signed-off-by: Lee Jones --- arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi index 7c649f6b14cb..a297776db274 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi @@ -390,7 +390,7 @@ }; sysmgr: sysmgr@ffd12000 { - compatible = "altr,sys-mgr", "syscon"; + compatible = "altr,sys-mgr-s10","altr,sys-mgr"; reg = <0xffd12000 0x228>; }; -- cgit v1.2.3-59-g8ed1b From 8440bb9b944c02222c7a840d406141ed42e945cd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Feb 2019 13:58:43 +0100 Subject: sh: sh7786: Add explicit I/O cast to sh7786_mm_sel() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compile-testing on arm: arch/sh/include/cpu-sh4/cpu/sh7786.h: In function ‘sh7786_mm_sel’: arch/sh/include/cpu-sh4/cpu/sh7786.h:135:21: warning: passing argument 1 of ‘__raw_readl’ makes pointer from integer without a cast [-Wint-conversion] return __raw_readl(0xFC400020) & 0x7; ^~~~~~~~~~ In file included from include/linux/io.h:25:0, from arch/sh/include/cpu-sh4/cpu/sh7786.h:14, from drivers/pinctrl/sh-pfc/pfc-sh7786.c:15: arch/arm/include/asm/io.h:113:21: note: expected ‘const volatile void *’ but argument is of type ‘unsigned int’ #define __raw_readl __raw_readl ^ arch/arm/include/asm/io.h:114:19: note: in expansion of macro ‘__raw_readl’ static inline u32 __raw_readl(const volatile void __iomem *addr) ^~~~~~~~~~~ __raw_readl() on SuperH is a macro that casts the passed I/O address to the correct type, while the implementations on most other architectures expect to be passed the correct pointer type. Add an explicit cast to fix this. 
Note that this also gets rid of a sparse warning on SuperH: arch/sh/include/cpu-sh4/cpu/sh7786.h:135:16: warning: incorrect type in argument 1 (different base types) arch/sh/include/cpu-sh4/cpu/sh7786.h:135:16: expected void const volatile [noderef] * arch/sh/include/cpu-sh4/cpu/sh7786.h:135:16: got unsigned int Signed-off-by: Geert Uytterhoeven Reviewed-by: Simon Horman --- arch/sh/include/cpu-sh4/cpu/sh7786.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sh/include/cpu-sh4/cpu/sh7786.h b/arch/sh/include/cpu-sh4/cpu/sh7786.h index 8f9bfbf3cdb1..d6cce65b4871 100644 --- a/arch/sh/include/cpu-sh4/cpu/sh7786.h +++ b/arch/sh/include/cpu-sh4/cpu/sh7786.h @@ -132,7 +132,7 @@ enum { static inline u32 sh7786_mm_sel(void) { - return __raw_readl(0xFC400020) & 0x7; + return __raw_readl((const volatile void __iomem *)0xFC400020) & 0x7; } #endif /* __CPU_SH7786_H__ */ -- cgit v1.2.3-59-g8ed1b From d71eb0ce109a124b0fa714832823b9452f2762cf Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 2 Apr 2019 09:59:33 -0500 Subject: x86/speculation/mds: Add mds=full,nosmt cmdline option Add the mds=full,nosmt cmdline option. This is like mds=full, but with SMT disabled if the CPU is vulnerable. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Acked-by: Jiri Kosina --- Documentation/admin-guide/hw-vuln/mds.rst | 3 +++ Documentation/admin-guide/kernel-parameters.txt | 6 ++++-- arch/x86/kernel/cpu/bugs.c | 10 ++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/Documentation/admin-guide/hw-vuln/mds.rst b/Documentation/admin-guide/hw-vuln/mds.rst index 1de29d28903d..244ab47d1fb3 100644 --- a/Documentation/admin-guide/hw-vuln/mds.rst +++ b/Documentation/admin-guide/hw-vuln/mds.rst @@ -260,6 +260,9 @@ time with the option "mds=". The valid arguments for this option are: It does not automatically disable SMT. + full,nosmt The same as mds=full, with SMT disabled on vulnerable + CPUs. This is the complete mitigation. + off Disables MDS mitigations completely. ============ ============================================================= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7325319c2c23..8f04985d3122 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2372,8 +2372,10 @@ This parameter controls the MDS mitigation. The options are: - full - Enable MDS mitigation on vulnerable CPUs - off - Unconditionally disable MDS mitigation + full - Enable MDS mitigation on vulnerable CPUs + full,nosmt - Enable MDS mitigation and disable + SMT on vulnerable CPUs + off - Unconditionally disable MDS mitigation Not specifying this option is equivalent to mds=full. 
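[ Note: usage example -- booting with

	mds=full,nosmt

  selects MDS_MITIGATION_FULL and, per the bugs.c hunk below, also calls
  cpu_smt_disable(false) unless the CPU is only affected by the MSBDS
  variant (X86_BUG_MSBDS_ONLY), in which case SMT may stay enabled. ]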
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 373ae1dcd301..9f252082a83b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -221,6 +221,7 @@ static void x86_amd_ssb_disable(void) /* Default mitigation for L1TF-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { [MDS_MITIGATION_OFF] = "Vulnerable", @@ -238,8 +239,13 @@ static void __init mds_select_mitigation(void) if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; + static_branch_enable(&mds_user_clear); + + if (mds_nosmt && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + cpu_smt_disable(false); } + pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -255,6 +261,10 @@ static int __init mds_cmdline(char *str) mds_mitigation = MDS_MITIGATION_OFF; else if (!strcmp(str, "full")) mds_mitigation = MDS_MITIGATION_FULL; + else if (!strcmp(str, "full,nosmt")) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_nosmt = true; + } return 0; } -- cgit v1.2.3-59-g8ed1b From 7c3658b20194a5b3209a143f63bc9c643c6a3ae2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 2 Apr 2019 10:00:14 -0500 Subject: x86/speculation: Move arch_smt_update() call to after mitigation decisions arch_smt_update() now has a dependency on both Spectre v2 and MDS mitigations. Move its initial call to after all the mitigation decisions have been made. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Acked-by: Jiri Kosina --- arch/x86/kernel/cpu/bugs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9f252082a83b..3f934ffef8cf 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -111,6 +111,8 @@ void __init check_bugs(void) mds_select_mitigation(); + arch_smt_update(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -638,9 +640,6 @@ specv2_set_mode: /* Set up IBPB and STIBP depending on the general spectre V2 command */ spectre_v2_user_select_mitigation(cmd); - - /* Enable STIBP if appropriate */ - arch_smt_update(); } static void update_stibp_msr(void * __unused) -- cgit v1.2.3-59-g8ed1b From 39226ef02bfb43248b7db12a4fdccb39d95318e3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 2 Apr 2019 10:00:51 -0500 Subject: x86/speculation/mds: Add SMT warning message MDS is vulnerable with SMT. Make that clear with a one-time printk whenever SMT first gets enabled. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Acked-by: Jiri Kosina --- arch/x86/kernel/cpu/bugs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3f934ffef8cf..22a14d4b68a2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -673,6 +673,9 @@ static void update_indir_branch_cond(void) static_branch_disable(&switch_to_cond_stibp); } +#undef pr_fmt +#define pr_fmt(fmt) fmt + /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { @@ -693,6 +696,8 @@ static void update_mds_branch_idle(void) static_branch_disable(&mds_idle_clear); } +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. 
See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" + void arch_smt_update(void) { /* Enhanced IBRS implies STIBP. No update required. */ @@ -717,6 +722,8 @@ void arch_smt_update(void) switch (mds_mitigation) { case MDS_MITIGATION_FULL: case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); update_mds_branch_idle(); break; case MDS_MITIGATION_OFF: @@ -1149,6 +1156,7 @@ static int __init l1tf_cmdline(char *str) early_param("l1tf", l1tf_cmdline); #undef pr_fmt +#define pr_fmt(fmt) fmt #ifdef CONFIG_SYSFS -- cgit v1.2.3-59-g8ed1b From 8c5dc8d9f19c7992b5ed557b865127a80149041b Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 17 Jan 2019 16:33:35 +0300 Subject: video: backlight: Remove useless BACKLIGHT_LCD_SUPPORT kernel symbol We have two *_CLASS_DEVICE kernel config options (LCD_CLASS_DEVICE and BACKLIGHT_CLASS_DEVICE) that do the same job. The patch removes the useless BACKLIGHT_LCD_SUPPORT option and converts LCD_CLASS_DEVICE into a menu. Signed-off-by: Alexander Shiyan Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Daniel Thompson Signed-off-by: Lee Jones --- arch/unicore32/Kconfig | 1 - drivers/gpu/drm/Kconfig | 2 -- drivers/gpu/drm/bridge/Kconfig | 1 - drivers/gpu/drm/fsl-dcu/Kconfig | 1 - drivers/gpu/drm/i915/Kconfig | 1 - drivers/gpu/drm/nouveau/Kconfig | 2 -- drivers/gpu/drm/shmobile/Kconfig | 1 - drivers/gpu/drm/tilcdc/Kconfig | 1 - drivers/staging/olpc_dcon/Kconfig | 1 - drivers/usb/misc/Kconfig | 1 - drivers/video/backlight/Kconfig | 10 ++-------- drivers/video/fbdev/Kconfig | 5 ----- 12 files changed, 2 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 817d82608712..8cbcbb12372c 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -195,7 +195,6 @@ config I2C_EEPROM_AT24 config LCD_BACKLIGHT tristate "LCD Backlight support" - select BACKLIGHT_LCD_SUPPORT select BACKLIGHT_PWM endmenu diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index bd943a71756c..1e68c1bbed2a 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -194,7 +194,6 @@ config DRM_RADEON select POWER_SUPPLY select HWMON select BACKLIGHT_CLASS_DEVICE - select BACKLIGHT_LCD_SUPPORT select INTERVAL_TREE help Choose this option if you have an ATI Radeon graphics card. There @@ -215,7 +214,6 @@ config DRM_AMDGPU select POWER_SUPPLY select HWMON select BACKLIGHT_CLASS_DEVICE - select BACKLIGHT_LCD_SUPPORT select INTERVAL_TREE select CHASH help diff --git a/drivers/gpu/drm/bridge/Kconfig b/drivers/gpu/drm/bridge/Kconfig index 8840f396a7b6..3dff9997f5e3 100644 --- a/drivers/gpu/drm/bridge/Kconfig +++ b/drivers/gpu/drm/bridge/Kconfig @@ -76,7 +76,6 @@ config DRM_PARADE_PS8622 depends on OF select DRM_PANEL select DRM_KMS_HELPER - select BACKLIGHT_LCD_SUPPORT select BACKLIGHT_CLASS_DEVICE ---help--- Parade eDP-LVDS bridge chip driver.
diff --git a/drivers/gpu/drm/fsl-dcu/Kconfig b/drivers/gpu/drm/fsl-dcu/Kconfig index 14a72c4c496d..dc825883400d 100644 --- a/drivers/gpu/drm/fsl-dcu/Kconfig +++ b/drivers/gpu/drm/fsl-dcu/Kconfig @@ -2,7 +2,6 @@ config DRM_FSL_DCU tristate "DRM Support for Freescale DCU" depends on DRM && OF && ARM && COMMON_CLK select BACKLIGHT_CLASS_DEVICE - select BACKLIGHT_LCD_SUPPORT select DRM_KMS_HELPER select DRM_KMS_CMA_HELPER select DRM_PANEL diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 148be8e1a090..3d5f1cb6a76c 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -15,7 +15,6 @@ config DRM_I915 select IRQ_WORK # i915 depends on ACPI_VIDEO when ACPI is enabled # but for select to work, need to select ACPI_VIDEO's dependencies, ick - select BACKLIGHT_LCD_SUPPORT if ACPI select BACKLIGHT_CLASS_DEVICE if ACPI select INPUT if ACPI select ACPI_VIDEO if ACPI diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index 00cd9ab8948d..5f3cad0eb92e 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -5,14 +5,12 @@ config DRM_NOUVEAU select DRM_KMS_HELPER select DRM_TTM select BACKLIGHT_CLASS_DEVICE if DRM_NOUVEAU_BACKLIGHT - select BACKLIGHT_LCD_SUPPORT if DRM_NOUVEAU_BACKLIGHT select ACPI_VIDEO if ACPI && X86 && BACKLIGHT_CLASS_DEVICE && INPUT select X86_PLATFORM_DEVICES if ACPI && X86 select ACPI_WMI if ACPI && X86 select MXM_WMI if ACPI && X86 select POWER_SUPPLY # Similar to i915, we need to select ACPI_VIDEO and it's dependencies - select BACKLIGHT_LCD_SUPPORT if ACPI && X86 select BACKLIGHT_CLASS_DEVICE if ACPI && X86 select INPUT if ACPI && X86 select THERMAL if ACPI && X86 diff --git a/drivers/gpu/drm/shmobile/Kconfig b/drivers/gpu/drm/shmobile/Kconfig index 61bbe8e8bcc5..e2a6c82c8252 100644 --- a/drivers/gpu/drm/shmobile/Kconfig +++ b/drivers/gpu/drm/shmobile/Kconfig @@ -4,7 +4,6 @@ config DRM_SHMOBILE depends on DRM && ARM depends on ARCH_SHMOBILE || COMPILE_TEST select BACKLIGHT_CLASS_DEVICE - select BACKLIGHT_LCD_SUPPORT select DRM_KMS_HELPER select DRM_KMS_CMA_HELPER select DRM_GEM_CMA_HELPER diff --git a/drivers/gpu/drm/tilcdc/Kconfig b/drivers/gpu/drm/tilcdc/Kconfig index 52598049c096..cb7df2086aee 100644 --- a/drivers/gpu/drm/tilcdc/Kconfig +++ b/drivers/gpu/drm/tilcdc/Kconfig @@ -8,7 +8,6 @@ config DRM_TILCDC select DRM_PANEL_BRIDGE select VIDEOMODE_HELPERS select BACKLIGHT_CLASS_DEVICE - select BACKLIGHT_LCD_SUPPORT help Choose this option if you have an TI SoC with LCDC display controller, for example AM33xx in beagle-bone, DA8xx, or diff --git a/drivers/staging/olpc_dcon/Kconfig b/drivers/staging/olpc_dcon/Kconfig index 192cc8d0853f..c91a56f77bcb 100644 --- a/drivers/staging/olpc_dcon/Kconfig +++ b/drivers/staging/olpc_dcon/Kconfig @@ -2,7 +2,6 @@ config FB_OLPC_DCON tristate "One Laptop Per Child Display CONtroller support" depends on OLPC && FB depends on I2C - depends on BACKLIGHT_LCD_SUPPORT depends on (GPIO_CS5535 || GPIO_CS5535=n) select BACKLIGHT_CLASS_DEVICE help diff --git a/drivers/usb/misc/Kconfig b/drivers/usb/misc/Kconfig index be04c117fe80..c97f270338bf 100644 --- a/drivers/usb/misc/Kconfig +++ b/drivers/usb/misc/Kconfig @@ -142,7 +142,6 @@ config USB_FTDI_ELAN config USB_APPLEDISPLAY tristate "Apple Cinema Display support" - select BACKLIGHT_LCD_SUPPORT select BACKLIGHT_CLASS_DEVICE help Say Y here if you want to control the backlight of Apple Cinema diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 
71ee978c848f..3fdc18e85ff5 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -2,13 +2,7 @@ # Backlight & LCD drivers configuration # -menuconfig BACKLIGHT_LCD_SUPPORT - bool "Backlight & LCD device support" - help - Enable this to be able to choose the drivers for controlling the - backlight and the LCD panel on some platforms, for example on PDAs. - -if BACKLIGHT_LCD_SUPPORT +menu "Backlight & LCD device support" # # LCD @@ -466,4 +460,4 @@ config BACKLIGHT_RAVE_SP endif # BACKLIGHT_CLASS_DEVICE -endif # BACKLIGHT_LCD_SUPPORT +endmenu diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 58a9590c9db6..068294881eb9 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -186,7 +186,6 @@ config FB_MACMODES config FB_BACKLIGHT tristate depends on FB - select BACKLIGHT_LCD_SUPPORT select BACKLIGHT_CLASS_DEVICE config FB_MODE_HELPERS @@ -281,7 +280,6 @@ config FB_ARMCLCD select FB_CFB_IMAGEBLIT select FB_MODE_HELPERS if OF select VIDEOMODE_HELPERS if OF - select BACKLIGHT_LCD_SUPPORT if OF select BACKLIGHT_CLASS_DEVICE if OF help This framebuffer device driver is for the ARM PrimeCell PL110 @@ -315,7 +313,6 @@ config FB_ACORN config FB_CLPS711X tristate "CLPS711X LCD support" depends on FB && (ARCH_CLPS711X || COMPILE_TEST) - select BACKLIGHT_LCD_SUPPORT select FB_MODE_HELPERS select FB_SYS_FILLRECT select FB_SYS_COPYAREA @@ -343,7 +340,6 @@ config FB_SA1100 config FB_IMX tristate "Freescale i.MX1/21/25/27 LCD support" depends on FB && ARCH_MXC - select BACKLIGHT_LCD_SUPPORT select LCD_CLASS_DEVICE select FB_CFB_FILLRECT select FB_CFB_COPYAREA @@ -2192,7 +2188,6 @@ config FB_MX3 tristate "MX3 Framebuffer support" depends on FB && MX3_IPU select BACKLIGHT_CLASS_DEVICE - select BACKLIGHT_LCD_SUPPORT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT -- cgit v1.2.3-59-g8ed1b From 89833fab15d6017ba006a45b5af68caa067171a7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 29 Mar 2019 22:46:51 +0100 Subject: x86/fpu: Fix __user annotations In save_xstate_epilog(), use __user when type-casting userspace pointers. In setup_sigcontext() and x32_setup_rt_frame(), cast the userspace pointers to 'unsigned long __user *' before writing into them. These pointers are originally '__u32 __user *' or '__u64 __user *', causing sparse to complain when a userspace pointer is written into them. The casts are okay because the pointers always point to pointer-sized values. Thanks to Luc Van Oostenryck and Al Viro for explaining this to me. Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Mukesh Ojha Cc: Qiaowei Ren Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Will Deacon Cc: x86-ml Link: https://lkml.kernel.org/r/20190329214652.258477-3-jannh@google.com --- arch/x86/kernel/fpu/signal.c | 6 +++--- arch/x86/kernel/signal.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index f6a1d299627c..55b80de13ea5 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -92,13 +92,13 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) return err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpu_user_xstate_size)); /* * Read the xfeatures which we copied (directly from the cpu or * from the state in task struct) to the user buffers. 
*/ - err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); + err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); /* * For legacy compatible, we always set FP/SSE bits in the bit @@ -113,7 +113,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) */ xfeatures |= XFEATURE_MASK_FPSSE; - err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); + err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); return err; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 08dfd4c1a4f9..b419e1a1a0ce 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -206,7 +206,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, &sc->fpstate); + put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); /* non-iBCS2 extensions.. */ put_user_ex(mask, &sc->oldmask); @@ -569,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, restorer = NULL; err |= -EFAULT; } - put_user_ex(restorer, &frame->pretcode); + put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); } put_user_catch(err); err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, -- cgit v1.2.3-59-g8ed1b From 9fda6693335cd51b0a74cffaac266c83439f7efe Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 3 Apr 2019 17:08:52 +0200 Subject: spi: sh-msiof: Convert to use GPIO descriptors Convert GPIO chip selects in the Renesas MSIOF SPI driver from legacy GPIO numbers to GPIO descriptors. Notes: - The board file for the SH7724-based Ecovec24 development board now registers a GPIO descriptor lookup, instead of passing a GPIO number through controller_data, - sh_msiof_get_cs_gpios() must release all GPIOs, else spi_get_gpio_descs() cannot claim them during SPI controller registration. 
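A condensed sketch of the release-after-probe pattern described in the second note above (taken from the sh_msiof_get_cs_gpios() hunk below; the surrounding loop and error handling are trimmed):

	gpiod = devm_gpiod_get_index(dev, "cs", i, GPIOD_ASIS);
	if (!IS_ERR(gpiod)) {
		/* Only counting here: release the descriptor right away so
		 * spi_get_gpio_descs() can claim it later, during SPI
		 * controller registration. */
		devm_gpiod_put(dev, gpiod);
		cs_gpios++;
	}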
Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- arch/sh/boards/mach-ecovec24/setup.c | 12 +++++++++--- drivers/spi/spi-sh-msiof.c | 20 ++++---------------- 2 files changed, 13 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 34e5414c5563..f402aa741bf3 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -806,7 +806,6 @@ static struct spi_board_info spi_bus[] = { .platform_data = &mmc_spi_info, .max_speed_hz = 5000000, .mode = SPI_MODE_0, - .controller_data = (void *) GPIO_PTM4, }, }; @@ -838,6 +837,14 @@ static struct platform_device msiof0_device = { .resource = msiof0_resources, }; +static struct gpiod_lookup_table msiof_gpio_table = { + .dev_id = "spi_sh_msiof.0", + .table = { + GPIO_LOOKUP("sh7724_pfc", GPIO_PTM4, "cs", GPIO_ACTIVE_HIGH), + { }, + }, +}; + #endif /* FSI */ @@ -1296,12 +1303,11 @@ static int __init arch_setup(void) gpio_request(GPIO_FN_MSIOF0_TXD, NULL); gpio_request(GPIO_FN_MSIOF0_RXD, NULL); gpio_request(GPIO_FN_MSIOF0_TSCK, NULL); - gpio_request(GPIO_PTM4, NULL); /* software CS control of TSYNC pin */ - gpio_direction_output(GPIO_PTM4, 1); /* active low CS */ gpio_request(GPIO_PTB6, NULL); /* 3.3V power control */ gpio_direction_output(GPIO_PTB6, 0); /* disable power by default */ gpiod_add_lookup_table(&mmc_spi_gpio_table); + gpiod_add_lookup_table(&msiof_gpio_table); spi_register_board_info(spi_bus, ARRAY_SIZE(spi_bus)); #endif diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index d3794499915c..6aab7b2136db 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -549,25 +549,11 @@ static void sh_msiof_spi_read_fifo_s32u(struct sh_msiof_spi_priv *p, static int sh_msiof_spi_setup(struct spi_device *spi) { - struct device_node *np = spi->controller->dev.of_node; struct sh_msiof_spi_priv *p = spi_controller_get_devdata(spi->controller); u32 clr, set, tmp; - if (!np) { - /* - * Use spi->controller_data for CS (same strategy as spi_gpio), - * if any. otherwise let HW control CS - */ - spi->cs_gpio = (uintptr_t)spi->controller_data; - } - - if (gpio_is_valid(spi->cs_gpio)) { - gpio_direction_output(spi->cs_gpio, !(spi->mode & SPI_CS_HIGH)); - return 0; - } - - if (spi_controller_is_slave(p->ctlr)) + if (spi->cs_gpiod || spi_controller_is_slave(p->ctlr)) return 0; if (p->native_cs_inited && @@ -600,7 +586,7 @@ static int sh_msiof_prepare_message(struct spi_controller *ctlr, u32 ss, cs_high; /* Configure pins before asserting CS */ - if (gpio_is_valid(spi->cs_gpio)) { + if (spi->cs_gpiod) { ss = p->unused_ss; cs_high = p->native_cs_high; } else { @@ -1156,6 +1142,7 @@ static int sh_msiof_get_cs_gpios(struct sh_msiof_spi_priv *p) gpiod = devm_gpiod_get_index(dev, "cs", i, GPIOD_ASIS); if (!IS_ERR(gpiod)) { + devm_gpiod_put(dev, gpiod); cs_gpios++; continue; } @@ -1407,6 +1394,7 @@ static int sh_msiof_spi_probe(struct platform_device *pdev) ctlr->bits_per_word_mask = chipdata->bits_per_word_mask; ctlr->auto_runtime_pm = true; ctlr->transfer_one = sh_msiof_transfer_one; + ctlr->use_gpio_descriptors = true; ret = sh_msiof_request_dma(p); if (ret < 0) -- cgit v1.2.3-59-g8ed1b From 60574d1e05b094d222162260dd9cac49f4d0996a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 11 Mar 2019 14:55:57 -0600 Subject: acpi: Create subtable parsing infrastructure Parsing entries in an ACPI table had assumed a generic header structure. 
There is no standard ACPI header, though, so less common layouts with different field sizes required custom parsers to go through their subtable entry list. Create the infrastructure for adding different table types so the entry-parsing code can be reused for all ACPI system tables and the common code doesn't need to be duplicated. Reviewed-by: Rafael J. Wysocki Acked-by: Jonathan Cameron Tested-by: Jonathan Cameron Signed-off-by: Keith Busch Tested-by: Brice Goglin Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/acpi_numa.c | 2 +- arch/arm64/kernel/smp.c | 4 +- arch/ia64/kernel/acpi.c | 14 +++--- arch/x86/kernel/acpi/boot.c | 36 +++++++------- drivers/acpi/numa.c | 16 +++---- drivers/acpi/scan.c | 4 +- drivers/acpi/tables.c | 67 +++++++++++++++++++++++---- drivers/irqchip/irq-gic-v2m.c | 2 +- drivers/irqchip/irq-gic-v3-its-pci-msi.c | 2 +- drivers/irqchip/irq-gic-v3-its-platform-msi.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 6 +-- drivers/irqchip/irq-gic-v3.c | 10 ++-- drivers/irqchip/irq-gic.c | 4 +- drivers/mailbox/pcc.c | 2 +- include/linux/acpi.h | 5 +- 15 files changed, 113 insertions(+), 63 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c index eac1d0cc595c..7ff800045434 100644 --- a/arch/arm64/kernel/acpi_numa.c +++ b/arch/arm64/kernel/acpi_numa.c @@ -45,7 +45,7 @@ static inline int get_cpu_for_acpi_id(u32 uid) return -EINVAL; } -static int __init acpi_parse_gicc_pxm(struct acpi_subtable_header *header, +static int __init acpi_parse_gicc_pxm(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_gicc_affinity *pa; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 824de7038967..bb4b3f07761a 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -586,7 +586,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) } static int __init -acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, +acpi_parse_gic_cpu_interface(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *processor; @@ -595,7 +595,7 @@ acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, if (BAD_MADT_GICC_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_map_gic_cpu_interface(processor); diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 41eb281709da..1435e7a1a8cd 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -177,7 +177,7 @@ struct acpi_table_madt *acpi_madt __initdata; static u8 has_8259; static int __init -acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, +acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_override *lapic; @@ -195,7 +195,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lsapic(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_sapic *lsapic; @@ -216,7 +216,7 @@ acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end) } static int __init -acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_nmi *lacpi_nmi; @@ -230,7 +230,7 @@
acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e } static int __init -acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_iosapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_io_sapic *iosapic; @@ -245,7 +245,7 @@ acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end static unsigned int __initdata acpi_madt_rev; static int __init -acpi_parse_plat_int_src(struct acpi_subtable_header * header, +acpi_parse_plat_int_src(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_interrupt_source *plintsrc; @@ -329,7 +329,7 @@ unsigned int get_cpei_target_cpu(void) } static int __init -acpi_parse_int_src_ovr(struct acpi_subtable_header * header, +acpi_parse_int_src_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_interrupt_override *p; @@ -350,7 +350,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_nmi_source *nmi_src; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8dcbf6890714..9fc92e4539d8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -197,7 +197,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) } static int __init -acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) +acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; #ifdef CONFIG_X86_X2APIC @@ -210,7 +210,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); #ifdef CONFIG_X86_X2APIC apic_id = processor->local_apic_id; @@ -242,7 +242,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) } static int __init -acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic *processor = NULL; @@ -251,7 +251,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); /* Ignore invalid ID */ if (processor->id == 0xff) @@ -272,7 +272,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) } static int __init -acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) +acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_sapic *processor = NULL; @@ -281,7 +281,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ processor->processor_id, /* ACPI ID */ @@ -291,7 +291,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) } static int __init -acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, 
+acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; @@ -301,7 +301,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); acpi_lapic_addr = lapic_addr_ovr->address; @@ -309,7 +309,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, +acpi_parse_x2apic_nmi(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL; @@ -319,7 +319,7 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, if (BAD_MADT_ENTRY(x2apic_nmi, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (x2apic_nmi->lint != 1) printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); @@ -328,7 +328,7 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, } static int __init -acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic_nmi *lapic_nmi = NULL; @@ -337,7 +337,7 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e if (BAD_MADT_ENTRY(lapic_nmi, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (lapic_nmi->lint != 1) printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); @@ -449,7 +449,7 @@ static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, } static int __init -acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_ioapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { @@ -462,7 +462,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) if (BAD_MADT_ENTRY(ioapic, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */ if (ioapic->global_irq_base < nr_legacy_irqs()) @@ -508,7 +508,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, } static int __init -acpi_parse_int_src_ovr(struct acpi_subtable_header * header, +acpi_parse_int_src_ovr(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_interrupt_override *intsrc = NULL; @@ -518,7 +518,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(intsrc, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + acpi_table_print_madt_entry(&header->common); if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { acpi_sci_ioapic_setup(intsrc->source_irq, @@ -550,7 +550,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, } static int __init -acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) +acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_nmi_source *nmi_src = NULL; @@ -559,7 +559,7 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end if (BAD_MADT_ENTRY(nmi_src, end)) return -EINVAL; - acpi_table_print_madt_entry(header); + 
acpi_table_print_madt_entry(&header->common); /* TBD: Support nimsrc entries? */ diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 867f6e3f2b4f..30995834ad70 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -339,7 +339,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } static int __init -acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, +acpi_parse_x2apic_affinity(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_x2apic_cpu_affinity *processor_affinity; @@ -348,7 +348,7 @@ acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, if (!processor_affinity) return -EINVAL; - acpi_table_print_srat_entry(header); + acpi_table_print_srat_entry(&header->common); /* let architecture-dependent part to do it */ acpi_numa_x2apic_affinity_init(processor_affinity); @@ -357,7 +357,7 @@ acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, } static int __init -acpi_parse_processor_affinity(struct acpi_subtable_header *header, +acpi_parse_processor_affinity(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_cpu_affinity *processor_affinity; @@ -366,7 +366,7 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header, if (!processor_affinity) return -EINVAL; - acpi_table_print_srat_entry(header); + acpi_table_print_srat_entry(&header->common); /* let architecture-dependent part to do it */ acpi_numa_processor_affinity_init(processor_affinity); @@ -375,7 +375,7 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header, } static int __init -acpi_parse_gicc_affinity(struct acpi_subtable_header *header, +acpi_parse_gicc_affinity(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_srat_gicc_affinity *processor_affinity; @@ -384,7 +384,7 @@ acpi_parse_gicc_affinity(struct acpi_subtable_header *header, if (!processor_affinity) return -EINVAL; - acpi_table_print_srat_entry(header); + acpi_table_print_srat_entry(&header->common); /* let architecture-dependent part to do it */ acpi_numa_gicc_affinity_init(processor_affinity); @@ -395,7 +395,7 @@ acpi_parse_gicc_affinity(struct acpi_subtable_header *header, static int __initdata parsed_numa_memblks; static int __init -acpi_parse_memory_affinity(struct acpi_subtable_header * header, +acpi_parse_memory_affinity(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_srat_mem_affinity *memory_affinity; @@ -404,7 +404,7 @@ acpi_parse_memory_affinity(struct acpi_subtable_header * header, if (!memory_affinity) return -EINVAL; - acpi_table_print_srat_entry(header); + acpi_table_print_srat_entry(&header->common); /* let architecture-dependent part to do it */ if (!acpi_numa_memory_affinity_init(memory_affinity)) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 446c959a8f08..f7771a3b4a3e 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2241,10 +2241,10 @@ static struct acpi_probe_entry *ape; static int acpi_probe_count; static DEFINE_MUTEX(acpi_probe_mutex); -static int __init acpi_match_madt(struct acpi_subtable_header *header, +static int __init acpi_match_madt(union acpi_subtable_headers *header, const unsigned long end) { - if (!ape->subtable_valid || ape->subtable_valid(header, ape)) + if (!ape->subtable_valid || ape->subtable_valid(&header->common, ape)) if (!ape->probe_subtbl(header, end)) acpi_probe_count++; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 8fccbe49612a..7553774a22b7 100644 --- a/drivers/acpi/tables.c +++ 
b/drivers/acpi/tables.c @@ -49,6 +49,15 @@ static struct acpi_table_desc initial_tables[ACPI_MAX_TABLES] __initdata; static int acpi_apic_instance __initdata; +enum acpi_subtable_type { + ACPI_SUBTABLE_COMMON, +}; + +struct acpi_subtable_entry { + union acpi_subtable_headers *hdr; + enum acpi_subtable_type type; +}; + /* * Disable table checksum verification for the early stage due to the size * limitation of the current x86 early mapping implementation. @@ -217,6 +226,42 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) } } +static unsigned long __init +acpi_get_entry_type(struct acpi_subtable_entry *entry) +{ + switch (entry->type) { + case ACPI_SUBTABLE_COMMON: + return entry->hdr->common.type; + } + return 0; +} + +static unsigned long __init +acpi_get_entry_length(struct acpi_subtable_entry *entry) +{ + switch (entry->type) { + case ACPI_SUBTABLE_COMMON: + return entry->hdr->common.length; + } + return 0; +} + +static unsigned long __init +acpi_get_subtable_header_length(struct acpi_subtable_entry *entry) +{ + switch (entry->type) { + case ACPI_SUBTABLE_COMMON: + return sizeof(entry->hdr->common); + } + return 0; +} + +static enum acpi_subtable_type __init +acpi_get_subtable_type(char *id) +{ + return ACPI_SUBTABLE_COMMON; +} + /** * acpi_parse_entries_array - for each proc_num find a suitable subtable * @@ -246,8 +291,8 @@ acpi_parse_entries_array(char *id, unsigned long table_size, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries) { - struct acpi_subtable_header *entry; - unsigned long table_end; + struct acpi_subtable_entry entry; + unsigned long table_end, subtable_len, entry_len; int count = 0; int errs = 0; int i; @@ -270,19 +315,20 @@ acpi_parse_entries_array(char *id, unsigned long table_size, /* Parse all entries looking for a match. */ - entry = (struct acpi_subtable_header *) + entry.type = acpi_get_subtable_type(id); + entry.hdr = (union acpi_subtable_headers *) ((unsigned long)table_header + table_size); + subtable_len = acpi_get_subtable_header_length(&entry); - while (((unsigned long)entry) + sizeof(struct acpi_subtable_header) < - table_end) { + while (((unsigned long)entry.hdr) + subtable_len < table_end) { if (max_entries && count >= max_entries) break; for (i = 0; i < proc_num; i++) { - if (entry->type != proc[i].id) + if (acpi_get_entry_type(&entry) != proc[i].id) continue; if (!proc[i].handler || - (!errs && proc[i].handler(entry, table_end))) { + (!errs && proc[i].handler(entry.hdr, table_end))) { errs++; continue; } @@ -297,13 +343,14 @@ acpi_parse_entries_array(char *id, unsigned long table_size, * If entry->length is 0, break from this loop to avoid * infinite loop. 
*/ - if (entry->length == 0) { + entry_len = acpi_get_entry_length(&entry); + if (entry_len == 0) { pr_err("[%4.4s:0x%02x] Invalid zero length\n", id, proc->id); return -EINVAL; } - entry = (struct acpi_subtable_header *) - ((unsigned long)entry + entry->length); + entry.hdr = (union acpi_subtable_headers *) + ((unsigned long)entry.hdr + entry_len); } if (max_entries && count > max_entries) { diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index f5fe0100f9ff..de14e06fd9ec 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -446,7 +446,7 @@ static struct fwnode_handle *gicv2m_get_fwnode(struct device *dev) } static int __init -acpi_parse_madt_msi(struct acpi_subtable_header *header, +acpi_parse_madt_msi(union acpi_subtable_headers *header, const unsigned long end) { int ret; diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c index 8d6d009d1d58..c81d5b81da56 100644 --- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c @@ -159,7 +159,7 @@ static int __init its_pci_of_msi_init(void) #ifdef CONFIG_ACPI static int __init -its_pci_msi_parse_madt(struct acpi_subtable_header *header, +its_pci_msi_parse_madt(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_translator *its_entry; diff --git a/drivers/irqchip/irq-gic-v3-its-platform-msi.c b/drivers/irqchip/irq-gic-v3-its-platform-msi.c index 7b8e87b493fe..9cdcda5bb3bd 100644 --- a/drivers/irqchip/irq-gic-v3-its-platform-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-platform-msi.c @@ -117,7 +117,7 @@ static int __init its_pmsi_init_one(struct fwnode_handle *fwnode, #ifdef CONFIG_ACPI static int __init -its_pmsi_parse_madt(struct acpi_subtable_header *header, +its_pmsi_parse_madt(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_translator *its_entry; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 7577755bdcf4..128ac893d7e4 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3830,13 +3830,13 @@ static int __init acpi_get_its_numa_node(u32 its_id) return NUMA_NO_NODE; } -static int __init gic_acpi_match_srat_its(struct acpi_subtable_header *header, +static int __init gic_acpi_match_srat_its(union acpi_subtable_headers *header, const unsigned long end) { return 0; } -static int __init gic_acpi_parse_srat_its(struct acpi_subtable_header *header, +static int __init gic_acpi_parse_srat_its(union acpi_subtable_headers *header, const unsigned long end) { int node; @@ -3903,7 +3903,7 @@ static int __init acpi_get_its_numa_node(u32 its_id) { return NUMA_NO_NODE; } static void __init acpi_its_srat_maps_free(void) { } #endif -static int __init gic_acpi_parse_madt_its(struct acpi_subtable_header *header, +static int __init gic_acpi_parse_madt_its(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_translator *its_entry; diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 15e55d327505..f44cd89cfc40 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1593,7 +1593,7 @@ gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) } static int __init -gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, +gic_acpi_parse_madt_redist(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_redistributor *redist = @@ -1611,7 +1611,7 @@ 
gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, } static int __init -gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header, +gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = @@ -1653,14 +1653,14 @@ static int __init gic_acpi_collect_gicr_base(void) return -ENODEV; } -static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, +static int __init gic_acpi_match_gicr(union acpi_subtable_headers *header, const unsigned long end) { /* Subtable presence means that redist exists, that's it */ return 0; } -static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header, +static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = @@ -1726,7 +1726,7 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, return true; } -static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header, +static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index fd3110c171ba..c6dbe5018972 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1495,7 +1495,7 @@ static struct } acpi_data __initdata; static int __init -gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, +gic_acpi_parse_madt_cpu(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *processor; @@ -1527,7 +1527,7 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, } /* The things you have to do to just *count* something... */ -static int __init acpi_dummy_func(struct acpi_subtable_header *header, +static int __init acpi_dummy_func(union acpi_subtable_headers *header, const unsigned long end) { return 0; diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index 256f18b67e8a..08a0a3517138 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -382,7 +382,7 @@ static const struct mbox_chan_ops pcc_chan_ops = { * * This gets called for each entry in the PCC table. */ -static int parse_pcc_subspace(struct acpi_subtable_header *header, +static int parse_pcc_subspace(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_pcct_subspace *ss = (struct acpi_pcct_subspace *) header; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d5dcebd7aad3..9494d42bf507 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -141,10 +141,13 @@ enum acpi_address_range_id { /* Table Handlers */ +union acpi_subtable_headers { + struct acpi_subtable_header common; +}; typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); -typedef int (*acpi_tbl_entry_handler)(struct acpi_subtable_header *header, +typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header, const unsigned long end); /* Debugger support */ -- cgit v1.2.3-59-g8ed1b From e6046b5e69a070f8a1af868bba788cf796bcd9a8 Mon Sep 17 00:00:00 2001 From: Chuanhong Guo Date: Thu, 21 Mar 2019 22:42:51 +0800 Subject: MIPS: ralink: fix cpu clock of mt7621 and add dt clk devices For a long time the mt7621 has used a fixed cpu clock, which causes a problem if the cpu frequency is not 880MHz. This patch fixes the cpu clock calculation and adds the cpu/bus clkdev which will be used in dts.
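For readers skimming the flattened diff, a condensed sketch of the clkdev registration helper the patch adds (clk_register_fixed_rate() and clk_register_clkdev() are the common clock framework APIs used below; the panic-on-failure policy matches the patch):

	static struct clk *__init mt7621_add_sys_clkdev(const char *id,
							unsigned long rate)
	{
		struct clk *clk;

		/* Register a fixed-rate clock and a clkdev lookup under the
		 * same name, so both DT consumers and clk_get() users can
		 * find it. */
		clk = clk_register_fixed_rate(NULL, id, NULL, 0, rate);
		if (IS_ERR(clk))
			panic("failed to allocate %s clock structure", id);

		if (clk_register_clkdev(clk, id, NULL))
			panic("unable to register %s clock device", id);

		return clk;
	}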
Ported from OpenWrt: c7ca224299 ramips: fix cpu clock of mt7621 and add dt clk devices Signed-off-by: Weijie Gao Signed-off-by: Chuanhong Guo Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org Cc: Ralf Baechle Cc: James Hogan Cc: John Crispin Cc: linux-kernel@vger.kernel.org --- arch/mips/include/asm/mach-ralink/mt7621.h | 20 ++++++ arch/mips/ralink/mt7621.c | 102 ++++++++++++++++++++--------- arch/mips/ralink/timer-gic.c | 4 +- 3 files changed, 93 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/mach-ralink/mt7621.h b/arch/mips/include/asm/mach-ralink/mt7621.h index a672e06fa5fd..b8a8834164c8 100644 --- a/arch/mips/include/asm/mach-ralink/mt7621.h +++ b/arch/mips/include/asm/mach-ralink/mt7621.h @@ -19,6 +19,10 @@ #define SYSC_REG_CHIP_REV 0x0c #define SYSC_REG_SYSTEM_CONFIG0 0x10 #define SYSC_REG_SYSTEM_CONFIG1 0x14 +#define SYSC_REG_CLKCFG0 0x2c +#define SYSC_REG_CUR_CLK_STS 0x44 + +#define MEMC_REG_CPU_PLL 0x648 #define CHIP_REV_PKG_MASK 0x1 #define CHIP_REV_PKG_SHIFT 16 @@ -26,6 +30,22 @@ #define CHIP_REV_VER_SHIFT 8 #define CHIP_REV_ECO_MASK 0xf +#define XTAL_MODE_SEL_MASK 0x7 +#define XTAL_MODE_SEL_SHIFT 6 + +#define CPU_CLK_SEL_MASK 0x3 +#define CPU_CLK_SEL_SHIFT 30 + +#define CUR_CPU_FDIV_MASK 0x1f +#define CUR_CPU_FDIV_SHIFT 8 +#define CUR_CPU_FFRAC_MASK 0x1f +#define CUR_CPU_FFRAC_SHIFT 0 + +#define CPU_PLL_PREDIV_MASK 0x3 +#define CPU_PLL_PREDIV_SHIFT 12 +#define CPU_PLL_FBDIV_MASK 0x7f +#define CPU_PLL_FBDIV_SHIFT 4 + #define MT7621_DRAM_BASE 0x0 #define MT7621_DDR2_SIZE_MIN 32 #define MT7621_DDR2_SIZE_MAX 256 diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index d2718de60b9b..6828eefb0a85 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -9,22 +9,22 @@ #include #include +#include +#include +#include +#include #include #include #include #include #include +#include #include #include "common.h" -#define SYSC_REG_SYSCFG 0x10 -#define SYSC_REG_CPLL_CLKCFG0 0x2c -#define SYSC_REG_CUR_CLK_STS 0x44 -#define CPU_CLK_SEL (BIT(30) | BIT(31)) - #define MT7621_GPIO_MODE_UART1 1 #define MT7621_GPIO_MODE_I2C 2 #define MT7621_GPIO_MODE_UART3_MASK 0x3 @@ -110,49 +110,89 @@ static struct rt2880_pmx_group mt7621_pinmux_data[] = { { 0 } }; +static struct clk *clks[MT7621_CLK_MAX]; +static struct clk_onecell_data clk_data = { + .clks = clks, + .clk_num = ARRAY_SIZE(clks), +}; + phys_addr_t mips_cpc_default_phys_base(void) { panic("Cannot detect cpc address"); } +static struct clk *__init mt7621_add_sys_clkdev( + const char *id, unsigned long rate) +{ + struct clk *clk; + int err; + + clk = clk_register_fixed_rate(NULL, id, NULL, 0, rate); + if (IS_ERR(clk)) + panic("failed to allocate %s clock structure", id); + + err = clk_register_clkdev(clk, id, NULL); + if (err) + panic("unable to register %s clock device", id); + + return clk; +} + void __init ralink_clk_init(void) { - int cpu_fdiv = 0; - int cpu_ffrac = 0; - int fbdiv = 0; - u32 clk_sts, syscfg; - u8 clk_sel = 0, xtal_mode; - u32 cpu_clk; + const static u32 prediv_tbl[] = {0, 1, 2, 2}; + u32 syscfg, xtal_sel, clkcfg, clk_sel, curclk, ffiv, ffrac; + u32 pll, prediv, fbdiv; + u32 xtal_clk, cpu_clk, bus_clk; + + syscfg = rt_sysc_r32(SYSC_REG_SYSTEM_CONFIG0); + xtal_sel = (syscfg >> XTAL_MODE_SEL_SHIFT) & XTAL_MODE_SEL_MASK; - if ((rt_sysc_r32(SYSC_REG_CPLL_CLKCFG0) & CPU_CLK_SEL) != 0) - clk_sel = 1; + clkcfg = rt_sysc_r32(SYSC_REG_CLKCFG0); + clk_sel = (clkcfg >> CPU_CLK_SEL_SHIFT) & CPU_CLK_SEL_MASK; + + curclk = rt_sysc_r32(SYSC_REG_CUR_CLK_STS); + ffiv = 
(curclk >> CUR_CPU_FDIV_SHIFT) & CUR_CPU_FDIV_MASK; + ffrac = (curclk >> CUR_CPU_FFRAC_SHIFT) & CUR_CPU_FFRAC_MASK; + + if (xtal_sel <= 2) + xtal_clk = 20 * 1000 * 1000; + else if (xtal_sel <= 5) + xtal_clk = 40 * 1000 * 1000; + else + xtal_clk = 25 * 1000 * 1000; switch (clk_sel) { case 0: - clk_sts = rt_sysc_r32(SYSC_REG_CUR_CLK_STS); - cpu_fdiv = ((clk_sts >> 8) & 0x1F); - cpu_ffrac = (clk_sts & 0x1F); - cpu_clk = (500 * cpu_ffrac / cpu_fdiv) * 1000 * 1000; + cpu_clk = 500 * 1000 * 1000; break; - case 1: - fbdiv = ((rt_sysc_r32(0x648) >> 4) & 0x7F) + 1; - syscfg = rt_sysc_r32(SYSC_REG_SYSCFG); - xtal_mode = (syscfg >> 6) & 0x7; - if (xtal_mode >= 6) { - /* 25Mhz Xtal */ - cpu_clk = 25 * fbdiv * 1000 * 1000; - } else if (xtal_mode >= 3) { - /* 40Mhz Xtal */ - cpu_clk = 40 * fbdiv * 1000 * 1000; - } else { - /* 20Mhz Xtal */ - cpu_clk = 20 * fbdiv * 1000 * 1000; - } + pll = rt_memc_r32(MEMC_REG_CPU_PLL); + fbdiv = (pll >> CPU_PLL_FBDIV_SHIFT) & CPU_PLL_FBDIV_MASK; + prediv = (pll >> CPU_PLL_PREDIV_SHIFT) & CPU_PLL_PREDIV_MASK; + cpu_clk = ((fbdiv + 1) * xtal_clk) >> prediv_tbl[prediv]; break; + default: + cpu_clk = xtal_clk; } + + cpu_clk = cpu_clk / ffiv * ffrac; + bus_clk = cpu_clk / 4; + + clks[MT7621_CLK_CPU] = mt7621_add_sys_clkdev("cpu", cpu_clk); + clks[MT7621_CLK_BUS] = mt7621_add_sys_clkdev("bus", bus_clk); + + pr_info("CPU Clock: %dMHz\n", cpu_clk / 1000000); + mips_hpt_frequency = cpu_clk / 2; } +static void __init mt7621_clocks_init_dt(struct device_node *np) +{ + of_clk_add_provider(np, of_clk_src_onecell_get, &clk_data); +} + +CLK_OF_DECLARE(mt7621_clk, "mediatek,mt7621-pll", mt7621_clocks_init_dt); + void __init ralink_of_remap(void) { rt_sysc_membase = plat_of_remap_node("mtk,mt7621-sysc"); diff --git a/arch/mips/ralink/timer-gic.c b/arch/mips/ralink/timer-gic.c index b5f07d21fcf2..4fed842ae8eb 100644 --- a/arch/mips/ralink/timer-gic.c +++ b/arch/mips/ralink/timer-gic.c @@ -11,14 +11,14 @@ #include #include -#include +#include #include "common.h" void __init plat_time_init(void) { ralink_of_remap(); - + ralink_clk_init(); of_clk_init(NULL); timer_probe(); } -- cgit v1.2.3-59-g8ed1b From 72deb455b5ec619ff043c30bc90025aa3de3cdda Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Apr 2019 18:08:59 +0200 Subject: block: remove CONFIG_LBDAF Currently, support for 64-bit sector_t and blkcnt_t is optional on 32-bit architectures. These types are required to support block device and/or file sizes larger than 2 TiB, and have generally defaulted to on for a long time. Enabling the option only increases the i386 tinyconfig size by 145 bytes, and many data structures already always use 64-bit values for their in-core and on-disk data anyway, so there should not be a large change in dynamic memory usage either. Dropping this option removes a somewhat weird non-default config that has caused various bugs or compiler warnings when actually used.
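To make the 2 TiB figure concrete, a small standalone illustration (userspace C, not kernel code) of the capacity ceiling a 32-bit sector_t imposes with 512-byte sectors:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* 2^32 sectors of 512 bytes each: the addressable limit
		 * when sector_t is only 32 bits wide. */
		uint64_t max_bytes = (UINT32_MAX + 1ULL) * 512ULL;

		printf("%llu bytes = %llu TiB\n",
		       (unsigned long long)max_bytes,
		       (unsigned long long)(max_bytes >> 40));
		return 0;
	}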
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/process/submit-checklist.rst | 27 ++++++++---------- Documentation/translations/ja_JP/SubmitChecklist | 22 ++++++--------- arch/arc/configs/haps_hs_defconfig | 1 - arch/arc/configs/haps_hs_smp_defconfig | 1 - arch/arc/configs/nsim_700_defconfig | 1 - arch/arc/configs/nsim_hs_defconfig | 1 - arch/arc/configs/nsim_hs_smp_defconfig | 1 - arch/arc/configs/nsimosci_defconfig | 1 - arch/arc/configs/nsimosci_hs_defconfig | 1 - arch/arc/configs/nsimosci_hs_smp_defconfig | 1 - arch/arm/configs/aspeed_g4_defconfig | 1 - arch/arm/configs/aspeed_g5_defconfig | 1 - arch/arm/configs/at91_dt_defconfig | 1 - arch/arm/configs/clps711x_defconfig | 1 - arch/arm/configs/efm32_defconfig | 1 - arch/arm/configs/ezx_defconfig | 1 - arch/arm/configs/h3600_defconfig | 1 - arch/arm/configs/imote2_defconfig | 1 - arch/arm/configs/moxart_defconfig | 1 - arch/arm/configs/multi_v4t_defconfig | 1 - arch/arm/configs/omap1_defconfig | 1 - arch/arm/configs/stm32_defconfig | 1 - arch/arm/configs/u300_defconfig | 1 - arch/arm/configs/vexpress_defconfig | 1 - arch/m68k/configs/amcore_defconfig | 1 - arch/m68k/configs/m5475evb_defconfig | 1 - arch/m68k/configs/stmark2_defconfig | 1 - arch/mips/configs/ar7_defconfig | 1 - arch/mips/configs/decstation_defconfig | 1 - arch/mips/configs/decstation_r4k_defconfig | 1 - arch/mips/configs/loongson1b_defconfig | 1 - arch/mips/configs/loongson1c_defconfig | 1 - arch/mips/configs/rb532_defconfig | 1 - arch/mips/configs/rbtx49xx_defconfig | 1 - arch/parisc/configs/generic-32bit_defconfig | 1 - arch/sh/configs/apsh4ad0a_defconfig | 1 - arch/sh/configs/ecovec24-romimage_defconfig | 1 - arch/sh/configs/rsk7264_defconfig | 1 - arch/sh/configs/rsk7269_defconfig | 1 - arch/sh/configs/sh7785lcr_32bit_defconfig | 1 - block/Kconfig | 24 ---------------- drivers/block/drbd/drbd_int.h | 5 ---- drivers/block/ps3disk.c | 4 +-- drivers/md/dm-exception-store.h | 28 ++----------------- drivers/md/dm-integrity.c | 8 ++---- drivers/md/md.c | 6 ++-- drivers/nvdimm/pfn_devs.c | 4 +-- drivers/scsi/sd.c | 32 ---------------------- fs/ext4/resize.c | 2 -- fs/ext4/super.c | 32 +++++----------------- fs/gfs2/Kconfig | 1 - fs/nfs/Kconfig | 1 - fs/ocfs2/super.c | 10 ------- fs/stack.c | 15 +++++----- fs/xfs/Kconfig | 1 - fs/xfs/xfs_super.c | 10 +------ include/linux/genhd.h | 8 +++--- include/linux/kernel.h | 14 ++-------- include/linux/types.h | 5 ---- lib/Kconfig.debug | 1 - .../formal/srcu-cbmc/include/linux/types.h | 4 --- 61 files changed, 52 insertions(+), 250 deletions(-) (limited to 'arch') diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index 367353c54949..c88867b173d9 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -72,47 +72,44 @@ and elsewhere regarding submitting Linux kernel patches. 13) Has been build- and runtime tested with and without ``CONFIG_SMP`` and ``CONFIG_PREEMPT.`` -14) If the patch affects IO/Disk, etc: has been tested with and without - ``CONFIG_LBDAF.`` +16) All codepaths have been exercised with all lockdep features enabled. -15) All codepaths have been exercised with all lockdep features enabled. +17) All new ``/proc`` entries are documented under ``Documentation/`` -16) All new ``/proc`` entries are documented under ``Documentation/`` - -17) All new kernel boot parameters are documented in +18) All new kernel boot parameters are documented in ``Documentation/admin-guide/kernel-parameters.rst``. 
-18) All new module parameters are documented with ``MODULE_PARM_DESC()`` +19) All new module parameters are documented with ``MODULE_PARM_DESC()`` -19) All new userspace interfaces are documented in ``Documentation/ABI/``. +20) All new userspace interfaces are documented in ``Documentation/ABI/``. See ``Documentation/ABI/README`` for more information. Patches that change userspace interfaces should be CCed to linux-api@vger.kernel.org. -20) Check that it all passes ``make headers_check``. +21) Check that it all passes ``make headers_check``. -21) Has been checked with injection of at least slab and page-allocation +22) Has been checked with injection of at least slab and page-allocation failures. See ``Documentation/fault-injection/``. If the new code is substantial, addition of subsystem-specific fault injection might be appropriate. -22) Newly-added code has been compiled with ``gcc -W`` (use +23) Newly-added code has been compiled with ``gcc -W`` (use ``make EXTRA_CFLAGS=-W``). This will generate lots of noise, but is good for finding bugs like "warning: comparison between signed and unsigned". -23) Tested after it has been merged into the -mm patchset to make sure +24) Tested after it has been merged into the -mm patchset to make sure that it still works with all of the other queued patches and various changes in the VM, VFS, and other subsystems. -24) All memory barriers {e.g., ``barrier()``, ``rmb()``, ``wmb()``} need a +25) All memory barriers {e.g., ``barrier()``, ``rmb()``, ``wmb()``} need a comment in the source code that explains the logic of what they are doing and why. -25) If any ioctl's are added by the patch, then also update +26) If any ioctl's are added by the patch, then also update ``Documentation/ioctl/ioctl-number.txt``. -26) If your modified source code depends on or uses any of the kernel +27) If your modified source code depends on or uses any of the kernel APIs or features that are related to the following ``Kconfig`` symbols, then test multiple builds with the related ``Kconfig`` symbols disabled and/or ``=m`` (if that option is available) [not all of these at the diff --git a/Documentation/translations/ja_JP/SubmitChecklist b/Documentation/translations/ja_JP/SubmitChecklist index 60c7c35ac517..b42220d3d46c 100644 --- a/Documentation/translations/ja_JP/SubmitChecklist +++ b/Documentation/translations/ja_JP/SubmitChecklist @@ -74,38 +74,34 @@ Linux カーネルパッチ投稿者向けチェックリスト 13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で ビルドした上、動作確認を行ってください。 -14: もしパッチがディスクのI/O性能などに影響を与えるようであれば、 - 'CONFIG_LBDAF'オプションを有効にした場合と無効にした場合の両方で - テストを実施してみてください。 +14: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。 -15: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。 - -16: /proc に新しいエントリを追加した場合には、Documentation/ 配下に +15: /proc に新しいエントリを追加した場合には、Documentation/ 配下に 必ずドキュメントを追加してください。 -17: 新しいブートパラメータを追加した場合には、 +16: 新しいブートパラメータを追加した場合には、 必ずDocumentation/admin-guide/kernel-parameters.rst に説明を追加してください。 -18: 新しくmoduleにパラメータを追加した場合には、MODULE_PARM_DESC()を +17: 新しくmoduleにパラメータを追加した場合には、MODULE_PARM_DESC()を 利用して必ずその説明を記述してください。 -19: 新しいuserspaceインタフェースを作成した場合には、Documentation/ABI/ に +18: 新しいuserspaceインタフェースを作成した場合には、Documentation/ABI/ に Documentation/ABI/README を参考にして必ずドキュメントを追加してください。 -20: 'make headers_check'を実行して全く問題がないことを確認してください。 +19: 'make headers_check'を実行して全く問題がないことを確認してください。 -21: 少なくともslabアロケーションとpageアロケーションに失敗した場合の +20: 少なくともslabアロケーションとpageアロケーションに失敗した場合の 挙動について、fault-injectionを利用して確認してください。 Documentation/fault-injection/ を参照してください。 追加したコードがかなりの量であったならば、サブシステム特有の fault-injectionを追加したほうが良いかもしれません。 -22: 
新たに追加したコードは、`gcc -W'でコンパイルしてください。 +21: 新たに追加したコードは、`gcc -W'でコンパイルしてください。 このオプションは大量の不要なメッセージを出力しますが、 "warning: comparison between signed and unsigned" のようなメッセージは、 バグを見つけるのに役に立ちます。 -23: 投稿したパッチが -mm パッチセットにマージされた後、全ての既存のパッチや +22: 投稿したパッチが -mm パッチセットにマージされた後、全ての既存のパッチや VM, VFS およびその他のサブシステムに関する様々な変更と、現時点でも共存 できることを確認するテストを行ってください。 diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig index f56cc2070c11..b117e6c16d41 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -15,7 +15,6 @@ CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set CONFIG_SLAB=y CONFIG_MODULES=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index b6f2482c7e74..33a787c375e2 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -17,7 +17,6 @@ CONFIG_PERF_EVENTS=y CONFIG_SLAB=y CONFIG_KPROBES=y CONFIG_MODULES=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig index 318e4cd29629..de398c7b10b3 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -18,7 +18,6 @@ CONFIG_PERF_EVENTS=y CONFIG_ISA_ARCOMPACT=y CONFIG_KPROBES=y CONFIG_MODULES=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig index c15807b0e0c1..2dbd34a9ff07 100644 --- a/arch/arc/configs/nsim_hs_defconfig +++ b/arch/arc/configs/nsim_hs_defconfig @@ -20,7 +20,6 @@ CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig index 65e983fd942b..c7135f1e2583 100644 --- a/arch/arc/configs/nsim_hs_smp_defconfig +++ b/arch/arc/configs/nsim_hs_smp_defconfig @@ -18,7 +18,6 @@ CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index 08c5b99ac341..385a71d3c478 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -18,7 +18,6 @@ CONFIG_PERF_EVENTS=y CONFIG_ISA_ARCOMPACT=y CONFIG_KPROBES=y CONFIG_MODULES=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig index 5b5e26d67955..248a2c3bdc12 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -17,7 +17,6 @@ CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y CONFIG_MODULES=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig index 
26af9b2f7fcb..1a4bc7b660fb 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -12,7 +12,6 @@ CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y CONFIG_MODULES=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig index 1446262921b4..bdbade6af9c7 100644 --- a/arch/arm/configs/aspeed_g4_defconfig +++ b/arch/arm/configs/aspeed_g4_defconfig @@ -23,7 +23,6 @@ CONFIG_SLAB_FREELIST_RANDOM=y CONFIG_JUMP_LABEL=y CONFIG_STRICT_KERNEL_RWX=y CONFIG_GCC_PLUGINS=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEBUG_FS is not set # CONFIG_IOSCHED_DEADLINE is not set diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 02fa3a41add5..4bde84eae4eb 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -23,7 +23,6 @@ CONFIG_SLAB_FREELIST_RANDOM=y CONFIG_JUMP_LABEL=y CONFIG_STRICT_KERNEL_RWX=y CONFIG_GCC_PLUGINS=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEBUG_FS is not set # CONFIG_IOSCHED_DEADLINE is not set diff --git a/arch/arm/configs/at91_dt_defconfig b/arch/arm/configs/at91_dt_defconfig index e4b1be66b3f5..b7752929975c 100644 --- a/arch/arm/configs/at91_dt_defconfig +++ b/arch/arm/configs/at91_dt_defconfig @@ -9,7 +9,6 @@ CONFIG_EMBEDDED=y CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arm/configs/clps711x_defconfig b/arch/arm/configs/clps711x_defconfig index fc105c9178cc..09ae750164e0 100644 --- a/arch/arm/configs/clps711x_defconfig +++ b/arch/arm/configs/clps711x_defconfig @@ -6,7 +6,6 @@ CONFIG_RD_LZMA=y CONFIG_EMBEDDED=y CONFIG_SLOB=y CONFIG_JUMP_LABEL=y -# CONFIG_LBDAF is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_CLPS711X=y diff --git a/arch/arm/configs/efm32_defconfig b/arch/arm/configs/efm32_defconfig index ee42158f41ec..10ea92513a69 100644 --- a/arch/arm/configs/efm32_defconfig +++ b/arch/arm/configs/efm32_defconfig @@ -11,7 +11,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_SLUB_DEBUG is not set -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arm/configs/ezx_defconfig b/arch/arm/configs/ezx_defconfig index 484e51fbd4a6..e3afca5bd9d6 100644 --- a/arch/arm/configs/ezx_defconfig +++ b/arch/arm/configs/ezx_defconfig @@ -13,7 +13,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y diff --git a/arch/arm/configs/h3600_defconfig b/arch/arm/configs/h3600_defconfig index ebeca11faa48..175881b7da7c 100644 --- a/arch/arm/configs/h3600_defconfig +++ b/arch/arm/configs/h3600_defconfig @@ -4,7 +4,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y CONFIG_MODULES=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arm/configs/imote2_defconfig b/arch/arm/configs/imote2_defconfig index f204017c26b9..9b779e13e05d 100644 --- 
a/arch/arm/configs/imote2_defconfig +++ b/arch/arm/configs/imote2_defconfig @@ -12,7 +12,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig index 078228a19339..6a11669fa536 100644 --- a/arch/arm/configs/moxart_defconfig +++ b/arch/arm/configs/moxart_defconfig @@ -15,7 +15,6 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_SLUB_DEBUG is not set # CONFIG_COMPAT_BRK is not set -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_ARCH_MULTI_V4=y diff --git a/arch/arm/configs/multi_v4t_defconfig b/arch/arm/configs/multi_v4t_defconfig index 9a6390c172d6..eeea0c41138b 100644 --- a/arch/arm/configs/multi_v4t_defconfig +++ b/arch/arm/configs/multi_v4t_defconfig @@ -5,7 +5,6 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_EMBEDDED=y CONFIG_SLOB=y CONFIG_JUMP_LABEL=y -# CONFIG_LBDAF is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_MULTI_V4T=y diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig index cfc00b0961ec..8448a7f407a4 100644 --- a/arch/arm/configs/omap1_defconfig +++ b/arch/arm/configs/omap1_defconfig @@ -17,7 +17,6 @@ CONFIG_OPROFILE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig index 0258ba891376..152321d2893e 100644 --- a/arch/arm/configs/stm32_defconfig +++ b/arch/arm/configs/stm32_defconfig @@ -13,7 +13,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_SLUB_DEBUG is not set -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arm/configs/u300_defconfig b/arch/arm/configs/u300_defconfig index 36d77406e31b..831ba6a9ee8b 100644 --- a/arch/arm/configs/u300_defconfig +++ b/arch/arm/configs/u300_defconfig @@ -9,7 +9,6 @@ CONFIG_EXPERT=y # CONFIG_VM_EVENT_COUNTERS is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/arm/configs/vexpress_defconfig b/arch/arm/configs/vexpress_defconfig index 392ed3b3613c..484d77a7f589 100644 --- a/arch/arm/configs/vexpress_defconfig +++ b/arch/arm/configs/vexpress_defconfig @@ -14,7 +14,6 @@ CONFIG_PROFILING=y CONFIG_OPROFILE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig index 0857cdbfde0c..d5e683dd885d 100644 --- a/arch/m68k/configs/amcore_defconfig +++ b/arch/m68k/configs/amcore_defconfig @@ -12,7 +12,6 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_SLUB_DEBUG is not set # CONFIG_COMPAT_BRK is not set -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_CFQ is not set # CONFIG_MMU is not set diff --git a/arch/m68k/configs/m5475evb_defconfig b/arch/m68k/configs/m5475evb_defconfig index 4f4ccd13c11b..434bd3750966 100644 --- a/arch/m68k/configs/m5475evb_defconfig +++ 
b/arch/m68k/configs/m5475evb_defconfig @@ -11,7 +11,6 @@ CONFIG_SYSCTL_SYSCALL=y # CONFIG_AIO is not set CONFIG_EMBEDDED=y CONFIG_MODULES=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig index 69f23c7b0497..27fa9465d19d 100644 --- a/arch/m68k/configs/stmark2_defconfig +++ b/arch/m68k/configs/stmark2_defconfig @@ -17,7 +17,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_BLK_CMDLINE_PARSER=y # CONFIG_MMU is not set diff --git a/arch/mips/configs/ar7_defconfig b/arch/mips/configs/ar7_defconfig index 9fbfb6e5c7d2..c83fdf649327 100644 --- a/arch/mips/configs/ar7_defconfig +++ b/arch/mips/configs/ar7_defconfig @@ -18,7 +18,6 @@ CONFIG_KEXEC=y # CONFIG_SECCOMP is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y CONFIG_BSD_DISKLABEL=y diff --git a/arch/mips/configs/decstation_defconfig b/arch/mips/configs/decstation_defconfig index 0c86ed86266a..30a6eafdb1d0 100644 --- a/arch/mips/configs/decstation_defconfig +++ b/arch/mips/configs/decstation_defconfig @@ -17,7 +17,6 @@ CONFIG_TC=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_SRCVERSION_ALL=y -# CONFIG_LBDAF is not set CONFIG_PARTITION_ADVANCED=y CONFIG_OSF_PARTITION=y # CONFIG_EFI_PARTITION is not set diff --git a/arch/mips/configs/decstation_r4k_defconfig b/arch/mips/configs/decstation_r4k_defconfig index 0e54ab2680ce..e2b58dbf4aa9 100644 --- a/arch/mips/configs/decstation_r4k_defconfig +++ b/arch/mips/configs/decstation_r4k_defconfig @@ -16,7 +16,6 @@ CONFIG_TC=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_SRCVERSION_ALL=y -# CONFIG_LBDAF is not set CONFIG_PARTITION_ADVANCED=y CONFIG_OSF_PARTITION=y # CONFIG_EFI_PARTITION is not set diff --git a/arch/mips/configs/loongson1b_defconfig b/arch/mips/configs/loongson1b_defconfig index b064d68a5424..aa7e98c5f5fc 100644 --- a/arch/mips/configs/loongson1b_defconfig +++ b/arch/mips/configs/loongson1b_defconfig @@ -19,7 +19,6 @@ CONFIG_MACH_LOONGSON32=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_NET=y diff --git a/arch/mips/configs/loongson1c_defconfig b/arch/mips/configs/loongson1c_defconfig index 5d76559b56cd..520e7ef35383 100644 --- a/arch/mips/configs/loongson1c_defconfig +++ b/arch/mips/configs/loongson1c_defconfig @@ -20,7 +20,6 @@ CONFIG_LOONGSON1_LS1C=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_NET=y diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig index 7befe05fd813..ed1038f62a2c 100644 --- a/arch/mips/configs/rb532_defconfig +++ b/arch/mips/configs/rb532_defconfig @@ -19,7 +19,6 @@ CONFIG_PCI=y # CONFIG_PCI_QUIRKS is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y CONFIG_MAC_PARTITION=y diff --git a/arch/mips/configs/rbtx49xx_defconfig b/arch/mips/configs/rbtx49xx_defconfig index 50a2c9ad583f..b0f0c5f9ad9d 100644 --- a/arch/mips/configs/rbtx49xx_defconfig +++ b/arch/mips/configs/rbtx49xx_defconfig @@ -17,7 
+17,6 @@ CONFIG_TOSHIBA_RBTX4938_MPLEX_KEEP=y CONFIG_PCI=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index 37ae4b57c001..a8f9bbef0975 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -14,7 +14,6 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_PA7100LC=y CONFIG_SMP=y diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index 825c641726c4..d0d9ebc7165b 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -19,7 +19,6 @@ CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_CFQ_GROUP_IOSCHED=y CONFIG_CPU_SUBTYPE_SH7786=y diff --git a/arch/sh/configs/ecovec24-romimage_defconfig b/arch/sh/configs/ecovec24-romimage_defconfig index 0c5dfccbfe37..bdb61d1d0127 100644 --- a/arch/sh/configs/ecovec24-romimage_defconfig +++ b/arch/sh/configs/ecovec24-romimage_defconfig @@ -7,7 +7,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_KALLSYMS is not set CONFIG_SLAB=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH7724=y CONFIG_MEMORY_SIZE=0x10000000 diff --git a/arch/sh/configs/rsk7264_defconfig b/arch/sh/configs/rsk7264_defconfig index 2b9b731fc86b..ad003ee469ea 100644 --- a/arch/sh/configs/rsk7264_defconfig +++ b/arch/sh/configs/rsk7264_defconfig @@ -16,7 +16,6 @@ CONFIG_PERF_COUNTERS=y CONFIG_SLAB=y CONFIG_MMAP_ALLOW_UNINITIALIZED=y CONFIG_PROFILING=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y # CONFIG_IOSCHED_DEADLINE is not set diff --git a/arch/sh/configs/rsk7269_defconfig b/arch/sh/configs/rsk7269_defconfig index d041f7bcb84c..27fc01d58cf8 100644 --- a/arch/sh/configs/rsk7269_defconfig +++ b/arch/sh/configs/rsk7269_defconfig @@ -3,7 +3,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set CONFIG_SLAB=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig index 2ddf5ca7094e..a89ccc15af23 100644 --- a/arch/sh/configs/sh7785lcr_32bit_defconfig +++ b/arch/sh/configs/sh7785lcr_32bit_defconfig @@ -11,7 +11,6 @@ CONFIG_PROFILING=y CONFIG_GCOV_KERNEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH7785=y CONFIG_MEMORY_START=0x40000000 diff --git a/block/Kconfig b/block/Kconfig index 028bc085dac8..1b220101a9cb 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -26,30 +26,6 @@ menuconfig BLOCK if BLOCK -config LBDAF - bool "Support for large (2TB+) block devices and files" - depends on !64BIT - default y - help - Enable block devices or files of size 2TB and larger. - - This option is required to support the full capacity of large - (2TB+) block devices, including RAID, disk, Network Block Device, - Logical Volume Manager (LVM) and loopback. - - This option also enables support for single files larger than - 2TB. 
- - The ext4 filesystem requires that this feature be enabled in - order to support filesystems that have the huge_file feature - enabled. Otherwise, it will refuse to mount in the read-write - mode any filesystems that use the huge_file feature, which is - enabled by default by mke2fs.ext4. - - The GFS2 filesystem also requires this feature. - - If unsure, say Y. - config BLK_SCSI_REQUEST bool diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 000a2f4c0e92..acd7af3630e9 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1317,10 +1317,6 @@ struct bm_extent { #define DRBD_MAX_SECTORS_FIXED_BM \ ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) -#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 -#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 -#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 -#else #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM /* 16 TB in units of sectors */ #if BITS_PER_LONG == 32 @@ -1333,7 +1329,6 @@ struct bm_extent { #define DRBD_MAX_SECTORS_FLEX (1UL << 51) /* corresponds to (1UL << 38) bits right now. */ #endif -#endif /* Estimate max bio size as 256 * PAGE_SIZE, * so for typical PAGE_SIZE of 4k, that is (1<<20) Byte. diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 4e1d9b31f60c..cc61c5ce3ad5 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -102,7 +102,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, rq_for_each_segment(bvec, req, iter) { unsigned long flags; - dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %lu\n", + dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %llu\n", __func__, __LINE__, i, bio_sectors(iter.bio), iter.bio->bi_iter.bi_sector); @@ -496,7 +496,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) dev->regions[dev->region_idx].size*priv->blocking_factor); dev_info(&dev->sbd.core, - "%s is a %s (%llu MiB total, %lu MiB for OtherOS)\n", + "%s is a %s (%llu MiB total, %llu MiB for OtherOS)\n", gendisk->disk_name, priv->model, priv->raw_capacity >> 11, get_capacity(gendisk) >> 11); diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 12b5216c2cfe..721efc493942 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -135,9 +135,8 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); /* * Funtions to manipulate consecutive chunks */ -# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) -# define DM_CHUNK_CONSECUTIVE_BITS 8 -# define DM_CHUNK_NUMBER_BITS 56 +#define DM_CHUNK_CONSECUTIVE_BITS 8 +#define DM_CHUNK_NUMBER_BITS 56 static inline chunk_t dm_chunk_number(chunk_t chunk) { @@ -163,29 +162,6 @@ static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); } -# else -# define DM_CHUNK_CONSECUTIVE_BITS 0 - -static inline chunk_t dm_chunk_number(chunk_t chunk) -{ - return chunk; -} - -static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) -{ - return 0; -} - -static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) -{ -} - -static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) -{ -} - -# endif - /* * Return the number of sectors in the device. 
*/ diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index d57d997a52c8..0eb56ba89a7f 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -88,14 +88,10 @@ struct journal_entry { #if BITS_PER_LONG == 64 #define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0) -#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) -#elif defined(CONFIG_LBDAF) -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) -#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #else -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0) -#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) #endif +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) #define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) #define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) diff --git a/drivers/md/md.c b/drivers/md/md.c index d0f688399a56..1fa2682951f1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1106,8 +1106,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ - if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && - sb->level >= 1) + if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) @@ -1405,8 +1404,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. 
*/ - if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && - rdev->mddev->level >= 1) + if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index d271bd731af7..01f40672507f 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -391,7 +391,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) bb_present = badblocks_check(&nd_region->bb, meta_start, meta_num, &first_bad, &num_bad); if (bb_present) { - dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %lx\n", + dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %llx\n", num_bad, first_bad); nsoff = ALIGN_DOWN((nd_region->ndr_start + (first_bad << 9)) - nsio->res.start, @@ -410,7 +410,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) } if (rc) { dev_err(&nd_pfn->dev, - "error clearing %x badblocks at %lx\n", + "error clearing %x badblocks at %llx\n", num_bad, first_bad); return rc; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 2b2bc4b49d78..92c34d93e051 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2256,22 +2256,6 @@ static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, #define READ_CAPACITY_RETRIES_ON_RESET 10 -/* - * Ensure that we don't overflow sector_t when CONFIG_LBDAF is not set - * and the reported logical block size is bigger than 512 bytes. Note - * that last_sector is a u64 and therefore logical_to_sectors() is not - * applicable. - */ -static bool sd_addressable_capacity(u64 lba, unsigned int sector_size) -{ - u64 last_sector = (lba + 1ULL) << (ilog2(sector_size) - 9); - - if (sizeof(sector_t) == 4 && last_sector > U32_MAX) - return false; - - return true; -} - static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, unsigned char *buffer) { @@ -2337,14 +2321,6 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, return -ENODEV; } - if (!sd_addressable_capacity(lba, sector_size)) { - sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a " - "kernel compiled with support for large block " - "devices.\n"); - sdkp->capacity = 0; - return -EOVERFLOW; - } - /* Logical blocks per physical block exponent */ sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size; @@ -2426,14 +2402,6 @@ static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp, return sector_size; } - if (!sd_addressable_capacity(lba, sector_size)) { - sd_printk(KERN_ERR, sdkp, "Too big for this kernel. 
Use a " - "kernel compiled with support for large block " - "devices.\n"); - sdkp->capacity = 0; - return -EOVERFLOW; - } - sdkp->capacity = lba + 1; sdkp->physical_block_size = sector_size; return sector_size; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e7ae26e36c9c..38faf661e237 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1760,8 +1760,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_ERR, "filesystem too large to resize to %llu blocks safely", n_blocks_count); - if (sizeof(sector_t) < 8) - ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6ed4eb81e674..d10e9e724bdd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2706,13 +2706,9 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; - /* small i_blocks in vfs inode? */ - if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { - /* - * CONFIG_LBDAF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 - */ + BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); + + if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ @@ -2753,11 +2749,11 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) * number of 512-byte sectors of the file. */ - if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files) { /* - * !has_huge_files or CONFIG_LBDAF not enabled implies that - * the inode i_block field represents total file blocks in - * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 + * !has_huge_files or implies that the inode i_block field + * represents total file blocks in 2^32 512-byte sectors == + * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -2897,18 +2893,6 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } - /* - * Large file size enabled file system can only be mounted - * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF - */ - if (ext4_has_feature_huge_file(sb)) { - if (sizeof(blkcnt_t) < sizeof(u64)) { - ext4_msg(sb, KERN_ERR, "Filesystem with huge files " - "cannot be mounted RDWR without " - "CONFIG_LBDAF"); - return 0; - } - } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " @@ -4057,8 +4041,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); - if (sizeof(sector_t) < 8) - ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); goto failed_mount; } diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 3ed2b088dcfd..6a1e499543f5 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,5 @@ config GFS2_FS tristate "GFS2 file system support" - depends on (64BIT || LBDAF) select FS_POSIX_ACL select CRC32 select LIBCRC32C diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 5f93cfacb3d1..69d02cf8cf37 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -121,7 +121,6 @@ config PNFS_FILE_LAYOUT config PNFS_BLOCK tristate depends on NFS_V4_1 && BLK_DEV_DM - depends on 64BIT || LBDAF default NFS_V4 config PNFS_FLEXFILE_LAYOUT diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 96ae7cedd487..fc3d29eceb2f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -600,7 +600,6 @@ static 
unsigned long long ocfs2_max_file_offset(unsigned int bbits, */ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBDAF) BUILD_BUG_ON(sizeof(sector_t) != 8); /* * We might be limited by page cache size. @@ -614,15 +613,6 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, */ bitshift = 31; } -# else - /* - * We are limited by the size of sector_t. Use block size, as - * that's what we expose to the VFS. - */ - bytes = 1 << bbits; - trim = 1; - bitshift = 31; -# endif #endif /* diff --git a/fs/stack.c b/fs/stack.c index a54e33ed10f1..664ed35558bd 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -21,11 +21,10 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) i_size = i_size_read(src); /* - * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to - * keep the two halves of i_blocks in sync despite SMP or PREEMPT - - * though stat's generic_fillattr() doesn't bother, and we won't be - * applying quotas (where i_blocks does become important) at the - * upper level. + * But on 32-bit, we ought to make an effort to keep the two halves of + * i_blocks in sync despite SMP or PREEMPT - though stat's + * generic_fillattr() doesn't bother, and we won't be applying quotas + * (where i_blocks does become important) at the upper level. * * We don't actually know what locking is used at the lower level; * but if it's a filesystem that supports quotas, it will be using @@ -44,9 +43,9 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * include/linux/fs.h). We don't necessarily hold i_mutex when this * is called, so take i_lock for that case. * - * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the - * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock - * for that case too, and do both at once by combining the tests. + * And if on 32-bit, continue our effort to keep the two halves of + * i_blocks in sync despite SMP or PREEMPT: use i_lock for that case + * too, and do both at once by combining the tests. * * There is none of this locking overhead in the 64-bit case. */ diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 457ac9f97377..99af5e5bda9f 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -1,7 +1,6 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK - depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C select FS_IOMAP diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f093ea244849..703b6be063ef 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -539,26 +539,18 @@ xfs_max_file_offset( /* Figure out maximum filesize, on Linux this can depend on * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long... + * __block_write_begin does this in an [unsigned] long long... * page->index << (PAGE_SHIFT - bbits) * So, for page sized blocks (4K on 32 bit platforms), * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) * but for smaller blocksizes it is less (bbits = log2 bsize). - * Note1: get_block_t takes a long (implicit cast from above) - * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch - * can optionally convert the [unsigned] long from above into - * an [unsigned] long long. 
*/ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBDAF) ASSERT(sizeof(sector_t) == 8); pagefactor = PAGE_SIZE; bitshift = BITS_PER_LONG; -# else - pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift); -# endif #endif return (((uint64_t)pagefactor) << bitshift) - 1; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 06c0fd594097..98076b1b5e48 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -714,7 +714,7 @@ static inline void hd_free_part(struct hd_struct *part) */ static inline sector_t part_nr_sects_read(struct hd_struct *part) { -#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) sector_t nr_sects; unsigned seq; do { @@ -722,7 +722,7 @@ static inline sector_t part_nr_sects_read(struct hd_struct *part) nr_sects = part->nr_sects; } while (read_seqcount_retry(&part->nr_sects_seq, seq)); return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) sector_t nr_sects; preempt_disable(); @@ -741,11 +741,11 @@ static inline sector_t part_nr_sects_read(struct hd_struct *part) */ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) { -#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) write_seqcount_begin(&part->nr_sects_seq); part->nr_sects = size; write_seqcount_end(&part->nr_sects_seq); -#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) preempt_disable(); part->nr_sects = size; preempt_enable(); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 34a5036debd3..24ef5a018a5e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -17,6 +17,7 @@ #include #include #include +#include #define STACK_MAGIC 0xdeadbeef @@ -175,18 +176,7 @@ #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) -#ifdef CONFIG_LBDAF -# define sector_div(a, b) do_div(a, b) -#else -# define sector_div(n, b)( \ -{ \ - int _res; \ - _res = (n) % (b); \ - (n) /= (b); \ - _res; \ -} \ -) -#endif +#define sector_div(a, b) do_div(a, b) /** * upper_32_bits - return bits 32-63 of a number diff --git a/include/linux/types.h b/include/linux/types.h index cc0dbbe551d5..231114ae38f4 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -127,13 +127,8 @@ typedef s64 int64_t; * * blkcnt_t is the type of the inode's block count. */ -#ifdef CONFIG_LBDAF typedef u64 sector_t; typedef u64 blkcnt_t; -#else -typedef unsigned long sector_t; -typedef unsigned long blkcnt_t; -#endif /* * The type of an index into the pagecache. 
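With CONFIG_LBDAF removed, sector_t and blkcnt_t above are u64 in every configuration, and sector_div() always expands to do_div(). A minimal userspace sketch of the unified semantics — standard C types stand in for the kernel's, and sector_div_model() is an illustrative name, not the kernel macro:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's sector_t, now unconditionally u64. */
typedef uint64_t sector_t;

/*
 * Model of the surviving sector_div()/do_div() semantics: divide *n in
 * place by base and return the remainder. The kernel macro evaluates its
 * first argument as an lvalue; a pointer makes that explicit here.
 */
static uint32_t sector_div_model(sector_t *n, uint32_t base)
{
	uint32_t rem = (uint32_t)(*n % base);

	*n /= base;
	return rem;
}

int main(void)
{
	/* A 3 TiB device in 512-byte sectors: well above 2^32. */
	sector_t nr_sects = 3ULL << 31;
	uint32_t rem = sector_div_model(&nr_sects, 255);

	printf("quotient=%llu remainder=%u\n",
	       (unsigned long long)nr_sects, rem);
	return 0;
}

The narrow unsigned-long variant that the old #else branch of sector_div() open-coded simply no longer exists; 32-bit kernels now always perform the 64-bit division.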
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0d9e81779e37..d8781786cf63 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1927,7 +1927,6 @@ config TEST_STATIC_KEYS config TEST_KMOD tristate "kmod stress tester" depends on m - depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS depends on NETDEVICES && NET_CORE && INET # for TUN select TEST_LKM select XFS_FS diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h index d27285f8ee82..8bc960e5e713 100644 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h @@ -59,11 +59,7 @@ typedef __u32 uint32_t; * * blkcnt_t is the type of the inode's block count. */ -#ifdef CONFIG_LBDAF typedef u64 sector_t; -#else -typedef unsigned long sector_t; -#endif /* * The type of an index into the pagecache. -- cgit v1.2.3-59-g8ed1b From 863a0618226eb0d802e7fd830135a947bae5d2fa Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Thu, 21 Mar 2019 09:28:36 -0500 Subject: ARM: dts: omap4-droid4: Update backlight dt properties Update the properties for the lm3532 device node for droid4. With this change the backlight LED string and the keypad LED strings will be controlled separately. Signed-off-by: Dan Murphy Acked-by: Tony Lindgren [tony@atomide.com: remove the line "backlight = <&lcd_backlight>"] Signed-off-by: Tony Lindgren Signed-off-by: Jacek Anaszewski --- arch/arm/boot/dts/omap4-droid4-xt894.dts | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/omap4-droid4-xt894.dts b/arch/arm/boot/dts/omap4-droid4-xt894.dts index e21ec929f096..714863f8f261 100644 --- a/arch/arm/boot/dts/omap4-droid4-xt894.dts +++ b/arch/arm/boot/dts/omap4-droid4-xt894.dts @@ -214,7 +214,6 @@ width-mm = <50>; height-mm = <89>; - backlight = <&lcd_backlight>; panel-timing { clock-frequency = <0>; /* Calculated by dsi */ @@ -383,20 +382,30 @@ }; &i2c1 { - lm3532@38 { + led-controller@38 { compatible = "ti,lm3532"; + #address-cells = <1>; + #size-cells = <0>; reg = <0x38>; enable-gpios = <&gpio6 12 GPIO_ACTIVE_HIGH>; - lcd_backlight: backlight { - compatible = "ti,lm3532-backlight"; + ramp-up-us = <1024>; + ramp-down-us = <8193>; - lcd { - led-sources = <0 1 2>; - ramp-up-msec = <1>; - ramp-down-msec = <0>; - }; + led@0 { + reg = <0>; + led-sources = <2>; + ti,led-mode = <0>; + label = ":backlight"; + linux,default-trigger = "backlight"; + }; + + led@1 { + reg = <1>; + led-sources = <1>; + ti,led-mode = <0>; + label = ":kbd_backlight"; }; }; }; -- cgit v1.2.3-59-g8ed1b From dec3d0b1071a0f3194e66a83d26ecf4aa8c5910e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 31 Mar 2019 13:04:13 -0700 Subject: crypto: x86/crct10dif-pcl - fix use via crypto_shash_digest() The ->digest() method of crct10dif-pclmul reads the current CRC value from the shash_desc context. But this value is uninitialized, causing crypto_shash_digest() to compute the wrong result. Fix it. Probably this wasn't noticed before because lib/crc-t10dif.c only uses crypto_shash_update(), not crypto_shash_digest(). Likewise, crypto_shash_digest() is not yet tested by the crypto self-tests because those only test the ahash API which only uses shash init/update/final. 
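For readers unfamiliar with the shash contract: ->digest() is a one-shot operation and must produce the same result as init() + update() + final(), without assuming anything about the per-request context. A toy userspace model of the bug shape — an arbitrary seeded checksum stands in for crc_t10dif_generic(), and all names here are illustrative:

#include <stdint.h>
#include <stdio.h>

/*
 * Toy stand-in for crc_t10dif_generic(): any seeded running checksum
 * reproduces the bug shape; the real CRC polynomial is irrelevant here.
 */
static uint16_t toy_csum(uint16_t seed, const uint8_t *data, size_t len)
{
	while (len--)
		seed = (uint16_t)(seed * 31 + *data++);
	return seed;
}

struct desc_ctx {
	uint16_t crc;	/* models the shash_desc_ctx() state */
};

static void chksum_init(struct desc_ctx *ctx)
{
	ctx->crc = 0;
}

/* Buggy ->digest(): reads ctx->crc although ->init() never ran. */
static uint16_t digest_broken(const struct desc_ctx *ctx,
			      const uint8_t *data, size_t len)
{
	return toy_csum(ctx->crc, data, len);
}

/* Fixed ->digest(): one-shot, so seed from 0 and ignore the context. */
static uint16_t digest_fixed(const uint8_t *data, size_t len)
{
	return toy_csum(0, data, len);
}

int main(void)
{
	const uint8_t msg[] = "data";
	const size_t n = sizeof(msg) - 1;
	struct desc_ctx stale = { .crc = 0xbeef };	/* "uninitialized" */
	struct desc_ctx ok;

	chksum_init(&ok);
	printf("init+update+final: %04x\n", (unsigned)toy_csum(ok.crc, msg, n));
	printf("broken digest:     %04x\n", (unsigned)digest_broken(&stale, msg, n));
	printf("fixed digest:      %04x\n", (unsigned)digest_fixed(msg, n));
	return 0;
}

The fix below applies exactly this: chksum_digest() now calls __chksum_finup() with a literal 0 rather than dereferencing the never-initialized ctx->crc.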
Fixes: 0b95a7f85718 ("crypto: crct10dif - Glue code to cast accelerated CRCT10DIF assembly as a crypto transform") Cc: # v3.11+ Cc: Tim Chen Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/crct10dif-pclmul_glue.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 64f17d2d8b67..3c81e15b0873 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -71,15 +71,14 @@ static int chksum_final(struct shash_desc *desc, u8 *out) return 0; } -static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, - u8 *out) +static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out) { if (len >= 16 && crypto_simd_usable()) { kernel_fpu_begin(); - *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + *(__u16 *)out = crc_t10dif_pcl(crc, data, len); kernel_fpu_end(); } else - *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + *(__u16 *)out = crc_t10dif_generic(crc, data, len); return 0; } @@ -88,15 +87,13 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - return __chksum_finup(&ctx->crc, data, len, out); + return __chksum_finup(ctx->crc, data, len, out); } static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) { - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(&ctx->crc, data, length, out); + return __chksum_finup(0, data, length, out); } static struct shash_alg alg = { -- cgit v1.2.3-59-g8ed1b From f6e9af87661530e60d9faf1d96675e8f22127aa8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 31 Mar 2019 13:04:21 -0700 Subject: crypto: arm64/cbcmac - handle empty messages in same way as template My patches to make testmgr fuzz algorithms against their generic implementation detected that the arm64 implementations of "cbcmac(aes)" handle empty messages differently from the cbcmac template. Namely, the arm64 implementations return the encrypted initial value, but the cbcmac template returns the initial value directly. This isn't actually a meaningful case because any user of cbcmac needs to prepend the message length, as CCM does; otherwise it's insecure. However, we should keep the behavior consistent; at the very least this makes testing easier. Do it the easy way, which is to change the arm64 implementations to have the same behavior as the cbcmac template. For what it's worth, ghash does things essentially the same way: it returns its initial value when given an empty message, even though in practice ghash is never passed an empty message. 
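In CBC-MAC terms: for an empty message the generic template emits the running digest (the all-zero initial value) untouched, while the arm64 code unconditionally ran one final block encryption. A compact userspace sketch of the corrected control flow — a XOR "cipher" stands in for AES, since only the branch on the buffered length matters:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLK 16

/*
 * Toy 16-byte "block cipher": XOR with the key. A real cbcmac runs AES;
 * the point here is purely the control flow around the final block.
 */
static void toy_encrypt(const uint8_t key[BLK], uint8_t blk[BLK])
{
	for (int i = 0; i < BLK; i++)
		blk[i] ^= key[i];
}

/*
 * Final step of a cbcmac computation: dg is the running digest, len the
 * number of bytes seen. Mirroring the fix, the final encryption runs
 * only for a non-empty message, so an empty message returns the initial
 * value untouched, exactly as the generic cbcmac template does.
 */
static void cbcmac_final_model(const uint8_t key[BLK], uint8_t dg[BLK],
			       size_t len, uint8_t out[BLK])
{
	if (len != 0)
		toy_encrypt(key, dg);
	memcpy(out, dg, BLK);
}

int main(void)
{
	uint8_t key[BLK] = { 0xaa }, dg[BLK] = { 0 }, mac[BLK];

	cbcmac_final_model(key, dg, 0, mac);	/* empty message */
	printf("empty-message MAC byte 0: %02x (initial value preserved)\n",
	       mac[0]);
	return 0;
}

This mirrors the one-line change below, where the encrypt-before argument to mac_do_update() becomes (ctx->len != 0).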
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 692cb75f2ca2..f0ceb545bd1e 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -707,7 +707,7 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out) struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0); + mac_do_update(&tctx->key, NULL, 0, ctx->dg, (ctx->len != 0), 0); memcpy(out, ctx->dg, AES_BLOCK_SIZE); -- cgit v1.2.3-59-g8ed1b From 12f2639038ef420fe796171ffb810b30d1ac0619 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 5 Apr 2019 21:46:12 +0200 Subject: tracing: stop making gpio tracing configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpio tracing was made configurable in 4.4-rc1 (commit ddd70280bf0e ("tracing: gpio: Add Kconfig option for enabling/disabling trace events")). Since then it is the only event type that can be compiled conditionally. Given that there is only little overhead I don't understand the reasoning and I was annoyed more than once that gpio events were not available without recompiling. So drop the Kconfig symbol and make gpio events available unconditionally. Signed-off-by: Uwe Kleine-König Signed-off-by: Linus Walleij --- arch/arm/configs/aspeed_g4_defconfig | 1 - arch/arm/configs/aspeed_g5_defconfig | 1 - include/trace/events/gpio.h | 4 ---- kernel/trace/Kconfig | 7 ------- 4 files changed, 13 deletions(-) (limited to 'arch') diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig index 1446262921b4..0f3638dd576b 100644 --- a/arch/arm/configs/aspeed_g4_defconfig +++ b/arch/arm/configs/aspeed_g4_defconfig @@ -248,7 +248,6 @@ CONFIG_PANIC_TIMEOUT=-1 # CONFIG_SCHED_DEBUG is not set CONFIG_SCHED_STACK_END_CHECK=y CONFIG_FUNCTION_TRACER=y -# CONFIG_TRACING_EVENTS_GPIO is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_DEBUG_WX=y CONFIG_DEBUG_USER=y diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 02fa3a41add5..eaab776c4ac2 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -248,7 +248,6 @@ CONFIG_PANIC_TIMEOUT=-1 # CONFIG_SCHED_DEBUG is not set CONFIG_SCHED_STACK_END_CHECK=y CONFIG_FUNCTION_TRACER=y -# CONFIG_TRACING_EVENTS_GPIO is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_DEBUG_WX=y CONFIG_DEBUG_USER=y diff --git a/include/trace/events/gpio.h b/include/trace/events/gpio.h index 5c189a22c489..3aa9fd86d748 100644 --- a/include/trace/events/gpio.h +++ b/include/trace/events/gpio.h @@ -2,10 +2,6 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM gpio -#ifndef CONFIG_TRACING_EVENTS_GPIO -#define NOTRACE -#endif - #if !defined(_TRACE_GPIO_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_GPIO_H diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8bd1d6d001d7..5d965cef6c77 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -774,13 +774,6 @@ config TRACE_EVAL_MAP_FILE If unsure, say N -config TRACING_EVENTS_GPIO - bool "Trace gpio events" - depends on GPIOLIB - default y - help - Enable tracing events for gpio subsystem - config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL -- cgit v1.2.3-59-g8ed1b From 
5ab6a91a1cacd827382897a80425df8a2d27744a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Dec 2018 14:56:38 +0100 Subject: arm: use a dummy struct device for ISA DMA use of the DMA API This gets rid of the last NULL dev argument passed to the DMA API. Signed-off-by: Christoph Hellwig --- arch/arm/kernel/dma-isa.c | 8 +++++++- arch/arm/mach-rpc/dma.c | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/kernel/dma-isa.c b/arch/arm/kernel/dma-isa.c index 84363fe7bad2..10c45cc6b957 100644 --- a/arch/arm/kernel/dma-isa.c +++ b/arch/arm/kernel/dma-isa.c @@ -55,6 +55,12 @@ static int isa_get_dma_residue(unsigned int chan, dma_t *dma) return chan < 4 ? count : (count << 1); } +static struct device isa_dma_dev = { + .init_name = "fallback device", + .coherent_dma_mask = ~(dma_addr_t)0, + .dma_mask = &isa_dma_dev.coherent_dma_mask, +}; + static void isa_enable_dma(unsigned int chan, dma_t *dma) { if (dma->invalid) { @@ -89,7 +95,7 @@ static void isa_enable_dma(unsigned int chan, dma_t *dma) dma->sg = &dma->buf; dma->sgcount = 1; dma->buf.length = dma->count; - dma->buf.dma_address = dma_map_single(NULL, + dma->buf.dma_address = dma_map_single(&isa_dma_dev, dma->addr, dma->count, direction); } diff --git a/arch/arm/mach-rpc/dma.c b/arch/arm/mach-rpc/dma.c index fb48f3141fb4..f2703ca17954 100644 --- a/arch/arm/mach-rpc/dma.c +++ b/arch/arm/mach-rpc/dma.c @@ -151,6 +151,12 @@ static void iomd_free_dma(unsigned int chan, dma_t *dma) free_irq(idma->irq, idma); } +static struct device isa_dma_dev = { + .init_name = "fallback device", + .coherent_dma_mask = ~(dma_addr_t)0, + .dma_mask = &isa_dma_dev.coherent_dma_mask, +}; + static void iomd_enable_dma(unsigned int chan, dma_t *dma) { struct iomd_dma *idma = container_of(dma, struct iomd_dma, dma); @@ -168,7 +174,7 @@ static void iomd_enable_dma(unsigned int chan, dma_t *dma) idma->dma.sg = &idma->dma.buf; idma->dma.sgcount = 1; idma->dma.buf.length = idma->dma.count; - idma->dma.buf.dma_address = dma_map_single(NULL, + idma->dma.buf.dma_address = dma_map_single(&isa_dma_dev, idma->dma.addr, idma->dma.count, idma->dma.dma_mode == DMA_MODE_READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE); -- cgit v1.2.3-59-g8ed1b From e43e2657fe77a37b13643e2469670ecdb0ba5e10 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Dec 2018 14:32:02 +0100 Subject: x86/dma: Remove the x86_dma_fallback_dev hack Now that we removed support for the NULL device argument in the DMA API, there is no need to cater for that in the x86 code. 
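The pattern being removed, in miniature: an API that silently substituted a global fallback object for a NULL device now requires the caller to supply a real one, as the ARM patch above does with its file-local isa_dma_dev. A userspace sketch with invented names, not the DMA API itself:

#include <stdio.h>

struct device {
	const char *init_name;
};

/* The global dummy the old code substituted for a NULL device. */
static struct device fallback_dev = { .init_name = "fallback device" };

/* Old convention: NULL was quietly redirected to the global dummy. */
static struct device *map_single_old(struct device *dev)
{
	if (!dev)
		dev = &fallback_dev;
	return dev;
}

/* New convention: the caller must pass a real device; no substitution. */
static struct device *map_single_new(struct device *dev)
{
	return dev;
}

int main(void)
{
	/* A file-local, statically initialized device, as in dma-isa.c. */
	struct device isa_dma_dev = { .init_name = "isa dma fallback" };

	printf("old, NULL dev -> %s\n", map_single_old(NULL)->init_name);
	printf("new           -> %s\n", map_single_new(&isa_dma_dev)->init_name);
	return 0;
}

Dropping the substitution lets the x86 change below delete both x86_dma_fallback_dev and the arch_dma_alloc_attrs() hook that existed only to install it.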
Signed-off-by: Christoph Hellwig --- arch/x86/include/asm/dma-mapping.h | 10 ---------- arch/x86/kernel/amd_gart_64.c | 6 ------ arch/x86/kernel/pci-dma.c | 20 -------------------- kernel/dma/mapping.c | 7 ------- 4 files changed, 43 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ce4d176b3d13..6b15a24930e0 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -13,14 +13,7 @@ #include #include -#ifdef CONFIG_ISA -# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) -#else -# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) -#endif - extern int iommu_merge; -extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; extern const struct dma_map_ops *dma_ops; @@ -30,7 +23,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return dma_ops; } -bool arch_dma_alloc_attrs(struct device **dev); -#define arch_dma_alloc_attrs arch_dma_alloc_attrs - #endif diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 2c0aa34af69c..bf7f13ea3c64 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -233,9 +233,6 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page, unsigned long bus; phys_addr_t paddr = page_to_phys(page) + offset; - if (!dev) - dev = &x86_dma_fallback_dev; - if (!need_iommu(dev, paddr, size)) return paddr; @@ -392,9 +389,6 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (nents == 0) return 0; - if (!dev) - dev = &x86_dma_fallback_dev; - out = 0; start = 0; start_sg = sg; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d460998ae828..dcd272dbd0a9 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -51,14 +51,6 @@ int iommu_pass_through __read_mostly; extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; -/* Dummy device used for NULL arguments (normally ISA). */ -struct device x86_dma_fallback_dev = { - .init_name = "fallback device", - .coherent_dma_mask = ISA_DMA_BIT_MASK, - .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, -}; -EXPORT_SYMBOL(x86_dma_fallback_dev); - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; @@ -77,18 +69,6 @@ void __init pci_iommu_alloc(void) } } -bool arch_dma_alloc_attrs(struct device **dev) -{ - if (!*dev) - *dev = &x86_dma_fallback_dev; - - if (!is_device_dma_capable(*dev)) - return false; - return true; - -} -EXPORT_SYMBOL(arch_dma_alloc_attrs); - /* * See for the iommu kernel * parameter documentation. 
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c000906348c9..685a53f2a793 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -238,10 +238,6 @@ u64 dma_get_required_mask(struct device *dev) } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev) (true) -#endif - void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { @@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (!arch_dma_alloc_attrs(&dev)) - return NULL; - if (dma_is_direct(ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) -- cgit v1.2.3-59-g8ed1b From d75f773c86a2b8b7278e2c33343b46a4024bc002 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 25 Mar 2019 21:32:28 +0200 Subject: treewide: Switch printk users from %pf and %pF to %ps and %pS, respectively %pF and %pf are functionally equivalent to %pS and %ps conversion specifiers. The former are deprecated, therefore switch the current users to use the preferred variant. The changes have been produced by the following command: git grep -l '%p[fF]' | grep -v '^\(tools\|Documentation\)/' | \ while read i; do perl -i -pe 's/%pf/%ps/g; s/%pF/%pS/g;' $i; done And verifying the result. Link: http://lkml.kernel.org/r/20190325193229.23390-1-sakari.ailus@linux.intel.com Cc: Andy Shevchenko Cc: linux-arm-kernel@lists.infradead.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: xen-devel@lists.xenproject.org Cc: linux-acpi@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvdimm@lists.01.org Cc: linux-pci@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-btrfs@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: linux-mm@kvack.org Cc: ceph-devel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Sakari Ailus Acked-by: David Sterba (for btrfs) Acked-by: Mike Rapoport (for mm/memblock.c) Acked-by: Bjorn Helgaas (for drivers/pci) Acked-by: Rafael J. 
Wysocki Signed-off-by: Petr Mladek --- arch/alpha/kernel/pci_iommu.c | 20 ++++++++++---------- arch/arm/mach-imx/pm-imx6.c | 2 +- arch/arm/mm/alignment.c | 2 +- arch/arm/nwfpe/fpmodule.c | 2 +- arch/microblaze/mm/pgtable.c | 2 +- arch/sparc/kernel/ds.c | 2 +- arch/um/kernel/sysrq.c | 2 +- arch/x86/include/asm/trace/exceptions.h | 2 +- arch/x86/kernel/irq_64.c | 2 +- arch/x86/mm/extable.c | 4 ++-- arch/x86/xen/multicalls.c | 2 +- drivers/acpi/device_pm.c | 2 +- drivers/base/power/main.c | 6 +++--- drivers/base/syscore.c | 12 ++++++------ drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/floppy.c | 10 +++++----- drivers/cpufreq/cpufreq.c | 2 +- drivers/mmc/core/quirks.h | 2 +- drivers/nvdimm/bus.c | 2 +- drivers/nvdimm/dimm_devs.c | 2 +- drivers/pci/pci-driver.c | 14 +++++++------- drivers/pci/quirks.c | 4 ++-- drivers/pnp/quirks.c | 2 +- drivers/scsi/esp_scsi.c | 2 +- fs/btrfs/tests/free-space-tree-tests.c | 4 ++-- fs/f2fs/f2fs.h | 2 +- fs/pstore/inode.c | 2 +- include/trace/events/btrfs.h | 2 +- include/trace/events/cpuhp.h | 4 ++-- include/trace/events/preemptirq.h | 2 +- include/trace/events/rcu.h | 4 ++-- include/trace/events/sunrpc.h | 2 +- include/trace/events/vmscan.h | 4 ++-- include/trace/events/workqueue.h | 4 ++-- include/trace/events/xen.h | 2 +- init/main.c | 6 +++--- kernel/async.c | 4 ++-- kernel/events/uprobes.c | 2 +- kernel/fail_function.c | 2 +- kernel/irq/debugfs.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/manage.c | 2 +- kernel/irq/spurious.c | 4 ++-- kernel/rcu/tree.c | 2 +- kernel/stop_machine.c | 2 +- kernel/time/sched_clock.c | 2 +- kernel/time/timer.c | 2 +- kernel/workqueue.c | 12 ++++++------ lib/error-inject.c | 2 +- lib/percpu-refcount.c | 4 ++-- mm/memblock.c | 14 +++++++------- mm/memory.c | 2 +- mm/vmscan.c | 2 +- net/ceph/osd_client.c | 2 +- net/core/net-procfs.c | 2 +- net/core/netpoll.c | 4 ++-- 56 files changed, 106 insertions(+), 106 deletions(-) (limited to 'arch') diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index aa0f50d0f823..4a2ae862b19e 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -237,7 +237,7 @@ static int pci_dac_dma_supported(struct pci_dev *dev, u64 mask) ok = 0; /* If both conditions above are met, we are fine. */ - DBGA("pci_dac_dma_supported %s from %pf\n", + DBGA("pci_dac_dma_supported %s from %ps\n", ok ? 
"yes" : "no", __builtin_return_address(0)); return ok; @@ -269,7 +269,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, && paddr + size <= __direct_map_size) { ret = paddr + __direct_map_base; - DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %ps\n", cpu_addr, size, ret, __builtin_return_address(0)); return ret; @@ -280,7 +280,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, if (dac_allowed) { ret = paddr + alpha_mv.pci_dac_offset; - DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %ps\n", cpu_addr, size, ret, __builtin_return_address(0)); return ret; @@ -317,7 +317,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, ret = arena->dma_base + dma_ofs * PAGE_SIZE; ret += (unsigned long)cpu_addr & ~PAGE_MASK; - DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %ps\n", cpu_addr, size, npages, ret, __builtin_return_address(0)); return ret; @@ -384,14 +384,14 @@ static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr, && dma_addr < __direct_map_base + __direct_map_size) { /* Nothing to do. */ - DBGA2("pci_unmap_single: direct [%llx,%zx] from %pf\n", + DBGA2("pci_unmap_single: direct [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); return; } if (dma_addr > 0xffffffff) { - DBGA2("pci64_unmap_single: DAC [%llx,%zx] from %pf\n", + DBGA2("pci64_unmap_single: DAC [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); return; } @@ -423,7 +423,7 @@ static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr, spin_unlock_irqrestore(&arena->lock, flags); - DBGA2("pci_unmap_single: sg [%llx,%zx] np %ld from %pf\n", + DBGA2("pci_unmap_single: sg [%llx,%zx] np %ld from %ps\n", dma_addr, size, npages, __builtin_return_address(0)); } @@ -446,7 +446,7 @@ try_again: cpu_addr = (void *)__get_free_pages(gfp | __GFP_ZERO, order); if (! cpu_addr) { printk(KERN_INFO "pci_alloc_consistent: " - "get_free_pages failed from %pf\n", + "get_free_pages failed from %ps\n", __builtin_return_address(0)); /* ??? Really atomic allocation? Otherwise we could play with vmalloc and sg if we can't find contiguous memory. 
*/ @@ -465,7 +465,7 @@ try_again: goto try_again; } - DBGA2("pci_alloc_consistent: %zx -> [%p,%llx] from %pf\n", + DBGA2("pci_alloc_consistent: %zx -> [%p,%llx] from %ps\n", size, cpu_addr, *dma_addrp, __builtin_return_address(0)); return cpu_addr; @@ -485,7 +485,7 @@ static void alpha_pci_free_coherent(struct device *dev, size_t size, pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL); free_pages((unsigned long)cpu_addr, get_order(size)); - DBGA2("pci_free_consistent: [%llx,%zx] from %pf\n", + DBGA2("pci_free_consistent: [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); } diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index 87f45b926c78..e67e0b2d4ce0 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -631,7 +631,7 @@ static void imx6_pm_stby_poweroff(void) static int imx6_pm_stby_poweroff_probe(void) { if (pm_power_off) { - pr_warn("%s: pm_power_off already claimed %p %pf!\n", + pr_warn("%s: pm_power_off already claimed %p %ps!\n", __func__, pm_power_off, pm_power_off); return -EBUSY; } diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index b54f8f8def36..e376883ab35b 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -133,7 +133,7 @@ static const char *usermode_action[] = { static int alignment_proc_show(struct seq_file *m, void *v) { seq_printf(m, "User:\t\t%lu\n", ai_user); - seq_printf(m, "System:\t\t%lu (%pF)\n", ai_sys, ai_sys_last_pc); + seq_printf(m, "System:\t\t%lu (%pS)\n", ai_sys, ai_sys_last_pc); seq_printf(m, "Skipped:\t%lu\n", ai_skipped); seq_printf(m, "Half:\t\t%lu\n", ai_half); seq_printf(m, "Word:\t\t%lu\n", ai_word); diff --git a/arch/arm/nwfpe/fpmodule.c b/arch/arm/nwfpe/fpmodule.c index 1365e8650843..ee34c76e6624 100644 --- a/arch/arm/nwfpe/fpmodule.c +++ b/arch/arm/nwfpe/fpmodule.c @@ -147,7 +147,7 @@ void float_raise(signed char flags) #ifdef CONFIG_DEBUG_USER if (flags & debug) printk(KERN_DEBUG - "NWFPE: %s[%d] takes exception %08x at %pf from %08lx\n", + "NWFPE: %s[%d] takes exception %08x at %ps from %08lx\n", current->comm, current->pid, flags, __builtin_return_address(0), GET_USERREG()->ARM_pc); #endif diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index c2ce1e42b888..8fe54fda31dc 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -75,7 +75,7 @@ static void __iomem *__ioremap(phys_addr_t addr, unsigned long size, p >= memory_start && p < virt_to_phys(high_memory) && !(p >= __virt_to_phys((phys_addr_t)__bss_stop) && p < __virt_to_phys((phys_addr_t)__bss_stop))) { - pr_warn("__ioremap(): phys addr "PTE_FMT" is RAM lr %pf\n", + pr_warn("__ioremap(): phys addr "PTE_FMT" is RAM lr %ps\n", (unsigned long)p, __builtin_return_address(0)); return NULL; } diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c index f87265afb175..cad08ccce625 100644 --- a/arch/sparc/kernel/ds.c +++ b/arch/sparc/kernel/ds.c @@ -876,7 +876,7 @@ void ldom_power_off(void) static void ds_conn_reset(struct ds_info *dp) { - printk(KERN_ERR "ds-%llu: ds_conn_reset() from %pf\n", + printk(KERN_ERR "ds-%llu: ds_conn_reset() from %ps\n", dp->id, __builtin_return_address(0)); } diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 6b995e870d55..05585eef11d9 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -20,7 +20,7 @@ static void _print_addr(void *data, unsigned long address, int reliable) { - pr_info(" [<%08lx>] %s%pF\n", address, reliable ? "" : "? 
", + pr_info(" [<%08lx>] %s%pS\n", address, reliable ? "" : "? ", (void *)address); } diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index e0e6d7f21399..6b1e87194809 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -30,7 +30,7 @@ DECLARE_EVENT_CLASS(x86_exceptions, __entry->error_code = error_code; ), - TP_printk("address=%pf ip=%pf error_code=0x%lx", + TP_printk("address=%ps ip=%ps error_code=0x%lx", (void *)__entry->address, (void *)__entry->ip, __entry->error_code) ); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..4dff56658427 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -58,7 +58,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pS)\n", current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, estack_top, estack_bottom, (void *)regs->ip); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 3c4568f8fb28..b0a2de8d2f9e 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -145,7 +145,7 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup unsigned long error_code, unsigned long fault_addr) { - if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", + if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) show_stack_regs(regs); @@ -162,7 +162,7 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup unsigned long error_code, unsigned long fault_addr) { - if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", + if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) show_stack_regs(regs); diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 0766a08bdf45..07054572297f 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -105,7 +105,7 @@ void xen_mc_flush(void) for (i = 0; i < b->mcidx; i++) { if (b->entries[i].result < 0) { #if MC_DEBUG - pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n", + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pS\n", i + 1, b->debug[i].op, b->debug[i].args[0], diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 824ae985ad93..1aa0d014dc34 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -414,7 +414,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used) if (adev->wakeup.flags.notifier_present) { pm_wakeup_ws_event(adev->wakeup.ws, 0, acpi_s2idle_wakeup()); if (adev->wakeup.context.func) { - acpi_handle_debug(handle, "Running %pF for %s\n", + acpi_handle_debug(handle, "Running %pS for %s\n", adev->wakeup.context.func, dev_name(adev->wakeup.context.dev)); adev->wakeup.context.func(&adev->wakeup.context); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5a8149829ab3..c88f56b9ae5b 100644 --- 
a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -205,7 +205,7 @@ static ktime_t initcall_debug_start(struct device *dev, void *cb) if (!pm_print_times_enabled) return 0; - dev_info(dev, "calling %pF @ %i, parent: %s\n", cb, + dev_info(dev, "calling %pS @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); @@ -223,7 +223,7 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime, rettime = ktime_get(); nsecs = (s64) ktime_to_ns(ktime_sub(rettime, calltime)); - dev_info(dev, "%pF returned %d after %Ld usecs\n", cb, error, + dev_info(dev, "%pS returned %d after %Ld usecs\n", cb, error, (unsigned long long)nsecs >> 10); } @@ -2062,7 +2062,7 @@ EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, void *fn, int ret) { if (ret) - printk(KERN_ERR "%s(): %pF returns %d\n", function, fn, ret); + printk(KERN_ERR "%s(): %pS returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index 6e076f359dcc..0d346a307140 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -62,19 +62,19 @@ int syscore_suspend(void) list_for_each_entry_reverse(ops, &syscore_ops_list, node) if (ops->suspend) { if (initcall_debug) - pr_info("PM: Calling %pF\n", ops->suspend); + pr_info("PM: Calling %pS\n", ops->suspend); ret = ops->suspend(); if (ret) goto err_out; WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", ops->suspend); + "Interrupts enabled after %pS\n", ops->suspend); } trace_suspend_resume(TPS("syscore_suspend"), 0, false); return 0; err_out: - pr_err("PM: System core suspend callback %pF failed.\n", ops->suspend); + pr_err("PM: System core suspend callback %pS failed.\n", ops->suspend); list_for_each_entry_continue(ops, &syscore_ops_list, node) if (ops->resume) @@ -100,10 +100,10 @@ void syscore_resume(void) list_for_each_entry(ops, &syscore_ops_list, node) if (ops->resume) { if (initcall_debug) - pr_info("PM: Calling %pF\n", ops->resume); + pr_info("PM: Calling %pS\n", ops->resume); ops->resume(); WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", ops->resume); + "Interrupts enabled after %pS\n", ops->resume); } trace_suspend_resume(TPS("syscore_resume"), 0, false); } @@ -122,7 +122,7 @@ void syscore_shutdown(void) list_for_each_entry_reverse(ops, &syscore_ops_list, node) if (ops->shutdown) { if (initcall_debug) - pr_info("PM: Calling %pF\n", ops->shutdown); + pr_info("PM: Calling %pS\n", ops->shutdown); ops->shutdown(); } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c7ad88d91a09..3e5fd97a3b4d 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -6116,7 +6116,7 @@ int drbd_ack_receiver(struct drbd_thread *thi) err = cmd->fn(connection, &pi); if (err) { - drbd_err(connection, "%pf failed\n", cmd->fn); + drbd_err(connection, "%ps failed\n", cmd->fn); goto reconnect; } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 95f608d1a098..49f89db0766f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1693,7 +1693,7 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) /* we don't even know which FDC is the culprit */ pr_info("DOR0=%x\n", fdc_state[0].dor); pr_info("floppy interrupt on bizarre fdc %d\n", fdc); - pr_info("handler=%pf\n", handler); + pr_info("handler=%ps\n", handler); is_alive(__func__, "bizarre fdc"); return IRQ_NONE; } @@ -1752,7 +1752,7 @@ 
static void reset_interrupt(void) debugt(__func__, ""); result(); /* get the status ready for set_fdc */ if (FDCS->reset) { - pr_info("reset set in interrupt, calling %pf\n", cont->error); + pr_info("reset set in interrupt, calling %ps\n", cont->error); cont->error(); /* a reset just after a reset. BAD! */ } cont->redo(); @@ -1793,7 +1793,7 @@ static void show_floppy(void) pr_info("\n"); pr_info("floppy driver state\n"); pr_info("-------------------\n"); - pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%pf\n", + pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%ps\n", jiffies, interruptjiffies, jiffies - interruptjiffies, lasthandler); @@ -1812,9 +1812,9 @@ static void show_floppy(void) pr_info("status=%x\n", fd_inb(FD_STATUS)); pr_info("fdc_busy=%lu\n", fdc_busy); if (do_floppy) - pr_info("do_floppy=%pf\n", do_floppy); + pr_info("do_floppy=%ps\n", do_floppy); if (work_pending(&floppy_work)) - pr_info("floppy_work.func=%pf\n", floppy_work.func); + pr_info("floppy_work.func=%ps\n", floppy_work.func); if (delayed_work_pending(&fd_timer)) pr_info("delayed work.function=%p expires=%ld\n", fd_timer.work.func, diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0e626b00053b..d87ebc2d2e68 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -432,7 +432,7 @@ static void cpufreq_list_transition_notifiers(void) mutex_lock(&cpufreq_transition_notifier_list.mutex); for (nb = cpufreq_transition_notifier_list.head; nb; nb = nb->next) - pr_info("%pF\n", nb->notifier_call); + pr_info("%pS\n", nb->notifier_call); mutex_unlock(&cpufreq_transition_notifier_list.mutex); } diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index dd2f73af8f2c..2d2d9ea8be4f 100644 --- a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -159,7 +159,7 @@ static inline void mmc_fixup_device(struct mmc_card *card, (f->ext_csd_rev == EXT_CSD_REV_ANY || f->ext_csd_rev == card->ext_csd.rev) && rev >= f->rev_start && rev <= f->rev_end) { - dev_dbg(&card->dev, "calling %pf\n", f->vendor_fixup); + dev_dbg(&card->dev, "calling %ps\n", f->vendor_fixup); f->vendor_fixup(card, f->data); } } diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 7bbff0af29b2..7ff684159f29 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -581,7 +581,7 @@ int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, struct device_driver *drv = &nd_drv->drv; if (!nd_drv->type) { - pr_debug("driver type bitmask not set (%pf)\n", + pr_debug("driver type bitmask not set (%ps)\n", __builtin_return_address(0)); return -EINVAL; } diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index efe412a6b5b9..06f5087547ea 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -53,7 +53,7 @@ static int validate_dimm(struct nvdimm_drvdata *ndd) rc = nvdimm_check_config_data(ndd->dev); if (rc) - dev_dbg(ndd->dev, "%pf: %s error: %d\n", + dev_dbg(ndd->dev, "%ps: %s error: %d\n", __builtin_return_address(0), __func__, rc); return rc; } diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 79b1610a8beb..11a877461584 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -578,7 +578,7 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state) if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: Device state not saved by %pF\n", + "PCI PM: Device state not 
saved by %pS\n", drv->suspend); } } @@ -605,7 +605,7 @@ static int pci_legacy_suspend_late(struct device *dev, pm_message_t state) if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: Device state not saved by %pF\n", + "PCI PM: Device state not saved by %pS\n", drv->suspend_late); goto Fixup; } @@ -773,7 +773,7 @@ static int pci_pm_suspend(struct device *dev) if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: State of device not saved by %pF\n", + "PCI PM: State of device not saved by %pS\n", pm->suspend); } } @@ -821,7 +821,7 @@ static int pci_pm_suspend_noirq(struct device *dev) if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: State of device not saved by %pF\n", + "PCI PM: State of device not saved by %pS\n", pm->suspend_noirq); goto Fixup; } @@ -1260,11 +1260,11 @@ static int pci_pm_runtime_suspend(struct device *dev) * log level. */ if (error == -EBUSY || error == -EAGAIN) { - dev_dbg(dev, "can't suspend now (%pf returned %d)\n", + dev_dbg(dev, "can't suspend now (%ps returned %d)\n", pm->runtime_suspend, error); return error; } else if (error) { - dev_err(dev, "can't suspend (%pf returned %d)\n", + dev_err(dev, "can't suspend (%ps returned %d)\n", pm->runtime_suspend, error); return error; } @@ -1276,7 +1276,7 @@ static int pci_pm_runtime_suspend(struct device *dev) && !pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: State of device not saved by %pF\n", + "PCI PM: State of device not saved by %pS\n", pm->runtime_suspend); return 0; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e2a879e93d86..56cd4c4a170c 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -36,7 +36,7 @@ static ktime_t fixup_debug_start(struct pci_dev *dev, void (*fn)(struct pci_dev *dev)) { if (initcall_debug) - pci_info(dev, "calling %pF @ %i\n", fn, task_pid_nr(current)); + pci_info(dev, "calling %pS @ %i\n", fn, task_pid_nr(current)); return ktime_get(); } @@ -51,7 +51,7 @@ static void fixup_debug_report(struct pci_dev *dev, ktime_t calltime, delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; if (initcall_debug || duration > 10000) - pci_info(dev, "%pF took %lld usecs\n", fn, duration); + pci_info(dev, "%pS took %lld usecs\n", fn, duration); } static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c index 803666ae3635..de99f371d362 100644 --- a/drivers/pnp/quirks.c +++ b/drivers/pnp/quirks.c @@ -458,7 +458,7 @@ void pnp_fixup_device(struct pnp_dev *dev) for (f = pnp_fixups; *f->id; f++) { if (!compare_pnp_id(dev->id, f->id)) continue; - pnp_dbg(&dev->dev, "%s: calling %pF\n", f->id, + pnp_dbg(&dev->dev, "%s: calling %pS\n", f->id, f->quirk_function); f->quirk_function(dev); } diff --git a/drivers/scsi/esp_scsi.c b/drivers/scsi/esp_scsi.c index 465df475f753..76fd02ccbf49 100644 --- a/drivers/scsi/esp_scsi.c +++ b/drivers/scsi/esp_scsi.c @@ -1031,7 +1031,7 @@ static int esp_check_spur_intr(struct esp *esp) static void esp_schedule_reset(struct esp *esp) { - esp_log_reset("esp_schedule_reset() from %pf\n", + esp_log_reset("esp_schedule_reset() from 
%ps\n", __builtin_return_address(0)); esp->flags |= ESP_FLAG_RESETTING; esp_event(esp, ESP_EVENT_RESET); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 89346da890cf..f7a969b986eb 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -539,7 +539,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 0, sectorsize, nodesize, alignment); if (ret) { test_err( - "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", + "%ps failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } @@ -547,7 +547,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 1, sectorsize, nodesize, alignment); if (ret) { test_err( - "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", + "%ps failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8f23ee6e8eb9..02d0cd199828 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1337,7 +1337,7 @@ struct f2fs_private_dio { #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(type) \ - printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \ + printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \ KERN_INFO, f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index c60ee46f3e39..29e94e0b6d73 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -115,7 +115,7 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off); - seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %ps <- %pS\n", pstore_ftrace_decode_cpu(rec), pstore_ftrace_read_timestamp(rec), rec->ip, rec->parent_ip, (void *)rec->ip, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index ab1cc33adbac..b9b7465be5eb 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1345,7 +1345,7 @@ DECLARE_EVENT_CLASS(btrfs__work, __entry->normal_work = &work->normal_work; ), - TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%pf ordered_func=%p " + TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%ps ordered_func=%p " "ordered_free=%p", __entry->work, __entry->normal_work, __entry->wq, __entry->func, __entry->ordered_func, __entry->ordered_free) diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h index fe1d6e8cd99d..ad16f77310c6 100644 --- a/include/trace/events/cpuhp.h +++ b/include/trace/events/cpuhp.h @@ -30,7 +30,7 @@ TRACE_EVENT(cpuhp_enter, __entry->fun = fun; ), - TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + TP_printk("cpu: %04u target: %3d step: %3d (%ps)", __entry->cpu, __entry->target, __entry->idx, __entry->fun) ); @@ -58,7 +58,7 @@ TRACE_EVENT(cpuhp_multi_enter, __entry->fun = fun; ), - TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + TP_printk("cpu: %04u target: %3d step: %3d (%ps)", __entry->cpu, __entry->target, __entry->idx, __entry->fun) ); diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 9a0d4ceeb166..95fba0471e5b 100644 --- a/include/trace/events/preemptirq.h +++ 
b/include/trace/events/preemptirq.h @@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(preemptirq_template, __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext); ), - TP_printk("caller=%pF parent=%pF", + TP_printk("caller=%pS parent=%pS", (void *)((unsigned long)(_stext) + __entry->caller_offs), (void *)((unsigned long)(_stext) + __entry->parent_offs)) ); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index f0c4d10e614b..80339fd14c1c 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -491,7 +491,7 @@ TRACE_EVENT(rcu_callback, __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%pf %ld/%ld", + TP_printk("%s rhp=%p func=%ps %ld/%ld", __entry->rcuname, __entry->rhp, __entry->func, __entry->qlen_lazy, __entry->qlen) ); @@ -587,7 +587,7 @@ TRACE_EVENT(rcu_invoke_callback, __entry->func = rhp->func; ), - TP_printk("%s rhp=%p func=%pf", + TP_printk("%s rhp=%p func=%ps", __entry->rcuname, __entry->rhp, __entry->func) ); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 0d5d0d91f861..7b8b987d332e 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(rpc_task_running, __entry->flags = task->tk_flags; ), - TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d action=%pf", + TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d action=%ps", __entry->task_id, __entry->client_id, __entry->flags, __entry->runstate, diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index a1cb91342231..252327dbfa51 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -226,7 +226,7 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->priority = priority; ), - TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", + TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, @@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->total_scan = total_scan; ), - TP_printk("%pF %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, __entry->nid, diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 9a761bc6a251..e172549283be 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -60,7 +60,7 @@ TRACE_EVENT(workqueue_queue_work, __entry->cpu = pwq->pool->cpu; ), - TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u", + TP_printk("work struct=%p function=%ps workqueue=%p req_cpu=%u cpu=%u", __entry->work, __entry->function, __entry->workqueue, __entry->req_cpu, __entry->cpu) ); @@ -102,7 +102,7 @@ TRACE_EVENT(workqueue_execute_start, __entry->function = work->func; ), - TP_printk("work struct %p: function %pf", __entry->work, __entry->function) + TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index fdcf88bcf0ea..9a0e8af21310 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -73,7 +73,7 @@ TRACE_EVENT(xen_mc_callback, __entry->fn = fn; __entry->data = data; ), - TP_printk("callback %pf, data %p", + TP_printk("callback %ps, data %p", __entry->fn, __entry->data) ); diff --git 
a/init/main.c b/init/main.c index c86a1c8f19f4..3192c6cc5382 100644 --- a/init/main.c +++ b/init/main.c @@ -826,7 +826,7 @@ trace_initcall_start_cb(void *data, initcall_t fn) { ktime_t *calltime = (ktime_t *)data; - printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); + printk(KERN_DEBUG "calling %pS @ %i\n", fn, task_pid_nr(current)); *calltime = ktime_get(); } @@ -840,7 +840,7 @@ trace_initcall_finish_cb(void *data, initcall_t fn, int ret) rettime = ktime_get(); delta = ktime_sub(rettime, *calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; - printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", + printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n", fn, ret, duration); } @@ -897,7 +897,7 @@ int __init_or_module do_one_initcall(initcall_t fn) strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); local_irq_enable(); } - WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); + WARN(msgbuf[0], "initcall %pS returned with %s\n", fn, msgbuf); add_latent_entropy(); return ret; diff --git a/kernel/async.c b/kernel/async.c index f6bd0d9885e1..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index affa830a198c..48abc0f18eae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (uc->handler) { rc = uc->handler(uc, regs); WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + "bad rc=0x%x from %ps()\n", rc, uc->handler); } if (uc->ret_handler) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v) { struct fei_attr *attr = list_entry(v, struct fei_attr, list); - seq_printf(m, "%pf\n", attr->kp.addr); + seq_printf(m, "%ps\n", attr->kp.addr); return 0; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); - seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, 
unsigned int *flags res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ec34a2a6638..ec43ab2fdfda 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -778,7 +778,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) ret = 0; break; default: - pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) */ raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", + printk(KERN_CONT " threaded [<%p>] %ps", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..8eee921b384d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2870,7 +2870,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); return; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ repeat: } preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..1002cf61700a 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); - pr_debug("Registered %pF as sched_clock source\n", read); + pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..6502c3ed317e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1328,7 +1328,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list lock_map_release(&lockdep_map); if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. 
That gives us a decent diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7abbeed13421..8ea9e4fb8cc6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2277,7 +2277,7 @@ __acquires(&pool->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %pf\n", + " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); @@ -2596,11 +2596,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, - "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), - "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } @@ -4582,7 +4582,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); @@ -4607,7 +4607,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %pf", comma ? "," : "", work->func); + pr_cont("%s %ps", comma ? "," : "", work->func); } } @@ -4639,7 +4639,7 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%pf", comma ? "," : "", + pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? 
"(RESCUER)" : "", worker->current_func); diff --git a/lib/error-inject.c b/lib/error-inject.c index c0d4600f4896..aa63751c916f 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -189,7 +189,7 @@ static int ei_seq_show(struct seq_file *m, void *v) { struct ei_entry *ent = list_entry(v, struct ei_entry, list); - seq_printf(m, "%pf\t%s\n", (void *)ent->start_addr, + seq_printf(m, "%ps\t%s\n", (void *)ent->start_addr, error_type_string(ent->etype)); return 0; } diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9877682e49c7..da54318d3b55 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -151,7 +151,7 @@ static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); WARN_ONCE(atomic_long_read(&ref->count) <= 0, - "percpu ref (%pf) <= 0 (%ld) after switching to atomic", + "percpu ref (%ps) <= 0 (%ld) after switching to atomic", ref->release, atomic_long_read(&ref->count)); /* @ref is viewed as dead on all CPUs, send out switch confirmation */ @@ -333,7 +333,7 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, - "%s called more than once on %pf!", __func__, ref->release); + "%s called more than once on %ps!", __func__, ref->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; __percpu_ref_switch_mode(ref, confirm_kill); diff --git a/mm/memblock.c b/mm/memblock.c index 470601115892..9db5e51babd2 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -701,7 +701,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_add: [%pa-%pa] %pF\n", + memblock_dbg("memblock_add: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); @@ -820,7 +820,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg(" memblock_free: [%pa-%pa] %pF\n", + memblock_dbg(" memblock_free: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); @@ -831,7 +831,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n", + memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); @@ -1466,7 +1466,7 @@ void * __init memblock_alloc_try_nid_raw( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); @@ -1502,7 +1502,7 @@ void * __init memblock_alloc_try_nid_nopanic( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); @@ -1538,7 +1538,7 @@ void * __init memblock_alloc_try_nid( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, @@ -1567,7 
+1567,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) phys_addr_t cursor, end; end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pF\n", + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); cursor = PFN_UP(base); diff --git a/mm/memory.c b/mm/memory.c index 47fe250307c7..3541a15067f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -519,7 +519,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", + pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, diff --git a/mm/vmscan.c b/mm/vmscan.c index a5ad0b35ab8e..90648187f622 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -493,7 +493,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan += delta; if (total_scan < 0) { - pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", + pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; next_deferred = nr; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index fa9530dd876e..6f739de28918 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2398,7 +2398,7 @@ static void finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + dout("%s req %p tid %llu cb %ps result %d\n", __func__, req, req->r_tid, req->r_callback, req->r_result); if (req->r_callback) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 63881f72ef71..36347933ec3a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -258,7 +258,7 @@ static int ptype_seq_show(struct seq_file *seq, void *v) else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-8s %pf\n", + seq_printf(seq, " %-8s %ps\n", pt->dev ? pt->dev->name : "", pt->func); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 361aabffb8c0..bf5446192d6a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -149,7 +149,7 @@ static void poll_one_napi(struct napi_struct *napi) * indicate that we are clearing the Tx path only. */ work = napi->poll(napi, 0); - WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll); + WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); @@ -346,7 +346,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", dev->name, dev->netdev_ops->ndo_start_xmit); } -- cgit v1.2.3-59-g8ed1b From 39ea9baffda91df8bfee9b45610242a3191ea1ec Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:30 +0200 Subject: x86/fpu: Remove fpu->initialized usage in __fpu__restore_sig() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a preparation for the removal of the ->initialized member in the fpu struct. 
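[ Ed. note: for orientation, an abridged sketch of the structure in question (v5.0-era arch/x86/include/asm/fpu/types.h; other fields elided, not part of the patch):

	struct fpu {
		unsigned int		last_cpu;	/* CPU holding the live register state */
		unsigned char		initialized;	/* the member this series removes */
		union fpregs_state	state;		/* in-memory register state */
	};
]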
__fpu__restore_sig() is deactivating the FPU via fpu__drop() and then manually setting ->initialized, followed by fpu__restore(). The result is that fpu->state can be manipulated in between: with ->initialized cleared, the register state is not saved/restored on a context switch, which would otherwise overwrite fpu->state: fpu__drop(fpu): ... fpu->initialized = 0; preempt_enable(); <--- context switch Don't access fpu->state while the content is read from user space and examined/sanitized. Use a temporary kmalloc() buffer for the preparation of the FPU registers and, once the state is considered okay, load it. Should something go wrong, return with an error and without altering the original FPU registers. The removal of fpu__initialize() is a nop because fpu->initialized is already set for the user task. [ bp: Massage a bit. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-2-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/signal.h | 2 +- arch/x86/kernel/fpu/regset.c | 5 ++--- arch/x86/kernel/fpu/signal.c | 40 +++++++++++++++------------------------ 3 files changed, 18 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 44bbc39a57b3..7fb516b6893a 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -22,7 +22,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, extern void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk); -extern void convert_to_fxsr(struct task_struct *tsk, +extern void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env); unsigned long diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index bc02f5144b95..5dbc099178a8 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -269,11 +269,10 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } -void convert_to_fxsr(struct task_struct *tsk, +void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) { - struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -350,7 +349,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); if (!ret) - convert_to_fxsr(target, &env); + convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); /* * update the header bit in the xsave header, indicating the diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 55b80de13ea5..7296a9bb78e7 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -207,11 +207,11 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) } static inline void -sanitize_restored_xstate(struct task_struct *tsk, +sanitize_restored_xstate(union fpregs_state *state, struct user_i387_ia32_struct *ia32_env, u64 xfeatures, int fx_only) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &state->xsave; struct
xstate_header *header = &xsave->header; if (use_xsave()) { @@ -238,7 +238,7 @@ sanitize_restored_xstate(struct task_struct *tsk, */ xsave->i387.mxcsr &= mxcsr_feature_mask; - convert_to_fxsr(tsk, ia32_env); + convert_to_fxsr(&state->fxsave, ia32_env); } } @@ -284,8 +284,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(buf, size)) return -EACCES; - fpu__initialize(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), @@ -315,40 +313,32 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * header. Validate and sanitize the copied state. */ struct user_i387_ia32_struct env; + union fpregs_state *state; int err = 0; + void *tmp; - /* - * Drop the current fpu which clears fpu->initialized. This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * fpu->initialized is again set. - */ - fpu__drop(fpu); + tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + state = PTR_ALIGN(tmp, 64); if (using_compacted_format()) { - err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + err = copy_user_to_xstate(&state->xsave, buf_fx); } else { - err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + err = __copy_from_user(&state->xsave, buf_fx, state_size); if (!err && state_size > offsetof(struct xregs_state, header)) - err = validate_xstate_header(&fpu->state.xsave.header); + err = validate_xstate_header(&state->xsave.header); } if (err || __copy_from_user(&env, buf, sizeof(env))) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); err = -1; } else { - sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + sanitize_restored_xstate(state, &env, xfeatures, fx_only); + copy_kernel_to_fpregs(state); } - local_bh_disable(); - fpu->initialized = 1; - fpu__restore(fpu); - local_bh_enable(); - + kfree(tmp); return err; } else { /* -- cgit v1.2.3-59-g8ed1b From 6dd677a044e606fd343e31c2108b13d74aec1ca5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:31 +0200 Subject: x86/fpu: Remove fpu__restore() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of fpu__restore() so it is time to remove it. The comment regarding fpu__restore() and TS bit is stale since commit b3b0870ef3ffe ("i387: do not preload FPU state at task switch time") and has no meaning since. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Babu Moger Cc: "Chang S. Bae" Cc: Dmitry Safonov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: Joerg Roedel Cc: Jonathan Corbet Cc: kvm ML Cc: linux-doc@vger.kernel.org Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-3-bigeasy@linutronix.de --- Documentation/preempt-locking.txt | 1 - arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/core.c | 24 ------------------------ arch/x86/kernel/process_32.c | 4 +--- arch/x86/kernel/process_64.c | 4 +--- 5 files changed, 2 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/Documentation/preempt-locking.txt b/Documentation/preempt-locking.txt index 509f5a422d57..dce336134e54 100644 --- a/Documentation/preempt-locking.txt +++ b/Documentation/preempt-locking.txt @@ -52,7 +52,6 @@ preemption must be disabled around such regions. Note, some FPU functions are already explicitly preempt safe. For example, kernel_fpu_begin and kernel_fpu_end will disable and enable preemption. -However, fpu__restore() must be called with preemption disabled. RULE #3: Lock acquire and release must be performed by same task diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fb04a3ded7dd..75a1d5f712ee 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -28,7 +28,6 @@ extern void fpu__initialize(struct fpu *fpu); extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); -extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2e5003fef51a..1d3ae7988f7f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -303,30 +303,6 @@ void fpu__prepare_write(struct fpu *fpu) } } -/* - * 'fpu__restore()' is called to copy FPU registers from - * the FPU fpstate to the live hw registers and to activate - * access to the hardware registers, so that FPU instructions - * can be used afterwards. - * - * Must be called with kernel preemption disabled (for example - * with local interrupts disabled, as it is in the case of - * do_device_not_available()). - */ -void fpu__restore(struct fpu *fpu) -{ - fpu__initialize(fpu); - - /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ - kernel_fpu_disable(); - trace_x86_fpu_before_restore(fpu); - fpregs_activate(fpu); - copy_kernel_to_fpregs(&fpu->state); - trace_x86_fpu_after_restore(fpu); - kernel_fpu_enable(); -} -EXPORT_SYMBOL_GPL(fpu__restore); - /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e471d8e6f0b2..7888a41a03cd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -267,9 +267,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before fpu__restore(), so the TS bit is up - * to date. + * the GDT and LDT are properly updated. 
*/ arch_end_context_switch(next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4af9fcf..e1983b3a16c4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -538,9 +538,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before - * loading segments that might reference them, and and it must - * be done before fpu__restore(), so the TS bit is up to - * date. + * loading segments that might reference them. */ arch_end_context_switch(next_p); -- cgit v1.2.3-59-g8ed1b From 60e528d6ce3f60a058bbb64f8acb2a07f84b172a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:32 +0200 Subject: x86/fpu: Remove preempt_disable() in fpu__clear() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preempt_disable() section was introduced in commit a10b6a16cdad8 ("x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic") and it was said to be temporary. fpu__initialize() initializes the FPU struct to its initial value and then sets ->initialized to 1. The last part is the important one. The content of the state does not matter because it gets set via copy_init_fpstate_to_fpregs(). A preemption here has little meaning because the registers will always be set to the same content after copy_init_fpstate_to_fpregs(). A softirq with a kernel_fpu_begin() could also force to save FPU's registers after fpu__initialize() without changing the outcome here. Remove the preempt_disable() section in fpu__clear(), preemption here does not hurt. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-4-bigeasy@linutronix.de --- arch/x86/kernel/fpu/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1d3ae7988f7f..1940319268ae 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -366,11 +366,9 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized. */ if (static_cpu_has(X86_FEATURE_FPU)) { - preempt_disable(); fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); - preempt_enable(); } } -- cgit v1.2.3-59-g8ed1b From 88f5260a3bf9bfb276b5b4aac2e81587e425a1d7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:33 +0200 Subject: x86/fpu: Always init the state in fpu__clear() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fpu__clear() only initializes the state if the CPU has FPU support. This initialisation is also required for FPU-less systems and takes place in math_emulate(). Since fpu__initialize() only performs the initialization if ->initialized is zero it does not matter that it is invoked each time an opcode is emulated. It makes the removal of ->initialized easier if the struct is also initialized in the FPU-less case at the same time. Move fpu__initialize() before the FPU feature check so it is also performed in the FPU-less case too. [ bp: Massage a bit. 
] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Bill Metzenthen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-5-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/core.c | 5 ++--- arch/x86/math-emu/fpu_entry.c | 3 --- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 75a1d5f712ee..70ecb7c032cb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -24,7 +24,6 @@ /* * High level FPU state handling functions: */ -extern void fpu__initialize(struct fpu *fpu); extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1940319268ae..e43296854e37 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -223,7 +223,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__initialize(struct fpu *fpu) +static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); @@ -236,7 +236,6 @@ void fpu__initialize(struct fpu *fpu) fpu->initialized = 1; } } -EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. @@ -365,8 +364,8 @@ void fpu__clear(struct fpu *fpu) /* * Make sure fpstate is cleared and initialized. */ + fpu__initialize(fpu); if (static_cpu_has(X86_FEATURE_FPU)) { - fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); } diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 9e2ba7e667f6..a873da6b46d6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -113,9 +113,6 @@ void math_emulate(struct math_emu_info *info) unsigned long code_base = 0; unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; - struct fpu *fpu = &current->thread.fpu; - - fpu__initialize(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { -- cgit v1.2.3-59-g8ed1b From fbcc9e0c37ba3186c41b5eb1aee2f7f3b711bc1e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:34 +0200 Subject: x86/fpu: Remove fpu->initialized usage in copy_fpstate_to_sigframe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With lazy-FPU support, the member now named ->initialized was set to true if the CPU's FPU registers held a valid state for the active process. If it was set to false then the FPU state was saved in fpu->state and the FPU was deactivated. With lazy-FPU gone, ->initialized is always true for user threads, and kernel threads never call this function, so ->initialized is always true in copy_fpstate_to_sigframe(). The using_compacted_format() check is also a leftover from the lazy-FPU time. In the ->initialized == false case copy_to_user() would copy the compacted buffer while userland would expect the non-compacted format instead.
So in order to save the FPU state in the non-compacted form it issues XSAVE to save the *current* FPU state. If the FPU is not enabled, the attempt raises the FPU trap, the trap handler restores the FPU contents and re-enables the FPU, and XSAVE is invoked again and succeeds. *This* no longer works since commit bef8b6da9522 ("x86/fpu: Handle #NM without FPU emulation as an error"). Remove the check for ->initialized because it is always true, and remove the false condition. Update the comment to reflect that the state is always live. [ bp: Massage. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-6-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 7296a9bb78e7..c1a5999affa0 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -144,9 +144,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * Save the state directly to the user frame pointed by the aligned pointer + * 'buf_fx'. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -157,7 +156,6 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct fpu *fpu = &current->thread.fpu; - struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -172,29 +170,12 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->initialized || using_compacted_format()) { - /* Save the live register state to the user directly. */ - if (copy_fpregs_to_sigframe(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - copy_fxregs_to_kernel(fpu); - } else { - /* - * It is a *bug* if kernel uses compacted-format for xsave - * area and we copy it out directly to a signal frame. It - * should have been handled above by saving the registers - * directly. - */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) { - WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n"); - return -1; - } - - fpstate_sanitize_xstate(fpu); - if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) - return -1; - } + /* Save the live registers state to the user frame directly. */ + if (copy_fpregs_to_sigframe(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + copy_fxregs_to_kernel(fpu); /* Save the fsave header for the 32-bit frames.
*/ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) -- cgit v1.2.3-59-g8ed1b From c838b580ca9dcf16b07aa72bb61a914e9db0fd2c Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 5 Apr 2019 22:50:35 +0000 Subject: MIPS: jump_label: Remove redundant nops Both arch_static_branch() & arch_static_branch_jump() emit a control transfer instruction (ie. branch or jump) without disabling assembler re-ordering. As such the assembler will automatically fill their delay slots. Both functions follow their branch or jump with an explicit nop that at first appears to be there to fill the delay slot, but given that the assembler will do that the explicit nops serve no purpose & we end up with our branch or jump followed by 2 nops. Remove the redundant nops. Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org --- arch/mips/include/asm/jump_label.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index e4456e450f94..df04449b261b 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -29,7 +29,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:\t" B_INSN " 2f\n\t" - "2:\tnop\n\t" + "2:\t.insn\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" ".popsection\n\t" @@ -43,7 +43,6 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { asm_volatile_goto("1:\tj %l[l_yes]\n\t" - "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" ".popsection\n\t" -- cgit v1.2.3-59-g8ed1b From 9b6584e35f407a391de39938fd0b4fb29671924f Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 5 Apr 2019 22:50:36 +0000 Subject: MIPS: jump_label: Use compact branches for >= r6 MIPSr6 introduced compact branches which have no delay slots. Make use of them for jump labels in order to avoid the need for a nop to fill the branch or jump delay slot, saving 4 bytes of code for each static branch. 
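[ Editorial illustration, not from the patch: on pre-r6 ISAs every jump label site needs a delay slot after the control transfer, which the assembler fills with a nop, while an r6 compact branch has no delay slot:

	pre-r6:		1:	b	2f
			nop			# delay slot
	MIPSr6:		1:	bc	2f

hence the 4 bytes saved per static branch site. ]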
Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org --- arch/mips/include/asm/jump_label.h | 12 +++++++++--- arch/mips/kernel/jump_label.c | 30 +++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index df04449b261b..3185fd3220ec 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -11,6 +11,7 @@ #ifndef __ASSEMBLY__ #include +#include #define JUMP_LABEL_NOP_SIZE 4 @@ -21,9 +22,14 @@ #endif #ifdef CONFIG_CPU_MICROMIPS -#define B_INSN "b32" +# define B_INSN "b32" +# define J_INSN "j32" +#elif MIPS_ISA_REV >= 6 +# define B_INSN "bc" +# define J_INSN "bc" #else -#define B_INSN "b" +# define B_INSN "b" +# define J_INSN "j" #endif static __always_inline bool arch_static_branch(struct static_key *key, bool branch) @@ -42,7 +48,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\tj %l[l_yes]\n\t" + asm_volatile_goto("1:\t" J_INSN " %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" ".popsection\n\t" diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c index ab943927f97a..662c8db9f45b 100644 --- a/arch/mips/kernel/jump_label.c +++ b/arch/mips/kernel/jump_label.c @@ -40,18 +40,38 @@ void arch_jump_label_transform(struct jump_entry *e, { union mips_instruction *insn_p; union mips_instruction insn; + long offset; insn_p = (union mips_instruction *)msk_isa16_mode(e->code); - /* Jump only works within an aligned region its delay slot is in. */ - BUG_ON((e->target & ~J_RANGE_MASK) != ((e->code + 4) & ~J_RANGE_MASK)); - /* Target must have the right alignment and ISA must be preserved. */ BUG_ON((e->target & J_ALIGN_MASK) != J_ISA_BIT); if (type == JUMP_LABEL_JMP) { - insn.j_format.opcode = J_ISA_BIT ? mm_j32_op : j_op; - insn.j_format.target = e->target >> J_RANGE_SHIFT; + if (!IS_ENABLED(CONFIG_CPU_MICROMIPS) && MIPS_ISA_REV >= 6) { + offset = e->target - ((unsigned long)insn_p + 4); + offset >>= 2; + + /* + * The branch offset must fit in the instruction's 26 + * bit field. + */ + WARN_ON((offset >= BIT(25)) || + (offset < -(long)BIT(25))); + + insn.j_format.opcode = bc6_op; + insn.j_format.target = offset; + } else { + /* + * Jump only works within an aligned region its delay + * slot is in. + */ + WARN_ON((e->target & ~J_RANGE_MASK) != + ((e->code + 4) & ~J_RANGE_MASK)); + + insn.j_format.opcode = J_ISA_BIT ? mm_j32_op : j_op; + insn.j_format.target = e->target >> J_RANGE_SHIFT; + } } else { insn.word = 0; /* nop */ } -- cgit v1.2.3-59-g8ed1b From 3e3d1dfda4d6a17666fe0beae5c56b97b42e2c7a Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 5 Apr 2019 22:50:37 +0000 Subject: MIPS: generic: Enable CONFIG_JUMP_LABEL Enable CONFIG_JUMP_LABEL for generic configs in order to better optimize at runtime and get better test coverage for our jump label support. 
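[ For context, a minimal sketch of the mechanism this config option enables, using the standard static-key API; the key and function names here are hypothetical, illustrative only:

	#include <linux/jump_label.h>

	static DEFINE_STATIC_KEY_FALSE(my_feature);	/* hypothetical key */

	void hot_path(void)
	{
		/*
		 * With CONFIG_JUMP_LABEL=y this check compiles down to a
		 * single runtime-patchable nop/branch instead of a memory
		 * load plus conditional branch.
		 */
		if (static_branch_unlikely(&my_feature))
			handle_rare_case();		/* hypothetical */
	}
]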
Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org --- arch/mips/configs/generic_defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/mips/configs/generic_defconfig b/arch/mips/configs/generic_defconfig index 5d80521e5d5a..714169e411cf 100644 --- a/arch/mips/configs/generic_defconfig +++ b/arch/mips/configs/generic_defconfig @@ -26,6 +26,7 @@ CONFIG_MIPS_CPS=y CONFIG_HIGHMEM=y CONFIG_NR_CPUS=16 CONFIG_MIPS_O32_FP64_SUPPORT=y +CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_TRIM_UNUSED_KSYMS=y -- cgit v1.2.3-59-g8ed1b From 39388e80f9b0c3788bfb6efe3054bdce0c3ead45 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:35 +0200 Subject: x86/fpu: Don't save fxregs for ia32 frames in copy_fpstate_to_sigframe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 72a671ced66db ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") the 32bit and 64bit paths of the signal delivery code were merged. The 32bit version: int save_i387_xstate_ia32(void __user *buf) … if (cpu_has_xsave) return save_i387_xsave(fp); if (cpu_has_fxsr) return save_i387_fxsave(fp); The 64bit version: int save_i387_xstate(void __user *buf) … if (user_has_fpu()) { if (use_xsave()) err = xsave_user(buf); else err = fxsave_user(buf); if (unlikely(err)) { __clear_user(buf, xstate_size); return err; The merge: int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) … if (user_has_fpu()) { /* Save the live register state to the user directly. */ if (save_user_xstate(buf_fx)) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) fpu_fxsave(&tsk->thread.fpu); I don't think that we needed to save the FPU registers to ->thread.fpu because the registers were stored in buf_fx. Today the state will be restored from buf_fx after the signal was handled (I assume that this was also the case with lazy-FPU). Since commit 66463db4fc560 ("x86, fpu: shift drop_init_fpu() from save_xstate_sig() to handle_signal()") it is ensured that the signal handler starts with a clear/fresh set of FPU registers, which means that the previous store is futile. Remove the copy_fxregs_to_kernel() call because the task's FPU state is cleared later in handle_signal() via fpu__clear(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-7-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c1a5999affa0..34989d2a8893 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -155,7 +155,6 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct fpu *fpu = &current->thread.fpu; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -173,9 +172,6 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) /* Save the live registers state to the user frame directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; - /* Update the thread's fxstate to save the fsave header.
*/ - if (ia32_fxstate) - copy_fxregs_to_kernel(fpu); /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) -- cgit v1.2.3-59-g8ed1b From 2722146eb78451b30e4717a267a3a2b44e4ad317 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:36 +0200 Subject: x86/fpu: Remove fpu->initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct fpu.initialized member is always set to one for user tasks and zero for kernel tasks. This avoids saving/restoring the FPU registers for kernel threads. The ->initialized = 0 case for user tasks has been removed in previous changes, for instance, by doing an explicit unconditional init at fork() time for FPU-less systems which was otherwise delayed until the emulated opcode. The context switch code (switch_fpu_prepare() + switch_fpu_finish()) can't unconditionally save/restore registers for kernel threads. Not only would it slow down the switch but also load a zeroed xcomp_bv for XSAVES. For kernel_fpu_begin() (+end) the situation is similar: EFI with runtime services uses this before alternatives_patched is true, which means that this function is now used earlier than was previously the case. For those two cases, use current->mm to distinguish between user and kernel thread. For kernel_fpu_begin() skip save/restore of the FPU registers. During the context switch into a kernel thread don't do anything. There is no reason to save the FPU state of a kernel thread. The reordering in __switch_to() is important because the current() pointer needs to be valid before switch_fpu_finish() is invoked, so that the new task's ->mm is seen instead of the old one's. N.B.: fpu__save() doesn't need to check ->mm because it is called by user tasks only. [ bp: Massage. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Babu Moger Cc: "Chang S. Bae" Cc: Dmitry Safonov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A.
Donenfeld" Cc: Joerg Roedel Cc: kvm ML Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Will Deacon Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-8-bigeasy@linutronix.de --- arch/x86/ia32/ia32_signal.c | 17 ++++----- arch/x86/include/asm/fpu/internal.h | 18 +++++----- arch/x86/include/asm/fpu/types.h | 9 ----- arch/x86/include/asm/trace/fpu.h | 5 +-- arch/x86/kernel/fpu/core.c | 70 +++++++++++-------------------------- arch/x86/kernel/fpu/init.c | 2 -- arch/x86/kernel/fpu/regset.c | 19 +++------- arch/x86/kernel/fpu/xstate.c | 2 -- arch/x86/kernel/process_32.c | 4 +-- arch/x86/kernel/process_64.c | 4 +-- arch/x86/kernel/signal.c | 17 ++++----- arch/x86/mm/pkeys.c | 7 +--- 12 files changed, 53 insertions(+), 121 deletions(-) (limited to 'arch') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 321fe5f5d0e9..6eeb3249f22f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -216,8 +216,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { - struct fpu *fpu = ¤t->thread.fpu; - unsigned long sp; + unsigned long sp, fx_aligned, math_size; /* Default to using normal stack */ sp = regs->sp; @@ -231,15 +230,11 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (fpu->initialized) { - unsigned long fx_aligned, math_size; - - sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); - *fpstate = (struct _fpstate_32 __user *) sp; - if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size) < 0) - return (void __user *) -1L; - } + sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); + *fpstate = (struct _fpstate_32 __user *) sp; + if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, + math_size) < 0) + return (void __user *) -1L; sp -= frame_size; /* Align the stack pointer according to the i386 ABI, diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 70ecb7c032cb..04042eacc852 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -494,11 +494,14 @@ static inline void fpregs_activate(struct fpu *fpu) * * - switch_fpu_finish() restores the new state as * necessary. + * + * The FPU context is only stored/restored for a user task and + * ->mm is used to distinguish between kernel and user threads. */ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) { + if (static_cpu_has(X86_FEATURE_FPU) && current->mm) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else @@ -506,8 +509,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! 
*/ trace_x86_fpu_regs_deactivated(old_fpu); - } else - old_fpu->last_cpu = -1; + } } /* @@ -520,12 +522,12 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - bool preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->initialized; + if (static_cpu_has(X86_FEATURE_FPU)) { + if (!fpregs_state_valid(new_fpu, cpu)) { + if (current->mm) + copy_kernel_to_fpregs(&new_fpu->state); + } - if (preload) { - if (!fpregs_state_valid(new_fpu, cpu)) - copy_kernel_to_fpregs(&new_fpu->state); fpregs_activate(new_fpu); } } diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 2e32e178e064..f098f6cab94b 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -293,15 +293,6 @@ struct fpu { */ unsigned int last_cpu; - /* - * @initialized: - * - * This flag indicates whether this context is initialized: if the task - * is not running then we can restore from this context, if the task - * is running then we should save into this context. - */ - unsigned char initialized; - /* * @avx512_timestamp: * diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 069c04be1507..bd65f6ba950f 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -13,22 +13,19 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, initialized) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; - __entry->initialized = fpu->initialized; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->initialized, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e43296854e37..97e27de2b7c0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -101,7 +101,7 @@ static void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->initialized) { + if (current->mm) { /* * Ignore return value -- we don't care if reg state * is clobbered. 
@@ -116,7 +116,7 @@ static void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->initialized) + if (current->mm) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -147,11 +147,10 @@ void fpu__save(struct fpu *fpu) preempt_disable(); trace_x86_fpu_before_save(fpu); - if (fpu->initialized) { - if (!copy_fpregs_to_fpstate(fpu)) { - copy_kernel_to_fpregs(&fpu->state); - } - } + + if (!copy_fpregs_to_fpstate(fpu)) + copy_kernel_to_fpregs(&fpu->state); + trace_x86_fpu_after_save(fpu); preempt_enable(); } @@ -190,7 +189,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { dst_fpu->last_cpu = -1; - if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) + if (!static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -227,14 +226,10 @@ static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!fpu->initialized) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); + fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); - trace_x86_fpu_activate_state(fpu); - /* Safe to do for the current task: */ - fpu->initialized = 1; - } + trace_x86_fpu_activate_state(fpu); } /* @@ -247,32 +242,20 @@ static void fpu__initialize(struct fpu *fpu) * * - or it's called for stopped tasks (ptrace), in which case the * registers were already saved by the context-switch code when - * the task scheduled out - we only have to initialize the registers - * if they've never been initialized. + * the task scheduled out. * * If the task has used the FPU before then save it. */ void fpu__prepare_read(struct fpu *fpu) { - if (fpu == ¤t->thread.fpu) { + if (fpu == ¤t->thread.fpu) fpu__save(fpu); - } else { - if (!fpu->initialized) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for current and for stopped child tasks: */ - fpu->initialized = 1; - } - } } /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then invalidate any cached FPU registers. - * If the task has not used the FPU before then initialize its fpstate. + * Invalidate any cached FPU registers. 
* * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we * didn't clear its cached status here then the cached in-registers * state pending on its former CPU could be restored, corrupting * the new state. */ void fpu__prepare_write(struct fpu *fpu) { /* * Only stopped child tasks can be used to modify the FPU * state in the fpstate buffer: */ WARN_ON_FPU(fpu == &current->thread.fpu); - if (fpu->initialized) { - /* Invalidate any cached state: */ - __fpu_invalidate_fpregs_state(fpu); - } else { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for stopped child tasks: */ - fpu->initialized = 1; - } + /* Invalidate any cached state: */ + __fpu_invalidate_fpregs_state(fpu); } /* @@ -316,17 +290,13 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == &current->thread.fpu) { - if (fpu->initialized) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); - } + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); } - fpu->initialized = 0; - trace_x86_fpu_dropped(fpu); preempt_enable(); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6abd83572b01..20d8fa7124c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -239,8 +239,6 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - - WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 5dbc099178a8..d652b939ccfb 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -15,16 +15,12 @@ */ int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { - struct fpu *target_fpu = &target->thread.fpu; - - return target_fpu->initialized ? regset->n : 0; + return regset->n; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { - struct fpu *target_fpu = &target->thread.fpu; - - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) + if (boot_cpu_has(X86_FEATURE_FXSR)) return regset->n; else return 0; @@ -370,16 +366,9 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) { struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - int fpvalid; - - fpvalid = fpu->initialized; - if (fpvalid) - fpvalid = !fpregs_get(tsk, NULL, - 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); - return fpvalid; + return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), + ufpu, NULL); } EXPORT_SYMBOL(dump_fpu); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d7432c2b1051..8bfcc5b9e04b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -892,8 +892,6 @@ const void *get_xsave_field_ptr(int xsave_state) { struct fpu *fpu = &current->thread.fpu; - if (!fpu->initialized) - return NULL; /* * fpu__save() takes the CPU's xstate registers * and saves them off to the 'fpu memory buffer.
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7888a41a03cd..77d9eb43ccac 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -288,10 +288,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_fpu, cpu); - this_cpu_write(current_task, next_p); + switch_fpu_finish(next_fpu, cpu); + /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e1983b3a16c4..ffea7c557963 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -566,14 +566,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) x86_fsgsbase_load(prev, next); - switch_fpu_finish(next_fpu, cpu); - /* * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + switch_fpu_finish(next_fpu, cpu); + /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b419e1a1a0ce..87b327c6cb10 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -246,7 +246,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long sp = regs->sp; unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); - struct fpu *fpu = &current->thread.fpu; + int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -265,11 +265,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->initialized) { - sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), - &buf_fx, &math_size); - *fpstate = (void __user *)sp; - } + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), + &buf_fx, &math_size); + *fpstate = (void __user *)sp; sp = align_sigframe(sp - frame_size); @@ -281,8 +279,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->initialized && - copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) + ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); + if (ret < 0) return (void __user *)-1L; return (void __user *)sp; @@ -763,8 +761,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (fpu->initialized) - fpu__clear(fpu); + fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); } diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 047a77f6a10c..05bb9a44eb1c 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -39,17 +39,12 @@ int __execute_only_pkey(struct mm_struct *mm) * dance to set PKRU if we do not need to. Check it * first and assume that if the execute-only pkey is * write-disabled that we do not have to set it - * ourselves. We need preempt off so that nobody - * can make fpregs inactive.
*/ - preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { - preempt_enable(); return execute_only_pkey; } - preempt_enable(); /* * Set up PKRU so that it denies access for everything -- cgit v1.2.3-59-g8ed1b From 0169f53e0d97bb675075506810494bd86b8c934e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:37 +0200 Subject: x86/fpu: Remove user_fpu_begin() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit user_fpu_begin() sets fpu_fpregs_owner_ctx to the task's fpu struct. This is always the case since there is no lazy FPU anymore. fpu_fpregs_owner_ctx is used during context switch to decide if it needs to load the saved registers or if the currently loaded registers are valid. It could be skipped during a taskA -> kernel thread -> taskA switch because the switch to the kernel thread would not alter the CPU's FPU state. Since this field is always updated during context switch and never invalidated, setting it manually (in user context) makes no difference. A kernel thread with a kernel_fpu_begin() block could set fpu_fpregs_owner_ctx to NULL, but a kernel thread does not use user_fpu_begin(). This is a leftover from the lazy-FPU time. Remove user_fpu_begin(); it does not change fpu_fpregs_owner_ctx's content. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-9-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 17 ----------------- arch/x86/kernel/fpu/core.c | 4 +--- arch/x86/kernel/fpu/signal.c | 1 - 3 files changed, 1 insertion(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 04042eacc852..54f70cae2f15 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -532,23 +532,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) } } -/* - * Needs to be preemption-safe. - * - * NOTE! user_fpu_begin() must be used only immediately before restoring - * the save state. It does not do any saving/restoring on its own. In - * lazy FPU mode, it is just an optimization to avoid a #NM exception, - * the task can lose the FPU right after preempt_enable(). - */ -static inline void user_fpu_begin(void) -{ - struct fpu *fpu = &current->thread.fpu; - - preempt_disable(); - fpregs_activate(fpu); - preempt_enable(); -} - /* * MXCSR and XCR definitions: */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97e27de2b7c0..739ca3ae2bdc 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -335,10 +335,8 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized.
*/ fpu__initialize(fpu); - if (static_cpu_has(X86_FEATURE_FPU)) { - user_fpu_begin(); + if (static_cpu_has(X86_FEATURE_FPU)) copy_init_fpstate_to_fpregs(); - } } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 34989d2a8893..155f4552413e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -322,7 +322,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * For 64-bit frames and 32-bit fsave frames, restore the user * state to the registers directly (with exceptions handled). */ - user_fpu_begin(); if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { fpu__clear(fpu); return -1; -- cgit v1.2.3-59-g8ed1b From 4ee91519e1dccc175665fe24bb20a47c6053575c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:38 +0200 Subject: x86/fpu: Add an __fpregs_load_activate() internal helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper function that ensures the floating point registers for the current task are active. Use with preemption disabled. While at it, add fpregs_lock/unlock() helpers too, to be used in later patches. [ bp: Add a comment about its intended usage. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-10-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/api.h | 11 +++++++++++ arch/x86/include/asm/fpu/internal.h | 22 ++++++++++++++-------- 2 files changed, 25 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b56d504af654..73e684160f35 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,6 +10,7 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H +#include /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It @@ -22,6 +23,16 @@ extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); +static inline void fpregs_lock(void) +{ + preempt_disable(); +} + +static inline void fpregs_unlock(void) +{ + preempt_enable(); +} + /* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. * diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 54f70cae2f15..3e0c2c496f2d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -484,6 +484,18 @@ static inline void fpregs_activate(struct fpu *fpu) trace_x86_fpu_regs_activated(fpu); } +/* + * Internal helper, do not use directly. Use switch_fpu_return() instead. + */ +static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) +{ + if (!fpregs_state_valid(fpu, cpu)) { + if (current->mm) + copy_kernel_to_fpregs(&fpu->state); + fpregs_activate(fpu); + } +} + /* * FPU state switching for scheduling. 
* @@ -522,14 +534,8 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU)) { - if (!fpregs_state_valid(new_fpu, cpu)) { - if (current->mm) - copy_kernel_to_fpregs(&new_fpu->state); - } - - fpregs_activate(new_fpu); - } + if (static_cpu_has(X86_FEATURE_FPU)) + __fpregs_load_activate(new_fpu, cpu); } /* -- cgit v1.2.3-59-g8ed1b From 07baeb04f37c952981d63359ff840118ce8f5434 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:39 +0200 Subject: x86/fpu: Make __raw_xsave_addr() use a feature number instead of mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most users of __raw_xsave_addr() use a feature number, shift it to a mask and then __raw_xsave_addr() shifts it back to the feature number. Make __raw_xsave_addr() use the feature number as an argument. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-11-bigeasy@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8bfcc5b9e04b..4f7f3c5d0d0c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -805,20 +805,18 @@ void fpu__resume_cpu(void) } /* - * Given an xstate feature mask, calculate where in the xsave + * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. */ -static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) +static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - int feature_nr = fls64(xstate_feature_mask) - 1; - - if (!xfeature_enabled(feature_nr)) { + if (!xfeature_enabled(xfeature_nr)) { WARN_ON_FPU(1); return NULL; } - return (void *)xsave + xstate_comp_offsets[feature_nr]; + return (void *)xsave + xstate_comp_offsets[xfeature_nr]; } /* * Given the xsave area and a state inside, this function returns the @@ -840,6 +838,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask */ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) { + int xfeature_nr; /* * Do we even *have* xsave state? 
*/ @@ -867,7 +866,8 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!(xsave->header.xfeatures & xstate_feature)) return NULL; - return __raw_xsave_addr(xsave, xstate_feature); + xfeature_nr = fls64(xstate_feature) - 1; + return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); @@ -1014,7 +1014,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, 1 << i); + void *src = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1100,7 +1100,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, 1 << i); + void *src = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1157,7 +1157,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) u64 mask = ((u64)1 << i); if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, 1 << i); + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1211,7 +1211,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) u64 mask = ((u64)1 << i); if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, 1 << i); + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; -- cgit v1.2.3-59-g8ed1b From abd16d68d65229e5acafdadc32704239131bf2ea Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:40 +0200 Subject: x86/fpu: Use a feature number instead of mask in two more helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After changing the argument of __raw_xsave_addr() from a mask to number Dave suggested to check if it makes sense to do the same for get_xsave_addr(). As it turns out it does. Only get_xsave_addr() needs the mask to check if the requested feature is part of what is supported/saved and then uses the number again. The shift operation is cheaper compared to fls64() (find last bit set). Also, the feature number uses less opcode space compared to the mask. :) Make the get_xsave_addr() argument a xfeature number instead of a mask and fix up its callers. Furthermore, use xfeature_nr and xfeature_mask consistently. This results in the following changes to the kvm code: feature -> xfeature_mask index -> xfeature_nr Suggested-by: Dave Hansen Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: kvm ML Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: "Radim Krčmář" Cc: Rasmus Villemoes Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Siarhei Liakh Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-12-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/xstate.c | 22 ++++++++++------------ arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/x86.c | 28 ++++++++++++++-------------- arch/x86/mm/mpx.c | 6 +++--- 5 files changed, 30 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 48581988d78c..fbe41f808e5d 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -46,8 +46,8 @@ extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); void fpu__xstate_clear_all_cpu_caps(void); -void *get_xsave_addr(struct xregs_state *xsave, int xstate); -const void *get_xsave_field_ptr(int xstate_field); +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); +const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4f7f3c5d0d0c..9c459fd1d38e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -830,15 +830,14 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * * Inputs: * xstate: the thread's storage area for all FPU data - * xstate_feature: state which is defined in xsave.h (e.g. - * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) + * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ -void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - int xfeature_nr; /* * Do we even *have* xsave state? */ @@ -850,11 +849,11 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) * have not enabled. Remember that pcntxt_mask is * what we write to the XCR0 register. */ - WARN_ONCE(!(xfeatures_mask & xstate_feature), + WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to - * have requested that 'xstate_feature' be saved. + * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * @@ -863,10 +862,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) * or because the "init optimization" caused it * to not be saved. */ - if (!(xsave->header.xfeatures & xstate_feature)) + if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; - xfeature_nr = fls64(xstate_feature) - 1; return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); @@ -882,13 +880,13 @@ EXPORT_SYMBOL_GPL(get_xsave_addr); * Note that this only works on the current task. * * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) + * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) 
* Output: * address of the state in the xsave area or NULL if the state * is not present or is in its 'init state'. */ -const void *get_xsave_field_ptr(int xsave_state) +const void *get_xsave_field_ptr(int xfeature_nr) { struct fpu *fpu = &current->thread.fpu; @@ -898,7 +896,7 @@ const void *get_xsave_field_ptr(int xsave_state) */ fpu__save(fpu); - return get_xsave_addr(&fpu->state.xsave, xsave_state); + return get_xsave_addr(&fpu->state.xsave, xfeature_nr); } #ifdef CONFIG_ARCH_HAS_PKEYS diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d26f9e9c3d83..8b6d03e55d2f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -456,7 +456,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * which is all zeros which indicates MPX was not * responsible for the exception. */ - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) goto exit_trap; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 099b851dabaf..8022e7769b3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3674,15 +3674,15 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *src = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *src = get_xsave_addr(xsave, xfeature_nr); if (src) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(dest + offset, &vcpu->arch.pkru, sizeof(vcpu->arch.pkru)); else @@ -3690,7 +3690,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) } - valid -= feature; + valid -= xfeature_mask; } } @@ -3717,22 +3717,22 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *dest = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *dest = get_xsave_addr(xsave, xfeature_nr); if (dest) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(&vcpu->arch.pkru, src + offset, sizeof(vcpu->arch.pkru)); else memcpy(dest, src + offset, size); } - valid -= feature; + valid -= xfeature_mask; } } @@ -8850,11 +8850,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (init_event) kvm_put_guest_fpu(vcpu); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_MASK_BNDREGS); + XFEATURE_BNDREGS); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_MASK_BNDCSR); + XFEATURE_BNDCSR); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); if (init_event) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index c805db6236b4..59726aaf4671 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -142,7 +142,7 @@ int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs) goto err_out; } /* get bndregs field from current task's xsave area */ - bndregs =
get_xsave_field_ptr(XFEATURE_MASK_BNDREGS); + bndregs = get_xsave_field_ptr(XFEATURE_BNDREGS); if (!bndregs) { err = -EINVAL; goto err_out; @@ -190,7 +190,7 @@ static __user void *mpx_get_bounds_dir(void) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. */ - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -376,7 +376,7 @@ static int do_mpx_bt_fault(void) const struct mpx_bndcsr *bndcsr; struct mm_struct *mm = current->mm; - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) return -EINVAL; /* -- cgit v1.2.3-59-g8ed1b From 3d45ad9260c35c597706847e196aae8d966a574f Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 3 Apr 2019 22:12:17 -0400 Subject: x86/ima: add missing include As reported by 0-DAY kernel test infrastructure: arch/x86//kernel/ima_arch.c: In function 'arch_get_ima_policy': >> arch/x86//kernel/ima_arch.c:78:4: error: implicit declaration of function 'set_module_sig_enforced' [-Werror=implicit-function-declaration] Signed-off-by: Mimi Zohar --- arch/x86/kernel/ima_arch.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 3fb9847f1cad..85de790583f9 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -3,6 +3,7 @@ * Copyright (C) 2018 IBM Corporation */ #include +#include #include extern struct boot_params boot_params; -- cgit v1.2.3-59-g8ed1b From 88ec6b93c8e7d6d4ffaf6ad6395ceb3bf552de15 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Wed, 10 Apr 2019 19:04:33 +0200 Subject: powerpc/xive: add OPAL extensions for the XIVE native exploitation support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The support for XIVE native exploitation mode in Linux/KVM needs a couple more OPAL calls to get and set the state of the XIVE internal structures being used by a sPAPR guest. 
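[ A hedged usage sketch of the helpers this patch exports — roughly how a KVM migration path might save and restore one event queue; only the xive_native_* names come from the patch, the surrounding code is illustrative:

	u32 qtoggle, qindex;
	int rc;

	rc = xive_native_get_queue_state(vp_id, prio, &qtoggle, &qindex);
	if (rc)
		return rc;
	/* ... transfer qtoggle/qindex as part of the guest state ... */
	rc = xive_native_set_queue_state(vp_id, prio, qtoggle, qindex);
	if (rc)
		return rc;
]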
Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 7 ++- arch/powerpc/include/asm/opal.h | 7 +++ arch/powerpc/include/asm/xive.h | 14 +++++ arch/powerpc/platforms/powernv/opal-call.c | 3 + arch/powerpc/sysdev/xive/native.c | 99 ++++++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b239ea..e1d118ac61dc 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -186,8 +186,8 @@ #define OPAL_XIVE_FREE_IRQ 140 #define OPAL_XIVE_SYNC 141 #define OPAL_XIVE_DUMP 142 -#define OPAL_XIVE_RESERVED3 143 -#define OPAL_XIVE_RESERVED4 144 +#define OPAL_XIVE_GET_QUEUE_STATE 143 +#define OPAL_XIVE_SET_QUEUE_STATE 144 #define OPAL_SIGNAL_SYSTEM_RESET 145 #define OPAL_NPU_INIT_CONTEXT 146 #define OPAL_NPU_DESTROY_CONTEXT 147 @@ -210,7 +210,8 @@ #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 #define OPAL_NX_COPROC_INIT 167 -#define OPAL_LAST 167 +#define OPAL_XIVE_GET_VP_STATE 170 +#define OPAL_LAST 170 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index a55b01c90bb1..4e978d4dea5c 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -279,6 +279,13 @@ int64_t opal_xive_allocate_irq(uint32_t chip_id); int64_t opal_xive_free_irq(uint32_t girq); int64_t opal_xive_sync(uint32_t type, uint32_t id); int64_t opal_xive_dump(uint32_t type, uint32_t id); +int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio, + __be32 *out_qtoggle, + __be32 *out_qindex); +int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio, + uint32_t qtoggle, + uint32_t qindex); +int64_t opal_xive_get_vp_state(uint64_t vp, __be64 *out_w01); int64_t opal_pci_set_p2p(uint64_t phb_init, uint64_t phb_target, uint64_t desc, uint16_t pe_number); diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 3c704f5dd3ae..b579a943407b 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -109,12 +109,26 @@ extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); extern void xive_native_sync_source(u32 hw_irq); +extern void xive_native_sync_queue(u32 hw_irq); extern bool is_xive_irq(struct irq_chip *chip); extern int xive_native_enable_vp(u32 vp_id, bool single_escalation); extern int xive_native_disable_vp(u32 vp_id); extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); extern bool xive_native_has_single_escalation(void); +extern int xive_native_get_queue_info(u32 vp_id, uint32_t prio, + u64 *out_qpage, + u64 *out_qsize, + u64 *out_qeoi_page, + u32 *out_escalate_irq, + u64 *out_qflags); + +extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle, + u32 *qindex); +extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, + u32 qindex); +extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state); + #else static inline bool xive_enabled(void) { return false; } diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index daad8c45c8e7..7472244e7f30 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ 
b/arch/powerpc/platforms/powernv/opal-call.c @@ -260,6 +260,9 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_xive_get_queue_state, OPAL_XIVE_GET_QUEUE_STATE); +OPAL_CALL(opal_xive_set_queue_state, OPAL_XIVE_SET_QUEUE_STATE); +OPAL_CALL(opal_xive_get_vp_state, OPAL_XIVE_GET_VP_STATE); OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 1ca127d052a6..0c037e933e55 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -437,6 +437,12 @@ void xive_native_sync_source(u32 hw_irq) } EXPORT_SYMBOL_GPL(xive_native_sync_source); +void xive_native_sync_queue(u32 hw_irq) +{ + opal_xive_sync(XIVE_SYNC_QUEUE, hw_irq); +} +EXPORT_SYMBOL_GPL(xive_native_sync_queue); + static const struct xive_ops xive_native_ops = { .populate_irq_data = xive_native_populate_irq_data, .configure_irq = xive_native_configure_irq, @@ -711,3 +717,96 @@ bool xive_native_has_single_escalation(void) return xive_has_single_esc; } EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); + +int xive_native_get_queue_info(u32 vp_id, u32 prio, + u64 *out_qpage, + u64 *out_qsize, + u64 *out_qeoi_page, + u32 *out_escalate_irq, + u64 *out_qflags) +{ + __be64 qpage; + __be64 qsize; + __be64 qeoi_page; + __be32 escalate_irq; + __be64 qflags; + s64 rc; + + rc = opal_xive_get_queue_info(vp_id, prio, &qpage, &qsize, + &qeoi_page, &escalate_irq, &qflags); + if (rc) { + pr_err("OPAL failed to get queue info for VCPU %d/%d : %lld\n", + vp_id, prio, rc); + return -EIO; + } + + if (out_qpage) + *out_qpage = be64_to_cpu(qpage); + if (out_qsize) + *out_qsize = be32_to_cpu(qsize); + if (out_qeoi_page) + *out_qeoi_page = be64_to_cpu(qeoi_page); + if (out_escalate_irq) + *out_escalate_irq = be32_to_cpu(escalate_irq); + if (out_qflags) + *out_qflags = be64_to_cpu(qflags); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_queue_info); + +int xive_native_get_queue_state(u32 vp_id, u32 prio, u32 *qtoggle, u32 *qindex) +{ + __be32 opal_qtoggle; + __be32 opal_qindex; + s64 rc; + + rc = opal_xive_get_queue_state(vp_id, prio, &opal_qtoggle, + &opal_qindex); + if (rc) { + pr_err("OPAL failed to get queue state for VCPU %d/%d : %lld\n", + vp_id, prio, rc); + return -EIO; + } + + if (qtoggle) + *qtoggle = be32_to_cpu(opal_qtoggle); + if (qindex) + *qindex = be32_to_cpu(opal_qindex); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_queue_state); + +int xive_native_set_queue_state(u32 vp_id, u32 prio, u32 qtoggle, u32 qindex) +{ + s64 rc; + + rc = opal_xive_set_queue_state(vp_id, prio, qtoggle, qindex); + if (rc) { + pr_err("OPAL failed to set queue state for VCPU %d/%d : %lld\n", + vp_id, prio, rc); + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_set_queue_state); + +int xive_native_get_vp_state(u32 vp_id, u64 *out_state) +{ + __be64 state; + s64 rc; + + rc = opal_xive_get_vp_state(vp_id, &state); + if (rc) { + pr_err("OPAL failed to get vp state for VCPU %d : %lld\n", + vp_id, rc); + return -EIO; + } + + if (out_state) + *out_state = be64_to_cpu(state); + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_vp_state); -- cgit v1.2.3-59-g8ed1b From c806e88734b9e9aea260bf2261c129aa23968fca Mon Sep 17 00:00:00 2001 From: Sebastian 
Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:41 +0200 Subject: x86/pkeys: Provide *pkru() helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dave Hansen asked for __read_pkru() and __write_pkru() to be symmetrical. As part of the series __write_pkru() will read back the value and only write it if it is different. In order to make both functions symmetrical, move the function containing only the opcode asm into a function called like the instruction itself. __write_pkru() will just invoke wrpkru() but in a follow-up patch will also read back the value. [ bp: Convert asm opcode wrapper names to rd/wrpkru(). ] Suggested-by: Dave Hansen Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: kvm ML Cc: Michal Hocko Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-13-bigeasy@linutronix.de --- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/include/asm/special_insns.h | 12 +++++++++--- arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2779ace16d23..e8875ca75623 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -127,7 +127,7 @@ static inline int pte_dirty(pte_t pte) static inline u32 read_pkru(void) { if (boot_cpu_has(X86_FEATURE_OSPKE)) - return __read_pkru(); + return rdpkru(); return 0; } diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 43c029cdc3fe..34897e2b52c9 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -92,7 +92,7 @@ static inline void native_write_cr8(unsigned long val) #endif #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -static inline u32 __read_pkru(void) +static inline u32 rdpkru(void) { u32 ecx = 0; u32 edx, pkru; @@ -107,7 +107,7 @@ static inline u32 __read_pkru(void) return pkru; } -static inline void __write_pkru(u32 pkru) +static inline void wrpkru(u32 pkru) { u32 ecx = 0, edx = 0; @@ -118,8 +118,14 @@ static inline void __write_pkru(u32 pkru) asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (pkru), "c"(ecx), "d"(edx)); } + +static inline void __write_pkru(u32 pkru) +{ + wrpkru(pkru); +} + #else -static inline u32 __read_pkru(void) +static inline u32 rdpkru(void) { return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab432a930ae8..dd1e1eea4f0b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6501,7 +6501,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = __read_pkru(); + vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); } -- cgit v1.2.3-59-g8ed1b From 577ff465f5a6a5a0866d75a033844810baca20a0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:42 +0200 Subject: x86/fpu: Only write PKRU if it is different from current MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Dave Hansen, WRPKRU is more expensive than RDPKRU. It has a higher cycle cost and it's also practically a (light) speculation barrier. 
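[ Recap of the helpers from the previous patch, both thin wrappers around the raw opcodes (editorial sketch):

	u32 pkru = rdpkru();	/* RDPKRU: cheap */
	wrpkru(pkru);		/* WRPKRU: higher cycle cost, light speculation barrier */
]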
As an optimisation read the current PKRU value and only write the new one if it is different. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: Juergen Gross Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-14-bigeasy@linutronix.de --- arch/x86/include/asm/special_insns.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 34897e2b52c9..0a3c4cab39db 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -121,6 +121,13 @@ static inline void wrpkru(u32 pkru) static inline void __write_pkru(u32 pkru) { + /* + * WRPKRU is relatively expensive compared to RDPKRU. + * Avoid WRPKRU when it would not change the value. + */ + if (pkru == rdpkru()) + return; + wrpkru(pkru); } -- cgit v1.2.3-59-g8ed1b From 0556cbdc2fbcb3068e5b924a8b3d5386ae0dd27d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:43 +0200 Subject: x86/pkeys: Don't check if PKRU is zero before writing it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit write_pkru() checks if the current value is the same as the expected value. So instead of just checking if the current and new value is zero (and skip the write in such a case) we can benefit from that. Remove the zero check of PKRU, __write_pkru() provides such a check now. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-15-bigeasy@linutronix.de --- arch/x86/mm/pkeys.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 05bb9a44eb1c..50f65fc1b9a3 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -142,13 +142,6 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | void copy_init_pkru_to_fpregs(void) { u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); - /* - * Any write to PKRU takes it out of the XSAVE 'init - * state' which increases context switch cost. Avoid - * writing 0 when PKRU was already 0. - */ - if (!init_pkru_value_snapshot && !read_pkru()) - return; /* * Override the PKRU state that came from 'init_fpstate' * with the baseline from the process. -- cgit v1.2.3-59-g8ed1b From 0cecca9d03c964abbd2b7927d0670eb70db4ebf2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:44 +0200 Subject: x86/fpu: Eager switch PKRU state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While most of a task's FPU state is only needed in user space, the protection keys need to be in place immediately after a context switch. The reason is that any access to userspace memory while running in kernel mode also needs to abide by the memory permissions specified in the protection keys. The "eager switch" is a preparation for loading the FPU state on return to userland. Instead of decoupling PKRU state from xstate, update PKRU within xstate on write operations by the kernel. 
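[ An editorial sketch of why the eager switch matters, using the user-space pkey syscalls; the buffer and fd are hypothetical, illustrative only:

	int pkey = pkey_alloc(0, PKEY_DISABLE_ACCESS);
	pkey_mprotect(addr, len, PROT_READ | PROT_WRITE, pkey);

	/* A later read(fd, addr, len) makes the kernel's own
	 * copy_to_user() subject to the same PKRU restriction, so the
	 * task's PKRU value must already be loaded whenever the kernel
	 * touches user memory on the task's behalf. */
]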
For user tasks the PKRU value should always be read from the xsave area; this should not change anything because the PKRU value was loaded as part of the FPU restore. For kernel threads the default "init_pkru_value" will be written. Before this commit, a kernel thread would end up with a random value which it inherited from the previous user task. [ bigeasy: save pkru to xstate, no cache, don't use __raw_xsave_addr() ] [ bp: update commit message, sort headers properly in asm/fpu/xstate.h ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: kvm ML Cc: Michal Hocko Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-16-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 24 ++++++++++++++++++++++-- arch/x86/include/asm/fpu/xstate.h | 4 +++- arch/x86/include/asm/pgtable.h | 6 ++++++ arch/x86/mm/pkeys.c | 1 - 4 files changed, 31 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3e0c2c496f2d..6eb4a0b1ad0e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -534,8 +535,27 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU)) - __fpregs_load_activate(new_fpu, cpu); + u32 pkru_val = init_pkru_value; + struct pkru_state *pk; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + __fpregs_load_activate(new_fpu, cpu); + + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + /* + * PKRU state is switched eagerly because it needs to be valid before we + * return to userland e.g. for a copy_to_user() operation.
+ */ + if (current->mm) { + pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); + if (pk) + pkru_val = pk->pkru; + } + __write_pkru(pkru_val); } /* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fbe41f808e5d..7e42b285c856 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -2,9 +2,11 @@ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H +#include #include + #include -#include +#include /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e8875ca75623..9beb371b1adf 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1355,6 +1355,12 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #define PKRU_WD_BIT 0x2 #define PKRU_BITS_PER_PKEY 2 +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +extern u32 init_pkru_value; +#else +#define init_pkru_value 0 +#endif + static inline bool __pkru_allows_read(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 50f65fc1b9a3..2ecbf4155f98 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -126,7 +126,6 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey * in the process's lifetime will not accidentally get access * to data which is pkey-protected later on. */ -static u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | -- cgit v1.2.3-59-g8ed1b From 383c252545edcc708128e2028a2318b05c45ede4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:45 +0200 Subject: x86/entry: Add TIF_NEED_FPU_LOAD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add TIF_NEED_FPU_LOAD. This flag is used for loading the FPU registers before returning to userland. It must not be set on systems without an FPU. If this flag is cleared, the CPU's FPU registers hold the latest, up-to-date content of the current task's (current()) FPU registers. The in-memory copy (union fpregs_state) is not valid. If this flag is set, then all of the CPU's FPU registers may hold a random value (except for PKRU) and it is required to load the content of the FPU registers on return to userland. Introduce it now as a preparatory change before adding the main feature. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: Tim Chen Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-17-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 8 ++++++++ arch/x86/include/asm/thread_info.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 6eb4a0b1ad0e..da75d7b3e37d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -508,6 +508,14 @@ static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) * - switch_fpu_finish() restores the new state as * necessary.
* + * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers + * are saved in the current thread's FPU register state. + * + * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not + * hold current()'s FPU registers. It is required to load the + * registers before returning to userland or using the content + * otherwise. + * * The FPU context is only stored/restored for a user task and * ->mm is used to distinguish between kernel and user threads. */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e0eccbcb8447..f9453536f9bb 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -88,6 +88,7 @@ struct thread_info { #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ +#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ @@ -117,6 +118,7 @@ struct thread_info { #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) +#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) -- cgit v1.2.3-59-g8ed1b From 69277c98f5eef0d9839699b7825923c3985f665f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:46 +0200 Subject: x86/fpu: Always store the registers in copy_fpstate_to_sigframe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit copy_fpstate_to_sigframe() stores the registers directly to user space. This is okay because the FPU registers are valid and saving them directly avoids saving them into kernel memory and making a copy. However, this cannot be done anymore if the FPU registers are going to be restored on the return to userland. It is possible that the FPU registers will be invalidated in the middle of the save operation and this should be done with disabled preemption / BH. Save the FPU registers to the task's FPU struct and copy them to the user memory later on. Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-18-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 155f4552413e..8f23f5237218 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -144,8 +144,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Save the state directly to the user frame pointed by the aligned pointer - * 'buf_fx'. + * Save the state to task's fpu->state and then copy it to the user frame + * pointed to by the aligned pointer 'buf_fx'. 
* * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -155,6 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { + struct fpu *fpu = ¤t->thread.fpu; + struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -169,9 +171,16 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - /* Save the live registers state to the user frame directly. */ - if (copy_fpregs_to_sigframe(buf_fx)) - return -1; + copy_fpregs_to_fpstate(fpu); + + if (using_compacted_format()) { + if (copy_xstate_to_user(buf_fx, xsave, 0, size)) + return -1; + } else { + fpstate_sanitize_xstate(fpu); + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) + return -1; + } /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) -- cgit v1.2.3-59-g8ed1b From a352a3b7b7920212ee4c45a41500c66826318e92 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:47 +0200 Subject: x86/fpu: Prepare copy_fpstate_to_sigframe() for TIF_NEED_FPU_LOAD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FPU registers need only to be saved if TIF_NEED_FPU_LOAD is not set. Otherwise this has been already done and can be skipped. [ bp: Massage a bit. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-19-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8f23f5237218..9b9dfdc96285 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,17 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - copy_fpregs_to_fpstate(fpu); + /* + * If we do not need to load the FPU registers at return to userspace + * then the CPU has the current state and we need to save it. Otherwise, + * it has already been done and we can skip it. + */ + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + copy_fpregs_to_fpstate(fpu); + set_thread_flag(TIF_NEED_FPU_LOAD); + } + fpregs_unlock(); if (using_compacted_format()) { if (copy_xstate_to_user(buf_fx, xsave, 0, size)) -- cgit v1.2.3-59-g8ed1b From 0d714dba162620fd8b9f5b3104a487e041353c4d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:48 +0200 Subject: x86/fpu: Update xstate's PKRU value on write_pkru() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the context switch the xstate is loaded which also includes the PKRU value. If xstate is restored on return to userland it is required that the PKRU value in xstate is the same as the one in the CPU. Save the PKRU in xstate during modification. 
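A toy model of why this ordering matters (illustrative only; the names are invented and this is not kernel code): if only the register stand-in is written, the later restore from the in-memory copy silently undoes the update, which is exactly what keeping both copies in sync prevents.

#include <assert.h>
#include <stdint.h>

static uint32_t pkru_reg;	/* stands in for the PKRU register         */
static uint32_t xstate_pkru;	/* stands in for the PKRU slot in xstate   */

/* Models the patched write_pkru(): update the in-memory copy together
 * with the register so both stay in sync. */
static void write_pkru_synced(uint32_t val)
{
	xstate_pkru = val;
	pkru_reg = val;
}

/* Models the FPU restore on return to userland: the register is
 * reloaded from the in-memory xstate. */
static void restore_from_xstate(void)
{
	pkru_reg = xstate_pkru;
}

int main(void)
{
	write_pkru_synced(0x55555554);
	restore_from_xstate();
	/* The write survives the restore; updating only pkru_reg would
	 * have reverted it to the stale xstate value here. */
	assert(pkru_reg == 0x55555554);
	return 0;
}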
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: kvm ML Cc: Michal Hocko Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-20-bigeasy@linutronix.de --- arch/x86/include/asm/pgtable.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9beb371b1adf..5cfbbb6d458d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -23,6 +23,8 @@ #ifndef __ASSEMBLY__ #include +#include +#include extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -133,8 +135,23 @@ static inline u32 read_pkru(void) static inline void write_pkru(u32 pkru) { - if (boot_cpu_has(X86_FEATURE_OSPKE)) - __write_pkru(pkru); + struct pkru_state *pk; + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); + + /* + * The PKRU value in xstate needs to be in sync with the value that is + * written to the CPU. The FPU restore on return to userland would + * otherwise load the previous value again. + */ + fpregs_lock(); + if (pk) + pk->pkru = pkru; + __write_pkru(pkru); + fpregs_unlock(); } static inline int pte_young(pte_t pte) -- cgit v1.2.3-59-g8ed1b From e0d3602f933367881bddfff310a744e6e61c284c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:49 +0200 Subject: x86/fpu: Inline copy_user_to_fpregs_zeroing() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start refactoring __fpu__restore_sig() by inlining copy_user_to_fpregs_zeroing(). The original function remains and will be used to restore from userland memory if possible. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-21-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9b9dfdc96285..c2ff43fbbd07 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -337,11 +337,29 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) kfree(tmp); return err; } else { + int ret; + /* * For 64-bit frames and 32-bit fsave frames, restore the user * state to the registers directly (with exceptions handled). 
*/ - if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { + if (use_xsave()) { + if ((unsigned long)buf_fx % 64 || fx_only) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_user_to_fxregs(buf_fx); + } else { + u64 init_bv = xfeatures_mask & ~xfeatures; + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_user_to_xregs(buf_fx, xfeatures); + } + } else if (use_fxsr()) { + ret = copy_user_to_fxregs(buf_fx); + } else + ret = copy_user_to_fregs(buf_fx); + + if (ret) { fpu__clear(fpu); return -1; } -- cgit v1.2.3-59-g8ed1b From 258f250fc5f7f11dec00fc267a0424dd2a0fa8f4 Mon Sep 17 00:00:00 2001 From: Honghui Zhang Date: Mon, 18 Mar 2019 16:27:55 +0800 Subject: arm64: dts: mt2712: Remove unused property for PCIe The "num-lanes" property for PCIe is not used; remove it. Signed-off-by: Honghui Zhang Signed-off-by: Lorenzo Pieralisi --- arch/arm64/boot/dts/mediatek/mt2712e.dtsi | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/mediatek/mt2712e.dtsi b/arch/arm64/boot/dts/mediatek/mt2712e.dtsi index 976d92a94738..43307bad3f0d 100644 --- a/arch/arm64/boot/dts/mediatek/mt2712e.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt2712e.dtsi @@ -819,7 +819,6 @@ #size-cells = <2>; #interrupt-cells = <1>; ranges; - num-lanes = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &pcie_intc0 0>, <0 0 0 2 &pcie_intc0 1>, @@ -840,7 +839,6 @@ #size-cells = <2>; #interrupt-cells = <1>; ranges; - num-lanes = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &pcie_intc1 0>, <0 0 0 2 &pcie_intc1 1>, -- cgit v1.2.3-59-g8ed1b From 926b21f37b072ae4c117052de45a975c6d468fec Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:50 +0200 Subject: x86/fpu: Restore from kernel memory on the 64-bit path too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 64-bit case (both 64-bit and 32-bit frames) loads the new state from user memory. However, doing this is not desired if the FPU state is going to be restored on return to userland: it would be required to disable preemption in order to avoid a context switch which would set TIF_NEED_FPU_LOAD. If this happens before the restore operation then the loaded registers would become volatile. Furthermore, disabling preemption while accessing user memory requires disabling the pagefault handler. An error during FXRSTOR would then mean that either a page fault occurred (and it would have to be retried with the page fault handler enabled) or a #GP occurred because the xstate is bogus (after all, the signal handler can modify it). In order to avoid that mess, copy the FPU state from userland, validate it and then load it. The copy_kernel_…() helpers are basically just like the old helpers except that they operate on kernel memory and the fault handler just sets the error value and the caller handles it. copy_user_to_fpregs_zeroing() and its helpers remain and will be used later for a fastpath optimisation. [ bp: Clarify commit message. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A.
Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-22-bigeasy@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 43 +++++++++++++++++++++++++ arch/x86/kernel/fpu/signal.c | 62 +++++++++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index da75d7b3e37d..2cf04fbcba5d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -121,6 +121,21 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ @@ -149,6 +164,14 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } +static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) @@ -162,6 +185,11 @@ static inline void copy_kernel_to_fregs(struct fregs_state *fx) kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } +static inline int copy_kernel_to_fregs_err(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + static inline int copy_user_to_fregs(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -361,6 +389,21 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) return err; } +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + /* * These must be called with preempt disabled. Returns * 'true' if the FPU state is still intact and we can diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index c2ff43fbbd07..9ea1eaa4c9b1 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -234,7 +234,8 @@ sanitize_restored_xstate(union fpregs_state *state, */ xsave->i387.mxcsr &= mxcsr_feature_mask; - convert_to_fxsr(&state->fxsave, ia32_env); + if (ia32_env) + convert_to_fxsr(&state->fxsave, ia32_env); } } @@ -337,28 +338,63 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) kfree(tmp); return err; } else { + union fpregs_state *state; + void *tmp; int ret; + tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + state = PTR_ALIGN(tmp, 64); + /* * For 64-bit frames and 32-bit fsave frames, restore the user * state to the registers directly (with exceptions handled). 
*/ - if (use_xsave()) { - if ((unsigned long)buf_fx % 64 || fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_user_to_fxregs(buf_fx); + if ((unsigned long)buf_fx % 64) + fx_only = 1; + + if (use_xsave() && !fx_only) { + u64 init_bv = xfeatures_mask & ~xfeatures; + + if (using_compacted_format()) { + ret = copy_user_to_xstate(&state->xsave, buf_fx); } else { - u64 init_bv = xfeatures_mask & ~xfeatures; - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_user_to_xregs(buf_fx, xfeatures); + ret = __copy_from_user(&state->xsave, buf_fx, state_size); + + if (!ret && state_size > offsetof(struct xregs_state, header)) + ret = validate_xstate_header(&state->xsave.header); } + if (ret) + goto err_out; + + sanitize_restored_xstate(state, NULL, xfeatures, fx_only); + + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures); + } else if (use_fxsr()) { - ret = copy_user_to_fxregs(buf_fx); - } else - ret = copy_user_to_fregs(buf_fx); + ret = __copy_from_user(&state->fxsave, buf_fx, state_size); + if (ret) + goto err_out; + if (use_xsave()) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + } + state->fxsave.mxcsr &= mxcsr_feature_mask; + + ret = copy_kernel_to_fxregs_err(&state->fxsave); + } else { + ret = __copy_from_user(&state->fsave, buf_fx, state_size); + if (ret) + goto err_out; + ret = copy_kernel_to_fregs_err(&state->fsave); + } + +err_out: + kfree(tmp); if (ret) { fpu__clear(fpu); return -1; -- cgit v1.2.3-59-g8ed1b From c2ff9e9a3d9d6c019394a22989a228d02970a8b1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:51 +0200 Subject: x86/fpu: Merge the two code paths in __fpu__restore_sig() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ia32_fxstate case (32bit with fxsr) and the other (64bit frames or 32bit frames without fxsr) restore both from kernel memory and sanitize the content. The !ia32_fxstate version restores missing xstates from "init state" while the ia32_fxstate doesn't and skips it. Merge the two code paths and keep the !ia32_fxstate one. Copy only the user_i387_ia32_struct data structure in the ia32_fxstate. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-23-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 139 +++++++++++++++++-------------------------- 1 file changed, 54 insertions(+), 85 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9ea1eaa4c9b1..b13e86b29426 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -263,12 +263,17 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) { + struct user_i387_ia32_struct *envp = NULL; + int state_size = fpu_kernel_xstate_size; int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - int state_size = fpu_kernel_xstate_size; + struct user_i387_ia32_struct env; + union fpregs_state *state; u64 xfeatures = 0; int fx_only = 0; + int ret = 0; + void *tmp; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -303,105 +308,69 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } } + tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + state = PTR_ALIGN(tmp, 64); + + if ((unsigned long)buf_fx % 64) + fx_only = 1; + + /* + * For 32-bit frames with fxstate, copy the fxstate so it can be + * reconstructed later. + */ if (ia32_fxstate) { - /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Validate and sanitize the copied state. - */ - struct user_i387_ia32_struct env; - union fpregs_state *state; - int err = 0; - void *tmp; + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto err_out; + envp = &env; + } - tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - state = PTR_ALIGN(tmp, 64); + if (use_xsave() && !fx_only) { + u64 init_bv = xfeatures_mask & ~xfeatures; if (using_compacted_format()) { - err = copy_user_to_xstate(&state->xsave, buf_fx); + ret = copy_user_to_xstate(&state->xsave, buf_fx); } else { - err = __copy_from_user(&state->xsave, buf_fx, state_size); + ret = __copy_from_user(&state->xsave, buf_fx, state_size); - if (!err && state_size > offsetof(struct xregs_state, header)) - err = validate_xstate_header(&state->xsave.header); + if (!ret && state_size > offsetof(struct xregs_state, header)) + ret = validate_xstate_header(&state->xsave.header); } + if (ret) + goto err_out; - if (err || __copy_from_user(&env, buf, sizeof(env))) { - err = -1; - } else { - sanitize_restored_xstate(state, &env, xfeatures, fx_only); - copy_kernel_to_fpregs(state); - } - - kfree(tmp); - return err; - } else { - union fpregs_state *state; - void *tmp; - int ret; - - tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - state = PTR_ALIGN(tmp, 64); - - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). 
- */ - if ((unsigned long)buf_fx % 64) - fx_only = 1; - - if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; - - if (using_compacted_format()) { - ret = copy_user_to_xstate(&state->xsave, buf_fx); - } else { - ret = __copy_from_user(&state->xsave, buf_fx, state_size); - - if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&state->xsave.header); - } - if (ret) - goto err_out; - - sanitize_restored_xstate(state, NULL, xfeatures, fx_only); - - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures); + sanitize_restored_xstate(state, envp, xfeatures, fx_only); - } else if (use_fxsr()) { - ret = __copy_from_user(&state->fxsave, buf_fx, state_size); - if (ret) - goto err_out; + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures); - if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - } - state->fxsave.mxcsr &= mxcsr_feature_mask; + } else if (use_fxsr()) { + ret = __copy_from_user(&state->fxsave, buf_fx, state_size); + if (ret) + goto err_out; - ret = copy_kernel_to_fxregs_err(&state->fxsave); - } else { - ret = __copy_from_user(&state->fsave, buf_fx, state_size); - if (ret) - goto err_out; - ret = copy_kernel_to_fregs_err(&state->fsave); + sanitize_restored_xstate(state, envp, xfeatures, fx_only); + if (use_xsave()) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } -err_out: - kfree(tmp); - if (ret) { - fpu__clear(fpu); - return -1; - } + ret = copy_kernel_to_fxregs_err(&state->fxsave); + } else { + ret = __copy_from_user(&state->fsave, buf_fx, state_size); + if (ret) + goto err_out; + ret = copy_kernel_to_fregs_err(&state->fsave); } - return 0; +err_out: + kfree(tmp); + if (ret) + fpu__clear(fpu); + return ret; } static inline int xstate_sigframe_size(void) -- cgit v1.2.3-59-g8ed1b From 5f409e20b794565e2d60ad333e79334630a6c798 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 3 Apr 2019 18:41:52 +0200 Subject: x86/fpu: Defer FPU state load until return to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defer loading of FPU state until return to userspace. This gives the kernel the potential to skip loading FPU state for tasks that stay in kernel mode, or for tasks that end up with repeated invocations of kernel_fpu_begin() & kernel_fpu_end(). The fpregs_lock/unlock() section ensures that the registers remain unchanged. Otherwise a context switch or a bottom half could save the registers to its FPU context and the processor's FPU registers would become random if modified at the same time. KVM swaps the host/guest registers on the entry/exit path. This flow has been kept as is. First it ensures that the registers are loaded and then saves the current (host) state before it loads the guest's registers. The swap is done at the very end with disabled interrupts so it should not change anymore before the guest is entered. The read/save version seems to be cheaper compared to memcpy() in a micro benchmark. Each thread gets TIF_NEED_FPU_LOAD set as part of fork() / fpu__copy(). For kernel threads, this flag never gets cleared, which avoids saving / restoring the FPU state for kernel threads and during in-kernel usage of the FPU registers.
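The deferral in miniature (a hypothetical sketch with invented names, not kernel code): state is saved once when its owner is switched out, a need-load flag is set, and the expensive reload happens only at the boundary where the state is actually consumed, so intervening switches or in-kernel clobbers add no extra cost.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct task_state {
	char regs[64];		/* stands in for the CPU's FPU register file */
	char saved[64];		/* stands in for the task's fpu->state       */
	bool need_load;		/* stands in for TIF_NEED_FPU_LOAD           */
};

/* Models switch_fpu_prepare(): save the live registers once and mark
 * the task as needing a reload later. */
static void switch_out(struct task_state *t)
{
	if (!t->need_load) {
		memcpy(t->saved, t->regs, sizeof(t->saved));
		t->need_load = true;
	}
}

/* Models switch_fpu_return(): reload only at the point of use. */
static void return_to_user(struct task_state *t)
{
	if (t->need_load) {
		memcpy(t->regs, t->saved, sizeof(t->regs));
		t->need_load = false;
	}
}

int main(void)
{
	struct task_state t = { .regs = "live state" };

	switch_out(&t);
	memset(t.regs, 0, sizeof(t.regs));	/* in-kernel FPU use clobbers */
	return_to_user(&t);
	printf("%s\n", t.regs);			/* prints "live state" */
	return 0;
}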
[ bp: Correct and update commit message and fix checkpatch warnings. s/register/registers/ where it is used in plural. minor comment corrections. remove unused trace_x86_fpu_activate_state() TP. ] Signed-off-by: Rik van Riel Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Aubrey Li Cc: Babu Moger Cc: "Chang S. Bae" Cc: Dmitry Safonov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: Joerg Roedel Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Nicolai Stange Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Tim Chen Cc: Waiman Long Cc: x86-ml Cc: Yi Wang Link: https://lkml.kernel.org/r/20190403164156.19645-24-bigeasy@linutronix.de --- arch/x86/entry/common.c | 10 +++- arch/x86/include/asm/fpu/api.h | 22 +++++++- arch/x86/include/asm/fpu/internal.h | 27 +++++---- arch/x86/include/asm/trace/fpu.h | 10 ++-- arch/x86/kernel/fpu/core.c | 106 +++++++++++++++++++++++++++--------- arch/x86/kernel/fpu/signal.c | 49 ++++++++++------- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 5 +- arch/x86/kernel/process_64.c | 5 +- arch/x86/kvm/x86.c | 20 +++++-- 10 files changed, 184 insertions(+), 72 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7bc105f47d21..51beb8d29123 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -25,12 +25,13 @@ #include #include #include +#include #include #include #include -#include #include +#include #define CREATE_TRACE_POINTS #include @@ -196,6 +197,13 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); + /* Reload ti->flags; we may have rescheduled above. */ + cached_flags = READ_ONCE(ti->flags); + + fpregs_assert_state_consistent(); + if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + #ifdef CONFIG_COMPAT /* * Compat syscalls set TS_COMPAT. Make sure we clear it before diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 73e684160f35..b774c52e5411 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,7 +10,7 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H -#include +#include /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It @@ -22,17 +22,37 @@ extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); +extern void fpregs_mark_activate(void); +/* + * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * A context switch will (and softirq might) save CPU's FPU registers to + * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * a random state. + */ static inline void fpregs_lock(void) { preempt_disable(); + local_bh_disable(); } static inline void fpregs_unlock(void) { + local_bh_enable(); preempt_enable(); } +#ifdef CONFIG_X86_DEBUG_FPU +extern void fpregs_assert_state_consistent(void); +#else +static inline void fpregs_assert_state_consistent(void) { } +#endif + +/* + * Load the task FPU state before returning to userspace. + */ +extern void switch_fpu_return(void); + /* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. 
* diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2cf04fbcba5d..0c8a5093647a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -30,7 +30,7 @@ extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); -extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); +extern int fpu__copy(struct task_struct *dst, struct task_struct *src); extern void fpu__clear(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); @@ -531,13 +531,20 @@ static inline void fpregs_activate(struct fpu *fpu) /* * Internal helper, do not use directly. Use switch_fpu_return() instead. */ -static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) +static inline void __fpregs_load_activate(void) { + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->mm == NULL)) + return; + if (!fpregs_state_valid(fpu, cpu)) { - if (current->mm) - copy_kernel_to_fpregs(&fpu->state); + copy_kernel_to_fpregs(&fpu->state); fpregs_activate(fpu); + fpu->last_cpu = cpu; } + clear_thread_flag(TIF_NEED_FPU_LOAD); } /* @@ -548,8 +555,8 @@ static inline void __fpregs_load_activate(struct fpu *fpu, int cpu) * - switch_fpu_prepare() saves the old state. * This is done within the context of the old process. * - * - switch_fpu_finish() restores the new state as - * necessary. + * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state + * will get loaded on return to userspace, or when the kernel needs it. * * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers * are saved in the current thread's FPU register state. @@ -581,10 +588,10 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ /* - * Set up the userspace FPU context for the new task, if the task - * has used the FPU. + * Load PKRU from the FPU context if available. Delay loading of the + * complete FPU state until the return to userland. 
*/ -static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) +static inline void switch_fpu_finish(struct fpu *new_fpu) { u32 pkru_val = init_pkru_value; struct pkru_state *pk; @@ -592,7 +599,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) if (!static_cpu_has(X86_FEATURE_FPU)) return; - __fpregs_load_activate(new_fpu, cpu); + set_thread_flag(TIF_NEED_FPU_LOAD); if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index bd65f6ba950f..879b77792f94 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -13,19 +13,22 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) + __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; + __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, + __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) @@ -61,11 +64,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 739ca3ae2bdc..ce243f76bdb7 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -102,23 +102,20 @@ static void __kernel_fpu_begin(void) kernel_fpu_disable(); if (current->mm) { - /* - * Ignore return value -- we don't care if reg state - * is clobbered. - */ - copy_fpregs_to_fpstate(fpu); - } else { - __cpu_invalidate_fpregs_state(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ + copy_fpregs_to_fpstate(fpu); + } } + __cpu_invalidate_fpregs_state(); } static void __kernel_fpu_end(void) { - struct fpu *fpu = ¤t->thread.fpu; - - if (current->mm) - copy_kernel_to_fpregs(&fpu->state); - kernel_fpu_enable(); } @@ -145,14 +142,17 @@ void fpu__save(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - preempt_disable(); + fpregs_lock(); trace_x86_fpu_before_save(fpu); - if (!copy_fpregs_to_fpstate(fpu)) - copy_kernel_to_fpregs(&fpu->state); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + if (!copy_fpregs_to_fpstate(fpu)) { + copy_kernel_to_fpregs(&fpu->state); + } + } trace_x86_fpu_after_save(fpu); - preempt_enable(); + fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu__save); @@ -185,8 +185,11 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct task_struct *dst, struct task_struct *src) { + struct fpu *dst_fpu = &dst->thread.fpu; + struct fpu *src_fpu = &src->thread.fpu; + dst_fpu->last_cpu = -1; if (!static_cpu_has(X86_FEATURE_FPU)) @@ -201,16 +204,23 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* - * Save current FPU registers directly into the child - * FPU context, without any memory-to-memory copying. + * If the FPU registers are not current just memcpy() the state. 
+ * Otherwise save current FPU registers directly into the child's FPU + * context, without any memory-to-memory copying. * * ( The function 'fails' in the FNSAVE case, which destroys - * register contents so we have to copy them back. ) + * register contents so we have to load them back. ) */ - if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - copy_kernel_to_fpregs(&src_fpu->state); - } + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); + + else if (!copy_fpregs_to_fpstate(dst_fpu)) + copy_kernel_to_fpregs(&dst_fpu->state); + + fpregs_unlock(); + + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -226,10 +236,9 @@ static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); + set_thread_flag(TIF_NEED_FPU_LOAD); fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); } /* @@ -308,6 +317,8 @@ void fpu__drop(struct fpu *fpu) */ static inline void copy_init_fpstate_to_fpregs(void) { + fpregs_lock(); + if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, -1); else if (static_cpu_has(X86_FEATURE_FXSR)) @@ -317,6 +328,9 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); + + fpregs_mark_activate(); + fpregs_unlock(); } /* @@ -339,6 +353,46 @@ void fpu__clear(struct fpu *fpu) copy_init_fpstate_to_fpregs(); } +/* + * Load FPU context before returning to userspace. + */ +void switch_fpu_return(void) +{ + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + __fpregs_load_activate(); +} +EXPORT_SYMBOL_GPL(switch_fpu_return); + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * If current FPU state according to its tracking (loaded FPU context on this + * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is + * loaded on return to userland. + */ +void fpregs_assert_state_consistent(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + return; + + WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); +} +EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); +#endif + +void fpregs_mark_activate(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_activate(fpu); + fpu->last_cpu = smp_processor_id(); + clear_thread_flag(TIF_NEED_FPU_LOAD); +} +EXPORT_SYMBOL_GPL(fpregs_mark_activate); + /* * x87 math exception handling: */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b13e86b29426..6df1f15e0cd5 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -269,11 +269,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; - union fpregs_state *state; u64 xfeatures = 0; int fx_only = 0; int ret = 0; - void *tmp; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -308,14 +306,18 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } } - tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - state = PTR_ALIGN(tmp, 64); + /* + * The current state of the FPU registers does not matter. 
By setting + * TIF_NEED_FPU_LOAD unconditionally it is ensured that our xstate + * is not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). + */ + set_thread_flag(TIF_NEED_FPU_LOAD); + __fpu_invalidate_fpregs_state(fpu); if ((unsigned long)buf_fx % 64) fx_only = 1; - /* * For 32-bit frames with fxstate, copy the fxstate so it can be * reconstructed later. @@ -331,43 +333,52 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) u64 init_bv = xfeatures_mask & ~xfeatures; if (using_compacted_format()) { - ret = copy_user_to_xstate(&state->xsave, buf_fx); + ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); } else { - ret = __copy_from_user(&state->xsave, buf_fx, state_size); + ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&state->xsave.header); + ret = validate_xstate_header(&fpu->state.xsave.header); } if (ret) goto err_out; - sanitize_restored_xstate(state, envp, xfeatures, fx_only); + sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + fpregs_lock(); if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures); + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); } else if (use_fxsr()) { - ret = __copy_from_user(&state->fxsave, buf_fx, state_size); - if (ret) + ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); + if (ret) { + ret = -EFAULT; goto err_out; + } + + sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); - sanitize_restored_xstate(state, envp, xfeatures, fx_only); + fpregs_lock(); if (use_xsave()) { u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } - ret = copy_kernel_to_fxregs_err(&state->fxsave); + ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); } else { - ret = __copy_from_user(&state->fsave, buf_fx, state_size); + ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); if (ret) goto err_out; - ret = copy_kernel_to_fregs_err(&state->fsave); + + fpregs_lock(); + ret = copy_kernel_to_fregs_err(&fpu->state.fsave); } + if (!ret) + fpregs_mark_activate(); + fpregs_unlock(); err_out: - kfree(tmp); if (ret) fpu__clear(fpu); return ret; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ac7be52c7a..b9b8e6eb74f2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -101,7 +101,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.vm86 = NULL; #endif - return fpu__copy(&dst->thread.fpu, &src->thread.fpu); + return fpu__copy(dst, src); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 77d9eb43ccac..1bc47f3a4885 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - switch_fpu_prepare(prev_fpu, cpu); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs.
No need to save %fs, as it was saved on the @@ -290,7 +291,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu, cpu); + switch_fpu_finish(next_fpu); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ffea7c557963..37b2ecef041e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -520,7 +520,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); - switch_fpu_prepare(prev_fpu, cpu); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -572,7 +573,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu, cpu); + switch_fpu_finish(next_fpu); /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8022e7769b3a..e340c3c0cba3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7877,6 +7877,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) wait_lapic_expire(vcpu); guest_enter_irqoff(); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -8137,22 +8141,30 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); + fpregs_lock(); + copy_fpregs_to_fpstate(¤t->thread.fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); + fpregs_lock(); + copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); copy_kernel_to_fpregs(¤t->thread.fpu.state); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } -- cgit v1.2.3-59-g8ed1b From 1d731e731c4cd7cbd3b1aa295f0932e7610da82f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:53 +0200 Subject: x86/fpu: Add a fastpath to __fpu__restore_sig() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commits refactor the restoration of the FPU registers so that they can be loaded from in-kernel memory. This overhead can be avoided if the load can be performed without a pagefault. Attempt to restore FPU registers by invoking copy_user_to_fpregs_zeroing(). If it fails try the slowpath which can handle pagefaults. [ bp: Add a comment over the fastpath to be able to find one's way around the function. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. 
Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-25-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 6df1f15e0cd5..a1bd7be70206 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -242,10 +242,10 @@ sanitize_restored_xstate(union fpregs_state *state, /* * Restore the extended state if present. Otherwise, restore the FP/SSE state. */ -static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { + if (fx_only) { u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); @@ -327,8 +327,27 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (ret) goto err_out; envp = &env; + } else { + /* + * Attempt to restore the FPU registers directly from user + * memory. For that to succeed, the user access cannot cause + * page faults. If it does, fall back to the slow path below, + * going through the kernel buffer with the enabled pagefault + * handler. + */ + fpregs_lock(); + pagefault_disable(); + ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); + pagefault_enable(); + if (!ret) { + fpregs_mark_activate(); + fpregs_unlock(); + return 0; + } + fpregs_unlock(); } + if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask & ~xfeatures; -- cgit v1.2.3-59-g8ed1b From da2f32fb8dc7cbd9433cb2e990693734b30a2465 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:54 +0200 Subject: x86/fpu: Add a fastpath to copy_fpstate_to_sigframe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Try to save the FPU registers directly to the userland stack frame if the CPU holds the FPU registers for the current task. This has to be done with the pagefault disabled because we can't fault (while the FPU registers are locked) and therefore the operation might fail. If it fails try the slowpath which can handle faults. [ bp: Massage a bit. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-26-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a1bd7be70206..3c3167576216 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -144,8 +144,10 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Save the state to task's fpu->state and then copy it to the user frame - * pointed to by the aligned pointer 'buf_fx'. + * Try to save it directly to the user frame with disabled page fault handler. 
+ * If this fails then do the slow path where the FPU state is first saved to + * task's fpu->state and then copy it to the user frame pointed to by the + * aligned pointer 'buf_fx'. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -159,6 +161,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); + int ret = -EFAULT; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -173,23 +176,30 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) /* * If we do not need to load the FPU registers at return to userspace - * then the CPU has the current state and we need to save it. Otherwise, - * it has already been done and we can skip it. + * then the CPU has the current state. Try to save it directly to + * userland's stack frame if it does not cause a pagefault. If it does, + * try the slowpath. */ fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - copy_fpregs_to_fpstate(fpu); + pagefault_disable(); + ret = copy_fpregs_to_sigframe(buf_fx); + pagefault_enable(); + if (ret) + copy_fpregs_to_fpstate(fpu); set_thread_flag(TIF_NEED_FPU_LOAD); } fpregs_unlock(); - if (using_compacted_format()) { - if (copy_xstate_to_user(buf_fx, xsave, 0, size)) - return -1; - } else { - fpstate_sanitize_xstate(fpu); - if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) - return -1; + if (ret) { + if (using_compacted_format()) { + if (copy_xstate_to_user(buf_fx, xsave, 0, size)) + return -1; + } else { + fpstate_sanitize_xstate(fpu); + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) + return -1; + } } /* Save the fsave header for the 32-bit frames. */ -- cgit v1.2.3-59-g8ed1b From 16b22f85bca246ace984209e9f22af2161450cbd Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 12 Apr 2019 11:04:02 -0700 Subject: Revert "MIPS: ralink: fix cpu clock of mt7621 and add dt clk devices" Commit e6046b5e69a0 ("MIPS: ralink: fix cpu clock of mt7621 and add dt clk devices") includes a file that doesn't exist, causing build failures... Revert it. 
References: https://lore.kernel.org/linux-mips/CAJsYDVJvviz8a2oVmb0XL3OB+=Eecu-3kC9T9vsmxpuC_BqDSA@mail.gmail.com/ Signed-off-by: Paul Burton --- arch/mips/include/asm/mach-ralink/mt7621.h | 20 ------ arch/mips/ralink/mt7621.c | 102 +++++++++-------------------- arch/mips/ralink/timer-gic.c | 4 +- 3 files changed, 33 insertions(+), 93 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/mach-ralink/mt7621.h b/arch/mips/include/asm/mach-ralink/mt7621.h index b8a8834164c8..a672e06fa5fd 100644 --- a/arch/mips/include/asm/mach-ralink/mt7621.h +++ b/arch/mips/include/asm/mach-ralink/mt7621.h @@ -19,10 +19,6 @@ #define SYSC_REG_CHIP_REV 0x0c #define SYSC_REG_SYSTEM_CONFIG0 0x10 #define SYSC_REG_SYSTEM_CONFIG1 0x14 -#define SYSC_REG_CLKCFG0 0x2c -#define SYSC_REG_CUR_CLK_STS 0x44 - -#define MEMC_REG_CPU_PLL 0x648 #define CHIP_REV_PKG_MASK 0x1 #define CHIP_REV_PKG_SHIFT 16 @@ -30,22 +26,6 @@ #define CHIP_REV_VER_SHIFT 8 #define CHIP_REV_ECO_MASK 0xf -#define XTAL_MODE_SEL_MASK 0x7 -#define XTAL_MODE_SEL_SHIFT 6 - -#define CPU_CLK_SEL_MASK 0x3 -#define CPU_CLK_SEL_SHIFT 30 - -#define CUR_CPU_FDIV_MASK 0x1f -#define CUR_CPU_FDIV_SHIFT 8 -#define CUR_CPU_FFRAC_MASK 0x1f -#define CUR_CPU_FFRAC_SHIFT 0 - -#define CPU_PLL_PREDIV_MASK 0x3 -#define CPU_PLL_PREDIV_SHIFT 12 -#define CPU_PLL_FBDIV_MASK 0x7f -#define CPU_PLL_FBDIV_SHIFT 4 - #define MT7621_DRAM_BASE 0x0 #define MT7621_DDR2_SIZE_MIN 32 #define MT7621_DDR2_SIZE_MAX 256 diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index 6828eefb0a85..d2718de60b9b 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -9,22 +9,22 @@ #include #include -#include -#include -#include -#include #include #include #include #include #include -#include #include #include "common.h" +#define SYSC_REG_SYSCFG 0x10 +#define SYSC_REG_CPLL_CLKCFG0 0x2c +#define SYSC_REG_CUR_CLK_STS 0x44 +#define CPU_CLK_SEL (BIT(30) | BIT(31)) + #define MT7621_GPIO_MODE_UART1 1 #define MT7621_GPIO_MODE_I2C 2 #define MT7621_GPIO_MODE_UART3_MASK 0x3 @@ -110,89 +110,49 @@ static struct rt2880_pmx_group mt7621_pinmux_data[] = { { 0 } }; -static struct clk *clks[MT7621_CLK_MAX]; -static struct clk_onecell_data clk_data = { - .clks = clks, - .clk_num = ARRAY_SIZE(clks), -}; - phys_addr_t mips_cpc_default_phys_base(void) { panic("Cannot detect cpc address"); } -static struct clk *__init mt7621_add_sys_clkdev( - const char *id, unsigned long rate) -{ - struct clk *clk; - int err; - - clk = clk_register_fixed_rate(NULL, id, NULL, 0, rate); - if (IS_ERR(clk)) - panic("failed to allocate %s clock structure", id); - - err = clk_register_clkdev(clk, id, NULL); - if (err) - panic("unable to register %s clock device", id); - - return clk; -} - void __init ralink_clk_init(void) { - const static u32 prediv_tbl[] = {0, 1, 2, 2}; - u32 syscfg, xtal_sel, clkcfg, clk_sel, curclk, ffiv, ffrac; - u32 pll, prediv, fbdiv; - u32 xtal_clk, cpu_clk, bus_clk; - - syscfg = rt_sysc_r32(SYSC_REG_SYSTEM_CONFIG0); - xtal_sel = (syscfg >> XTAL_MODE_SEL_SHIFT) & XTAL_MODE_SEL_MASK; + int cpu_fdiv = 0; + int cpu_ffrac = 0; + int fbdiv = 0; + u32 clk_sts, syscfg; + u8 clk_sel = 0, xtal_mode; + u32 cpu_clk; - clkcfg = rt_sysc_r32(SYSC_REG_CLKCFG0); - clk_sel = (clkcfg >> CPU_CLK_SEL_SHIFT) & CPU_CLK_SEL_MASK; - - curclk = rt_sysc_r32(SYSC_REG_CUR_CLK_STS); - ffiv = (curclk >> CUR_CPU_FDIV_SHIFT) & CUR_CPU_FDIV_MASK; - ffrac = (curclk >> CUR_CPU_FFRAC_SHIFT) & CUR_CPU_FFRAC_MASK; - - if (xtal_sel <= 2) - xtal_clk = 20 * 1000 * 1000; - else if (xtal_sel <= 5) - xtal_clk = 40 * 1000 
* 1000; - else - xtal_clk = 25 * 1000 * 1000; + if ((rt_sysc_r32(SYSC_REG_CPLL_CLKCFG0) & CPU_CLK_SEL) != 0) + clk_sel = 1; switch (clk_sel) { case 0: - cpu_clk = 500 * 1000 * 1000; + clk_sts = rt_sysc_r32(SYSC_REG_CUR_CLK_STS); + cpu_fdiv = ((clk_sts >> 8) & 0x1F); + cpu_ffrac = (clk_sts & 0x1F); + cpu_clk = (500 * cpu_ffrac / cpu_fdiv) * 1000 * 1000; break; + case 1: - pll = rt_memc_r32(MEMC_REG_CPU_PLL); - fbdiv = (pll >> CPU_PLL_FBDIV_SHIFT) & CPU_PLL_FBDIV_MASK; - prediv = (pll >> CPU_PLL_PREDIV_SHIFT) & CPU_PLL_PREDIV_MASK; - cpu_clk = ((fbdiv + 1) * xtal_clk) >> prediv_tbl[prediv]; + fbdiv = ((rt_sysc_r32(0x648) >> 4) & 0x7F) + 1; + syscfg = rt_sysc_r32(SYSC_REG_SYSCFG); + xtal_mode = (syscfg >> 6) & 0x7; + if (xtal_mode >= 6) { + /* 25Mhz Xtal */ + cpu_clk = 25 * fbdiv * 1000 * 1000; + } else if (xtal_mode >= 3) { + /* 40Mhz Xtal */ + cpu_clk = 40 * fbdiv * 1000 * 1000; + } else { + /* 20Mhz Xtal */ + cpu_clk = 20 * fbdiv * 1000 * 1000; + } break; - default: - cpu_clk = xtal_clk; } - - cpu_clk = cpu_clk / ffiv * ffrac; - bus_clk = cpu_clk / 4; - - clks[MT7621_CLK_CPU] = mt7621_add_sys_clkdev("cpu", cpu_clk); - clks[MT7621_CLK_BUS] = mt7621_add_sys_clkdev("bus", bus_clk); - - pr_info("CPU Clock: %dMHz\n", cpu_clk / 1000000); - mips_hpt_frequency = cpu_clk / 2; } -static void __init mt7621_clocks_init_dt(struct device_node *np) -{ - of_clk_add_provider(np, of_clk_src_onecell_get, &clk_data); -} - -CLK_OF_DECLARE(mt7621_clk, "mediatek,mt7621-pll", mt7621_clocks_init_dt); - void __init ralink_of_remap(void) { rt_sysc_membase = plat_of_remap_node("mtk,mt7621-sysc"); diff --git a/arch/mips/ralink/timer-gic.c b/arch/mips/ralink/timer-gic.c index 4fed842ae8eb..b5f07d21fcf2 100644 --- a/arch/mips/ralink/timer-gic.c +++ b/arch/mips/ralink/timer-gic.c @@ -11,14 +11,14 @@ #include #include -#include +#include #include "common.h" void __init plat_time_init(void) { ralink_of_remap(); - ralink_clk_init(); + of_clk_init(NULL); timer_probe(); } -- cgit v1.2.3-59-g8ed1b From 06b251dff78704c7d122bd109384d970a7dbe94d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 12 Apr 2019 20:16:15 +0200 Subject: x86/fpu: Restore regs in copy_fpstate_to_sigframe() in order to use the fastpath MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a task is scheduled out and receives a signal then it won't be able to take the fastpath because the registers aren't available. The slowpath is more expensive than XRSTOR + XSAVE, which usually succeeds. Here are some clock_gettime() numbers from a bigger box with AVX512 during bootup: - __fpregs_load_activate() takes 140ns - 350ns. If it was the most recent FPU context on the CPU then the optimisation in __fpregs_load_activate() will skip the load (which was disabled during the test). - copy_fpregs_to_sigframe() takes 200ns - 450ns if it succeeds. On a pagefault it is 1.8us - 3us, usually in the 2.6us area. - The slowpath takes 1.5us - 6us, usually in the 2.6us area. My testcases (including lat_sig) take the fastpath without __fpregs_load_activate(). I expect this to be the majority. Since the slowpath is in the >1us area it makes sense to load the registers and attempt to save them directly. The direct save may fail but should only happen on the first invocation or after fork() while the page is read-only. [ bp: Massage a bit. ]
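[ ed: For reference, the resulting flow of copy_fpstate_to_sigframe() is condensed here from the diff that follows; this is a sketch for readability, not the verbatim kernel code:

	fpregs_lock();
	if (test_thread_flag(TIF_NEED_FPU_LOAD))
		__fpregs_load_activate();	/* make the registers valid again */

	pagefault_disable();
	ret = copy_fpregs_to_sigframe(buf_fx);	/* fastpath: save straight to the user stack */
	pagefault_enable();

	if (ret && !test_thread_flag(TIF_NEED_FPU_LOAD))
		copy_fpregs_to_fpstate(fpu);	/* faulted: stash the state for the slowpath */
	set_thread_flag(TIF_NEED_FPU_LOAD);
	fpregs_unlock();
]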
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-27-bigeasy@linutronix.de --- arch/x86/kernel/fpu/signal.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3c3167576216..7026f1c4e5e3 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -175,20 +175,21 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) (struct _fpstate_32 __user *) buf) ? -1 : 1; /* - * If we do not need to load the FPU registers at return to userspace - * then the CPU has the current state. Try to save it directly to - * userland's stack frame if it does not cause a pagefault. If it does, - * try the slowpath. + * Load the FPU registers if they are not valid for the current task. + * With a valid FPU state we can attempt to save the state directly to + * userland's stack frame which will likely succeed. If it does not, do + * the slowpath. */ fpregs_lock(); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - pagefault_disable(); - ret = copy_fpregs_to_sigframe(buf_fx); - pagefault_enable(); - if (ret) - copy_fpregs_to_fpstate(fpu); - set_thread_flag(TIF_NEED_FPU_LOAD); - } + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __fpregs_load_activate(); + + pagefault_disable(); + ret = copy_fpregs_to_sigframe(buf_fx); + pagefault_enable(); + if (ret && !test_thread_flag(TIF_NEED_FPU_LOAD)) + copy_fpregs_to_fpstate(fpu); + set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); if (ret) { -- cgit v1.2.3-59-g8ed1b From a5eff7259790d5314eff10563d6e59d358cce482 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Apr 2019 18:41:56 +0200 Subject: x86/pkeys: Add PKRU value to init_fpstate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The task's initial PKRU value is set partly by fpu__clear()/ copy_init_pkru_to_fpregs(). It is not part of init_fpstate.xsave and instead it is set explicitly. If the user removes the PKRU state from XSAVE in the signal handler then __fpu__restore_sig() will restore the missing bits from `init_fpstate' and initialize the PKRU value to 0. Add the `init_pkru_value' to `init_fpstate' so it is set to the init value in such a case. In theory copy_init_pkru_to_fpregs() could be removed because restoring the PKRU at return-to-userland should be enough.
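[ ed: Both hunks below share one pattern: look up the PKRU slot inside init_fpstate and seed it. A minimal sketch (the error handling differs per call site in the actual diff):

	struct pkru_state *pk;

	pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
	if (pk)
		pk->pkru = init_pkru_value;	/* the PKRU default now lives in init_fpstate */
]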
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andi Kleen Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Jason A. Donenfeld" Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Paolo Bonzini Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: x86-ml Link: https://lkml.kernel.org/r/20190403164156.19645-28-bigeasy@linutronix.de --- arch/x86/kernel/cpu/common.c | 5 +++++ arch/x86/mm/pkeys.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..352fa19e6311 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -372,6 +372,8 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { + struct pkru_state *pk; + /* check the boot processor, plus compile options for PKU: */ if (!cpu_feature_enabled(X86_FEATURE_PKU)) return; @@ -382,6 +384,9 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) return; cr4_set_bits(X86_CR4_PKE); + pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); + if (pk) + pk->pkru = init_pkru_value; /* * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 2ecbf4155f98..1dcfc91c8f0c 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,6 +18,7 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ +#include /* init_fpstate */ int __execute_only_pkey(struct mm_struct *mm) { @@ -161,6 +162,7 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, static ssize_t init_pkru_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { + struct pkru_state *pk; char buf[32]; ssize_t len; u32 new_init_pkru; @@ -183,6 +185,10 @@ static ssize_t init_pkru_write_file(struct file *file, return -EINVAL; WRITE_ONCE(init_pkru_value, new_init_pkru); + pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); + if (!pk) + return -EINVAL; + pk->pkru = new_init_pkru; return count; } -- cgit v1.2.3-59-g8ed1b From cae5ec342645746d617dd420d206e1588d47768a Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 12 Apr 2019 17:50:57 -0400 Subject: x86/speculation/mds: Fix comment s/L1TF/MDS/ Signed-off-by: Boris Ostrovsky Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Reviewed-by: Josh Poimboeuf --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 22a14d4b68a2..0642505dda69 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -221,7 +221,7 @@ static void x86_amd_ssb_disable(void) #undef pr_fmt #define pr_fmt(fmt) "MDS: " fmt -/* Default mitigation for L1TF-affected CPUs */ +/* Default mitigation for MDS-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; static bool mds_nosmt __ro_after_init = false; -- cgit v1.2.3-59-g8ed1b From e2c3c94788b08891dcf3dbe608f9880523ecd71b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 12 Apr 2019 17:50:58 -0400 Subject: x86/speculation/mds: Print SMT vulnerable on MSBDS with mitigations off This code is only for CPUs which are affected by MSBDS, but are *not* affected by the other two MDS issues. For such CPUs, enabling the mds_idle_clear mitigation is enough to mitigate SMT.
However, if the user boots with 'mds=off' and still has SMT enabled, we should not report that SMT is mitigated: $ cat /sys/devices/system/cpu/vulnerabilities/mds Vulnerable; SMT mitigated But rather: Vulnerable; SMT vulnerable Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Reviewed-by: Tyler Hicks Reviewed-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20190412215118.294906495@localhost.localdomain --- arch/x86/kernel/cpu/bugs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0642505dda69..6b8a55c7cebc 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1204,7 +1204,8 @@ static ssize_t mds_show_state(char *buf) if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], - sched_smt_active() ? "mitigated" : "disabled"); + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : + sched_smt_active() ? "mitigated" : "disabled")); } return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], -- cgit v1.2.3-59-g8ed1b From ea094d53580f40c2124cef3d072b73b2425e7bfd Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Wed, 17 Apr 2019 09:18:50 -0500 Subject: x86/PCI: Fix PCI IRQ routing table memory leak In pcibios_irq_init(), the PCI IRQ routing table 'pirq_table' is first found through pirq_find_routing_table(). If the table is not found and CONFIG_PCI_BIOS is defined, the table is then allocated in pcibios_get_irq_routing_table() using kmalloc(). Later, if the I/O APIC is used, this table is actually not used. In that case, the allocated table is not freed, which is a memory leak. Free the allocated table if it is not used. Signed-off-by: Wenwen Wang [bhelgaas: added Ingo's reviewed-by, since the only change since v1 was to use the irq_routing_table local variable name he suggested] Signed-off-by: Bjorn Helgaas Reviewed-by: Ingo Molnar Acked-by: Thomas Gleixner --- arch/x86/pci/irq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 52e55108404e..d3a73f9335e1 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1119,6 +1119,8 @@ static const struct dmi_system_id pciirq_dmi_table[] __initconst = { void __init pcibios_irq_init(void) { + struct irq_routing_table *rtable = NULL; + DBG(KERN_DEBUG "PCI: IRQ init\n"); if (raw_pci_ops == NULL) @@ -1129,8 +1131,10 @@ void __init pcibios_irq_init(void) pirq_table = pirq_find_routing_table(); #ifdef CONFIG_PCI_BIOS - if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) { pirq_table = pcibios_get_irq_routing_table(); + rtable = pirq_table; + } #endif if (pirq_table) { pirq_peer_trick(); @@ -1145,8 +1149,10 @@ void __init pcibios_irq_init(void) * If we're using the I/O APIC, avoid using the PCI IRQ * routing table */ - if (io_apic_assign_pci_irqs) + if (io_apic_assign_pci_irqs) { + kfree(rtable); pirq_table = NULL; + } } x86_init.pci.fixup_irqs(); -- cgit v1.2.3-59-g8ed1b From 714c068228d3275da6d36f29e544f7e5ae648849 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 6 Feb 2019 15:12:27 +0100 Subject: mtd: nand: Clarify Kconfig entry for software BCH ECC algorithm There is no point in having two distinct entries; merge them and rename the symbol for more clarity: MTD_NAND_ECC_SW_BCH
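[ ed: With the two entries merged there is a single user-visible symbol, so a board enables software BCH correction with one defconfig line:

	CONFIG_MTD_NAND_ECC_SW_BCH=y

and code guards test the same name via IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH), as the omap2.c hunk below shows. ]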
Signed-off-by: Miquel Raynal --- arch/arm/configs/omap2plus_defconfig | 2 +- arch/arm/configs/pxa_defconfig | 2 +- arch/mips/configs/db1xxx_defconfig | 2 +- arch/mips/configs/generic/board-ni169445.config | 2 +- drivers/mtd/devices/Kconfig | 2 +- drivers/mtd/nand/raw/Kconfig | 9 ++------- drivers/mtd/nand/raw/Makefile | 2 +- drivers/mtd/nand/raw/nand_base.c | 2 +- drivers/mtd/nand/raw/omap2.c | 4 ++-- include/linux/mtd/nand_bch.h | 6 +++--- 10 files changed, 14 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 3f03ec6d2644..7dc2ac9175de 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -144,7 +144,7 @@ CONFIG_MTD_ONENAND=y CONFIG_MTD_ONENAND_VERIFY_WRITE=y CONFIG_MTD_ONENAND_OMAP2=y CONFIG_MTD_NAND=y -CONFIG_MTD_NAND_ECC_BCH=y +CONFIG_MTD_NAND_ECC_SW_BCH=y CONFIG_MTD_NAND_OMAP2=y CONFIG_MTD_NAND_OMAP_BCH=y CONFIG_MTD_SPI_NOR=m diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index d4654755b09c..d91adb49ae11 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -186,7 +186,7 @@ CONFIG_MTD_M25P80=m CONFIG_MTD_BLOCK2MTD=y CONFIG_MTD_DOCG3=m CONFIG_MTD_NAND=m -CONFIG_MTD_NAND_ECC_BCH=y +CONFIG_MTD_NAND_ECC_SW_BCH=y CONFIG_MTD_NAND_GPIO=m CONFIG_MTD_NAND_DISKONCHIP=m CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED=y diff --git a/arch/mips/configs/db1xxx_defconfig b/arch/mips/configs/db1xxx_defconfig index 34633b7611cb..c5a2c6f234d4 100644 --- a/arch/mips/configs/db1xxx_defconfig +++ b/arch/mips/configs/db1xxx_defconfig @@ -96,7 +96,7 @@ CONFIG_MTD_PHYSMAP=y CONFIG_MTD_M25P80=y CONFIG_MTD_SST25L=y CONFIG_MTD_NAND=y -CONFIG_MTD_NAND_ECC_BCH=y +CONFIG_MTD_NAND_ECC_SW_BCH=y CONFIG_MTD_NAND_AU1550=y CONFIG_MTD_NAND_PLATFORM=y CONFIG_MTD_SPI_NOR=y diff --git a/arch/mips/configs/generic/board-ni169445.config b/arch/mips/configs/generic/board-ni169445.config index f72223b366ca..01f6ab0d47d5 100644 --- a/arch/mips/configs/generic/board-ni169445.config +++ b/arch/mips/configs/generic/board-ni169445.config @@ -16,7 +16,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_NAND_ECC=y -CONFIG_MTD_NAND_ECC_BCH=y +CONFIG_MTD_NAND_ECC_SW_BCH=y CONFIG_MTD_NAND=y CONFIG_MTD_NAND_GPIO=y CONFIG_MTD_NAND_IDS=y diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig index aa983422aa97..f9258d666846 100644 --- a/drivers/mtd/devices/Kconfig +++ b/drivers/mtd/devices/Kconfig @@ -207,7 +207,7 @@ comment "Disk-On-Chip Device Drivers" config MTD_DOCG3 tristate "M-Systems Disk-On-Chip G3" select BCH - select BCH_CONST_PARAMS if !MTD_NAND_BCH + select BCH_CONST_PARAMS if !MTD_NAND_ECC_SW_BCH select BITREVERSE help This provides an MTD device driver for the M-Systems DiskOnChip diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index 80b5fec1f75a..7a1bde4fe9f6 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -22,14 +22,9 @@ menuconfig MTD_NAND if MTD_NAND -config MTD_NAND_BCH - tristate +config MTD_NAND_ECC_SW_BCH + tristate "Support software BCH ECC" select BCH - depends on MTD_NAND_ECC_BCH - default MTD_NAND - -config MTD_NAND_ECC_BCH - bool "Support software BCH ECC" default n help This enables support for software BCH error correction.
Binary BCH diff --git a/drivers/mtd/nand/raw/Makefile b/drivers/mtd/nand/raw/Makefile index 5552c4a9f2cf..9749ff00f9e4 100644 --- a/drivers/mtd/nand/raw/Makefile +++ b/drivers/mtd/nand/raw/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MTD_NAND) += nand.o obj-$(CONFIG_MTD_NAND_ECC) += nand_ecc.o -obj-$(CONFIG_MTD_NAND_BCH) += nand_bch.o +obj-$(CONFIG_MTD_NAND_ECC_SW_BCH) += nand_bch.o obj-$(CONFIG_MTD_SM_COMMON) += sm_common.o obj-$(CONFIG_MTD_NAND_CAFE) += cafe_nand.o diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 4646add22114..95fac92d6b82 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5087,7 +5087,7 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) return 0; case NAND_ECC_BCH: if (!mtd_nand_has_bch()) { - WARN(1, "CONFIG_MTD_NAND_ECC_BCH not enabled\n"); + WARN(1, "CONFIG_MTD_NAND_ECC_SW_BCH not enabled\n"); return -EINVAL; } ecc->calculate = nand_bch_calculate_ecc; diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c index 8f280a2962c8..a9a275342a41 100644 --- a/drivers/mtd/nand/raw/omap2.c +++ b/drivers/mtd/nand/raw/omap2.c @@ -1725,9 +1725,9 @@ static bool omap2_nand_ecc_check(struct omap_nand_info *info) break; } - if (ecc_needs_bch && !IS_ENABLED(CONFIG_MTD_NAND_ECC_BCH)) { + if (ecc_needs_bch && !IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH)) { dev_err(&info->pdev->dev, - "CONFIG_MTD_NAND_ECC_BCH not enabled\n"); + "CONFIG_MTD_NAND_ECC_SW_BCH not enabled\n"); return false; } if (ecc_needs_omap_bch && !IS_ENABLED(CONFIG_MTD_NAND_OMAP_BCH)) { diff --git a/include/linux/mtd/nand_bch.h b/include/linux/mtd/nand_bch.h index b8106651f807..a8a6909b594e 100644 --- a/include/linux/mtd/nand_bch.h +++ b/include/linux/mtd/nand_bch.h @@ -15,7 +15,7 @@ struct mtd_info; struct nand_chip; struct nand_bch_control; -#if defined(CONFIG_MTD_NAND_ECC_BCH) +#if IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_BCH) static inline int mtd_nand_has_bch(void) { return 1; } @@ -39,7 +39,7 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd); */ void nand_bch_free(struct nand_bch_control *nbc); -#else /* !CONFIG_MTD_NAND_ECC_BCH */ +#else /* !CONFIG_MTD_NAND_ECC_SW_BCH */ static inline int mtd_nand_has_bch(void) { return 0; } @@ -64,6 +64,6 @@ static inline struct nand_bch_control *nand_bch_init(struct mtd_info *mtd) static inline void nand_bch_free(struct nand_bch_control *nbc) {} -#endif /* CONFIG_MTD_NAND_ECC_BCH */ +#endif /* CONFIG_MTD_NAND_ECC_SW_BCH */ #endif /* __MTD_NAND_BCH_H__ */ -- cgit v1.2.3-59-g8ed1b From 9bb94643b94115990ffec18c8129f1ab970765c1 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 8 Feb 2019 08:48:37 +0100 Subject: mtd: nand: Clarify Kconfig entry for software Hamming ECC entries The software Hamming ECC correction implementation is referred to as MTD_NAND_ECC, which is too generic. Rename it MTD_NAND_ECC_SW_HAMMING. Also rename MTD_NAND_ECC_SMC, an SMC quirk in the Hamming implementation, to MTD_NAND_ECC_SW_HAMMING_SMC.
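[ ed: The helpers keep their signatures; only the option spelling changes. A minimal sketch of a caller threading the Smart Media byte order through the renamed symbol (example_check_sector() is a hypothetical helper modelled on the sm_ftl.c hunks below; 256 is the small-page sector size):

	static int example_check_sector(u8 *buf, u8 *stored_ecc)
	{
		u8 calc_ecc[3];

		__nand_calculate_ecc(buf, 256, calc_ecc,
				     IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC));
		if (__nand_correct_data(buf, stored_ecc, calc_ecc, 256,
					IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)) < 0)
			return -EIO;	/* uncorrectable */
		return 0;
	}
]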
Signed-off-by: Miquel Raynal --- arch/arm/configs/nhk8815_defconfig | 2 +- arch/mips/configs/generic/board-ni169445.config | 2 +- arch/powerpc/configs/85xx/tqm8548_defconfig | 2 +- drivers/mtd/Kconfig | 3 +-- drivers/mtd/nand/raw/Kconfig | 11 +++++------ drivers/mtd/nand/raw/Makefile | 2 +- drivers/mtd/nand/raw/nand_base.c | 2 +- drivers/mtd/sm_ftl.c | 12 ++++++------ drivers/mtd/tests/mtd_nandecctest.c | 14 +++++++------- 9 files changed, 24 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/arm/configs/nhk8815_defconfig b/arch/arm/configs/nhk8815_defconfig index 5f4c6aaa07f6..e0373989b32d 100644 --- a/arch/arm/configs/nhk8815_defconfig +++ b/arch/arm/configs/nhk8815_defconfig @@ -53,7 +53,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_ONENAND=y CONFIG_MTD_ONENAND_VERIFY_WRITE=y CONFIG_MTD_ONENAND_GENERIC=y -CONFIG_MTD_NAND_ECC_SMC=y +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y CONFIG_MTD_NAND=y CONFIG_MTD_NAND_FSMC=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/mips/configs/generic/board-ni169445.config b/arch/mips/configs/generic/board-ni169445.config index 01f6ab0d47d5..e68da2e9dc7c 100644 --- a/arch/mips/configs/generic/board-ni169445.config +++ b/arch/mips/configs/generic/board-ni169445.config @@ -15,7 +15,7 @@ CONFIG_MTD=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_NAND_ECC=y +CONFIG_MTD_NAND_ECC_SW_HAMMING=y CONFIG_MTD_NAND_ECC_SW_BCH=y CONFIG_MTD_NAND=y CONFIG_MTD_NAND_GPIO=y diff --git a/arch/powerpc/configs/85xx/tqm8548_defconfig b/arch/powerpc/configs/85xx/tqm8548_defconfig index 2697e4e8a761..464ce192bc14 100644 --- a/arch/powerpc/configs/85xx/tqm8548_defconfig +++ b/arch/powerpc/configs/85xx/tqm8548_defconfig @@ -35,7 +35,7 @@ CONFIG_MTD=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND_ECC_SMC=y +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y CONFIG_MTD_NAND=y CONFIG_MTD_NAND_FSL_UPM=y CONFIG_BLK_DEV_LOOP=y diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 79a8ff542883..aa5a27fdfdd1 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -230,12 +230,11 @@ config SSFDC This enables read only access to SmartMedia formatted NAND flash. You can mount it with FAT file system. - config SM_FTL tristate "SmartMedia/xD new translation layer" depends on BLOCK select MTD_BLKDEVS - select MTD_NAND_ECC + select MTD_NAND_ECC_SW_HAMMING help This enables EXPERIMENTAL R/W support for SmartMedia/xD FTL (Flash translation layer). diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index 7a1bde4fe9f6..b521e9862e53 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -1,20 +1,19 @@ -config MTD_NAND_ECC +config MTD_NAND_ECC_SW_HAMMING tristate -config MTD_NAND_ECC_SMC +config MTD_NAND_ECC_SW_HAMMING_SMC bool "NAND ECC Smart Media byte order" - depends on MTD_NAND_ECC + depends on MTD_NAND_ECC_SW_HAMMING default n help Software ECC according to the Smart Media Specification. The original Linux implementation had byte 0 and 1 swapped. - menuconfig MTD_NAND tristate "Raw/Parallel NAND Device Support" depends on MTD - select MTD_NAND_ECC select MTD_NAND_CORE + select MTD_NAND_ECC_SW_HAMMING help This enables support for accessing all type of raw/parallel NAND flash devices. 
For further information see @@ -132,7 +131,7 @@ config MTD_NAND_S3C2410_DEBUG config MTD_NAND_NDFC tristate "NDFC NanD Flash Controller" depends on 4xx - select MTD_NAND_ECC_SMC + select MTD_NAND_ECC_SW_HAMMING_SMC help NDFC Nand Flash Controllers are integrated in IBM/AMCC's 4xx SoCs diff --git a/drivers/mtd/nand/raw/Makefile b/drivers/mtd/nand/raw/Makefile index 9749ff00f9e4..a022931318ac 100644 --- a/drivers/mtd/nand/raw/Makefile +++ b/drivers/mtd/nand/raw/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MTD_NAND) += nand.o -obj-$(CONFIG_MTD_NAND_ECC) += nand_ecc.o +obj-$(CONFIG_MTD_NAND_ECC_SW_HAMMING) += nand_ecc.o obj-$(CONFIG_MTD_NAND_ECC_SW_BCH) += nand_bch.o obj-$(CONFIG_MTD_SM_COMMON) += sm_common.o diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 95fac92d6b82..fd7ce5b929c0 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5081,7 +5081,7 @@ static int nand_set_ecc_soft_ops(struct nand_chip *chip) ecc->bytes = 3; ecc->strength = 1; - if (IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)) + if (IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)) ecc->options |= NAND_ECC_SOFT_HAMMING_SM_ORDER; return 0; diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index 89227b1d036a..e0955a98a0f4 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -222,17 +222,17 @@ static int sm_correct_sector(uint8_t *buffer, struct sm_oob *oob) uint8_t ecc[3]; __nand_calculate_ecc(buffer, SM_SMALL_PAGE, ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); if (__nand_correct_data(buffer, ecc, oob->ecc1, SM_SMALL_PAGE, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)) < 0) + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)) < 0) return -EIO; buffer += SM_SMALL_PAGE; __nand_calculate_ecc(buffer, SM_SMALL_PAGE, ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); if (__nand_correct_data(buffer, ecc, oob->ecc2, SM_SMALL_PAGE, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)) < 0) + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)) < 0) return -EIO; return 0; } @@ -399,11 +399,11 @@ restart: if (ftl->smallpagenand) { __nand_calculate_ecc(buf + boffset, SM_SMALL_PAGE, oob.ecc1, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); __nand_calculate_ecc(buf + boffset + SM_SMALL_PAGE, SM_SMALL_PAGE, oob.ecc2, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); } if (!sm_write_sector(ftl, zone, block, boffset, buf + boffset, &oob)) diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c index c71523e94580..0c0091fb3708 100644 --- a/drivers/mtd/tests/mtd_nandecctest.c +++ b/drivers/mtd/tests/mtd_nandecctest.c @@ -122,9 +122,9 @@ static int no_bit_error_verify(void *error_data, void *error_ecc, int ret; __nand_calculate_ecc(error_data, size, calc_ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); if (ret == 0 && !memcmp(correct_data, error_data, size)) return 0; @@ -152,9 +152,9 @@ static int single_bit_error_correct(void *error_data, void *error_ecc, int ret; __nand_calculate_ecc(error_data, size, calc_ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); ret = __nand_correct_data(error_data, error_ecc, calc_ecc, 
size, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); if (ret == 1 && !memcmp(correct_data, error_data, size)) return 0; @@ -189,9 +189,9 @@ static int double_bit_error_detect(void *error_data, void *error_ecc, int ret; __nand_calculate_ecc(error_data, size, calc_ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); return (ret == -EBADMSG) ? 0 : -EINVAL; } @@ -266,7 +266,7 @@ static int nand_ecc_test_run(const size_t size) prandom_bytes(correct_data, size); __nand_calculate_ecc(correct_data, size, correct_ecc, - IS_ENABLED(CONFIG_MTD_NAND_ECC_SMC)); + IS_ENABLED(CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC)); for (i = 0; i < ARRAY_SIZE(nand_ecc_test); i++) { nand_ecc_test[i].prepare(error_data, error_ecc, -- cgit v1.2.3-59-g8ed1b From 72c5af00272339af6bbed6fe7275cd731f57be2d Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 6 Feb 2019 16:47:44 +0100 Subject: mtd: rawnand: Clarify Kconfig entry MTD_NAND MTD_NAND is large and encloses much more than what the symbol is actually used for: raw NAND. Clarify the symbol by naming it MTD_RAW_NAND instead. Signed-off-by: Miquel Raynal --- arch/arm/configs/at91_dt_defconfig | 2 +- arch/arm/configs/clps711x_defconfig | 2 +- arch/arm/configs/cm_x2xx_defconfig | 2 +- arch/arm/configs/cm_x300_defconfig | 2 +- arch/arm/configs/colibri_pxa270_defconfig | 2 +- arch/arm/configs/corgi_defconfig | 2 +- arch/arm/configs/davinci_all_defconfig | 2 +- arch/arm/configs/em_x270_defconfig | 2 +- arch/arm/configs/ep93xx_defconfig | 2 +- arch/arm/configs/eseries_pxa_defconfig | 2 +- arch/arm/configs/imx_v4_v5_defconfig | 2 +- arch/arm/configs/imx_v6_v7_defconfig | 2 +- arch/arm/configs/ixp4xx_defconfig | 2 +- arch/arm/configs/keystone_defconfig | 2 +- arch/arm/configs/lpc32xx_defconfig | 2 +- arch/arm/configs/mini2440_defconfig | 2 +- arch/arm/configs/mmp2_defconfig | 2 +- arch/arm/configs/multi_v4t_defconfig | 2 +- arch/arm/configs/multi_v5_defconfig | 2 +- arch/arm/configs/multi_v7_defconfig | 2 +- arch/arm/configs/mv78xx0_defconfig | 2 +- arch/arm/configs/mvebu_v5_defconfig | 2 +- arch/arm/configs/mvebu_v7_defconfig | 2 +- arch/arm/configs/mxs_defconfig | 2 +- arch/arm/configs/nhk8815_defconfig | 2 +- arch/arm/configs/omap1_defconfig | 2 +- arch/arm/configs/omap2plus_defconfig | 2 +- arch/arm/configs/orion5x_defconfig | 2 +- arch/arm/configs/oxnas_v6_defconfig | 2 +- arch/arm/configs/pxa3xx_defconfig | 2 +- arch/arm/configs/pxa_defconfig | 2 +- arch/arm/configs/qcom_defconfig | 2 +- arch/arm/configs/s3c2410_defconfig | 2 +- arch/arm/configs/s3c6400_defconfig | 2 +- arch/arm/configs/sama5_defconfig | 2 +- arch/arm/configs/socfpga_defconfig | 2 +- arch/arm/configs/spear13xx_defconfig | 2 +- arch/arm/configs/spear3xx_defconfig | 2 +- arch/arm/configs/spear6xx_defconfig | 2 +- arch/arm/configs/spitz_defconfig | 2 +- arch/arm/configs/tango4_defconfig | 2 +- arch/arm/configs/trizeps4_defconfig | 2 +- arch/arm/configs/u300_defconfig | 2 +- arch/arm64/configs/defconfig | 2 +- arch/mips/configs/bcm47xx_defconfig | 2 +- arch/mips/configs/ci20_defconfig | 2 +- arch/mips/configs/db1xxx_defconfig | 2 +- arch/mips/configs/generic/board-ni169445.config | 2 +- arch/mips/configs/generic/board-ocelot.config | 2 +- arch/mips/configs/loongson1b_defconfig | 2 +- arch/mips/configs/loongson1c_defconfig | 2 +- arch/mips/configs/qi_lb60_defconfig | 2 +- 
arch/mips/configs/rb532_defconfig | 2 +- arch/mips/configs/rbtx49xx_defconfig | 2 +- arch/mips/configs/xway_defconfig | 2 +- arch/powerpc/configs/40x/kilauea_defconfig | 2 +- arch/powerpc/configs/40x/obs600_defconfig | 2 +- arch/powerpc/configs/44x/canyonlands_defconfig | 2 +- arch/powerpc/configs/44x/eiger_defconfig | 2 +- arch/powerpc/configs/44x/sequoia_defconfig | 2 +- arch/powerpc/configs/44x/warp_defconfig | 2 +- arch/powerpc/configs/83xx/mpc8313_rdb_defconfig | 2 +- arch/powerpc/configs/83xx/mpc8315_rdb_defconfig | 2 +- arch/powerpc/configs/85xx-hw.config | 2 +- arch/powerpc/configs/85xx/ge_imp3a_defconfig | 2 +- arch/powerpc/configs/85xx/socrates_defconfig | 2 +- arch/powerpc/configs/85xx/tqm8548_defconfig | 2 +- arch/powerpc/configs/85xx/xes_mpc85xx_defconfig | 2 +- arch/powerpc/configs/86xx-hw.config | 2 +- arch/powerpc/configs/mpc512x_defconfig | 2 +- arch/powerpc/configs/mpc83xx_defconfig | 2 +- arch/powerpc/configs/pasemi_defconfig | 2 +- arch/powerpc/configs/ppc44x_defconfig | 2 +- arch/sh/configs/ap325rxa_defconfig | 2 +- arch/sh/configs/ecovec24_defconfig | 2 +- arch/sh/configs/migor_defconfig | 2 +- arch/sh/configs/sdk7786_defconfig | 2 +- arch/sh/configs/se7724_defconfig | 2 +- arch/sh/configs/titan_defconfig | 2 +- drivers/mtd/nand/raw/Kconfig | 6 +++--- drivers/mtd/nand/raw/Makefile | 2 +- drivers/mtd/tests/mtd_nandecctest.c | 2 +- 82 files changed, 84 insertions(+), 84 deletions(-) (limited to 'arch') diff --git a/arch/arm/configs/at91_dt_defconfig b/arch/arm/configs/at91_dt_defconfig index e4b1be66b3f5..50ef3eb0ab64 100644 --- a/arch/arm/configs/at91_dt_defconfig +++ b/arch/arm/configs/at91_dt_defconfig @@ -56,7 +56,7 @@ CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_DATAFLASH=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_ATMEL=y CONFIG_MTD_UBI=y CONFIG_MTD_UBI_GLUEBI=y diff --git a/arch/arm/configs/clps711x_defconfig b/arch/arm/configs/clps711x_defconfig index fc105c9178cc..7968d20673b9 100644 --- a/arch/arm/configs/clps711x_defconfig +++ b/arch/arm/configs/clps711x_defconfig @@ -36,7 +36,7 @@ CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PLATRAM=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_GPIO=y CONFIG_NETDEVICES=y # CONFIG_NET_CADENCE is not set diff --git a/arch/arm/configs/cm_x2xx_defconfig b/arch/arm/configs/cm_x2xx_defconfig index fb45b4983d3c..5344434df652 100644 --- a/arch/arm/configs/cm_x2xx_defconfig +++ b/arch/arm/configs/cm_x2xx_defconfig @@ -58,7 +58,7 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PXA2XX=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_GPIO=m CONFIG_MTD_NAND_CM_X270=y CONFIG_MTD_NAND_PLATFORM=y diff --git a/arch/arm/configs/cm_x300_defconfig b/arch/arm/configs/cm_x300_defconfig index 5e349c625b71..3707a014cbc4 100644 --- a/arch/arm/configs/cm_x300_defconfig +++ b/arch/arm/configs/cm_x300_defconfig @@ -48,7 +48,7 @@ CONFIG_LIB80211=m CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_MARVELL=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/arm/configs/colibri_pxa270_defconfig b/arch/arm/configs/colibri_pxa270_defconfig index 8995695fc118..8d484e4d51cc 100644 --- a/arch/arm/configs/colibri_pxa270_defconfig +++ b/arch/arm/configs/colibri_pxa270_defconfig @@ -64,7 +64,7 @@ CONFIG_MTD_COMPLEX_MAPPINGS=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PXA2XX=y CONFIG_MTD_BLOCK2MTD=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y 
CONFIG_MTD_NAND_DISKONCHIP=y CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED=y CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0x4000000 diff --git a/arch/arm/configs/corgi_defconfig b/arch/arm/configs/corgi_defconfig index 09e1672777c9..d99725984947 100644 --- a/arch/arm/configs/corgi_defconfig +++ b/arch/arm/configs/corgi_defconfig @@ -87,7 +87,7 @@ CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_ROM=y CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_SHARPSL=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_SD=y diff --git a/arch/arm/configs/davinci_all_defconfig b/arch/arm/configs/davinci_all_defconfig index 207962a656a2..4a8cad4d3707 100644 --- a/arch/arm/configs/davinci_all_defconfig +++ b/arch/arm/configs/davinci_all_defconfig @@ -74,7 +74,7 @@ CONFIG_MTD_CFI_INTELEXT=m CONFIG_MTD_CFI_AMDSTD=m CONFIG_MTD_PHYSMAP=m CONFIG_MTD_M25P80=m -CONFIG_MTD_NAND=m +CONFIG_MTD_RAW_NAND=m CONFIG_MTD_NAND_DAVINCI=m CONFIG_MTD_SPI_NOR=m CONFIG_MTD_UBI=m diff --git a/arch/arm/configs/em_x270_defconfig b/arch/arm/configs/em_x270_defconfig index 30a67523f860..61228a25ba8d 100644 --- a/arch/arm/configs/em_x270_defconfig +++ b/arch/arm/configs/em_x270_defconfig @@ -54,7 +54,7 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PXA2XX=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_PLATFORM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/arm/configs/ep93xx_defconfig b/arch/arm/configs/ep93xx_defconfig index 78cd73d1c795..14889a785f07 100644 --- a/arch/arm/configs/ep93xx_defconfig +++ b/arch/arm/configs/ep93xx_defconfig @@ -63,7 +63,7 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_ROM=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_BLK_DEV_NBD=y CONFIG_EEPROM_LEGACY=y CONFIG_SCSI=y diff --git a/arch/arm/configs/eseries_pxa_defconfig b/arch/arm/configs/eseries_pxa_defconfig index eabb784cf7da..b85575867d21 100644 --- a/arch/arm/configs/eseries_pxa_defconfig +++ b/arch/arm/configs/eseries_pxa_defconfig @@ -43,7 +43,7 @@ CONFIG_MAC80211_RC_PID=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_STANDALONE is not set CONFIG_MTD=m -CONFIG_MTD_NAND=m +CONFIG_MTD_RAW_NAND=m CONFIG_MTD_NAND_TMIO=m CONFIG_BLK_DEV_LOOP=m # CONFIG_SCSI_PROC_FS is not set diff --git a/arch/arm/configs/imx_v4_v5_defconfig b/arch/arm/configs/imx_v4_v5_defconfig index 8661dd9b064a..ffdcfc3edc25 100644 --- a/arch/arm/configs/imx_v4_v5_defconfig +++ b/arch/arm/configs/imx_v4_v5_defconfig @@ -61,7 +61,7 @@ CONFIG_MTD_CFI_GEOMETRY=y # CONFIG_MTD_CFI_I2 is not set CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_MXC=y CONFIG_MTD_UBI=y CONFIG_EEPROM_AT24=y diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 5586a5074a96..07b796c2f0ee 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -110,7 +110,7 @@ CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_DATAFLASH=y CONFIG_MTD_M25P80=y CONFIG_MTD_SST25L=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_GPMI_NAND=y CONFIG_MTD_NAND_VF610_NFC=y CONFIG_MTD_NAND_MXC=y diff --git a/arch/arm/configs/ixp4xx_defconfig b/arch/arm/configs/ixp4xx_defconfig index 8c3c99cd6de9..39ebcce3bc2f 100644 --- a/arch/arm/configs/ixp4xx_defconfig +++ b/arch/arm/configs/ixp4xx_defconfig @@ -112,7 +112,7 @@ CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_COMPLEX_MAPPINGS=y CONFIG_MTD_IXP4XX=y -CONFIG_MTD_NAND=m +CONFIG_MTD_RAW_NAND=m CONFIG_BLK_DEV_LOOP=y 
CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig index 3ded35a07f45..72fee57aad2f 100644 --- a/arch/arm/configs/keystone_defconfig +++ b/arch/arm/configs/keystone_defconfig @@ -124,7 +124,7 @@ CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_PLATRAM=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_DAVINCI=y CONFIG_MTD_SPI_NOR=y CONFIG_MTD_UBI=y diff --git a/arch/arm/configs/lpc32xx_defconfig b/arch/arm/configs/lpc32xx_defconfig index e752fb704df0..4b3b2c693c29 100644 --- a/arch/arm/configs/lpc32xx_defconfig +++ b/arch/arm/configs/lpc32xx_defconfig @@ -47,7 +47,7 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_SLC_LPC32XX=y CONFIG_MTD_NAND_MLC_LPC32XX=y CONFIG_MTD_UBI=y diff --git a/arch/arm/configs/mini2440_defconfig b/arch/arm/configs/mini2440_defconfig index d95a8059d30b..8b0f7c4c3f09 100644 --- a/arch/arm/configs/mini2440_defconfig +++ b/arch/arm/configs/mini2440_defconfig @@ -92,7 +92,7 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_RAM=y CONFIG_MTD_ROM=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_S3C2410=y CONFIG_MTD_NAND_PLATFORM=y CONFIG_MTD_LPDDR=y diff --git a/arch/arm/configs/mmp2_defconfig b/arch/arm/configs/mmp2_defconfig index 1eeee7f11d91..94deb0ed0541 100644 --- a/arch/arm/configs/mmp2_defconfig +++ b/arch/arm/configs/mmp2_defconfig @@ -28,7 +28,7 @@ CONFIG_IP_PNP=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_ONENAND=y CONFIG_MTD_ONENAND_GENERIC=y # CONFIG_BLK_DEV is not set diff --git a/arch/arm/configs/multi_v4t_defconfig b/arch/arm/configs/multi_v4t_defconfig index 9a6390c172d6..14f3a4a65d01 100644 --- a/arch/arm/configs/multi_v4t_defconfig +++ b/arch/arm/configs/multi_v4t_defconfig @@ -39,7 +39,7 @@ CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PLATRAM=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_GPIO=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set diff --git a/arch/arm/configs/multi_v5_defconfig b/arch/arm/configs/multi_v5_defconfig index 318b76fa26d1..63b5a8824f0f 100644 --- a/arch/arm/configs/multi_v5_defconfig +++ b/arch/arm/configs/multi_v5_defconfig @@ -87,7 +87,7 @@ CONFIG_MTD_CFI_GEOMETRY=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_ATMEL=y CONFIG_MTD_NAND_ORION=y CONFIG_MTD_SPI_NOR=y diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index c75051b9392c..b7b1cd00a294 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -184,7 +184,7 @@ CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_DENALI_DT=y CONFIG_MTD_NAND_OMAP2=y CONFIG_MTD_NAND_OMAP_BCH=y diff --git a/arch/arm/configs/mv78xx0_defconfig b/arch/arm/configs/mv78xx0_defconfig index 0448bd8075ac..e9567513f068 100644 --- a/arch/arm/configs/mv78xx0_defconfig +++ b/arch/arm/configs/mv78xx0_defconfig @@ -47,7 +47,7 @@ CONFIG_MTD_CFI_GEOMETRY=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_ORION=y CONFIG_BLK_DEV_LOOP=y # CONFIG_SCSI_PROC_FS is not set diff --git a/arch/arm/configs/mvebu_v5_defconfig 
b/arch/arm/configs/mvebu_v5_defconfig index 4b598da0d086..0e5577a31851 100644 --- a/arch/arm/configs/mvebu_v5_defconfig +++ b/arch/arm/configs/mvebu_v5_defconfig @@ -77,7 +77,7 @@ CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_ORION=y CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/arm/configs/mvebu_v7_defconfig b/arch/arm/configs/mvebu_v7_defconfig index 55140219ab11..48f7b4277b8d 100644 --- a/arch/arm/configs/mvebu_v7_defconfig +++ b/arch/arm/configs/mvebu_v7_defconfig @@ -52,7 +52,7 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_MARVELL=y CONFIG_MTD_SPI_NOR=y CONFIG_MTD_UBI=y diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig index 38480596c449..ed570a0d1f2a 100644 --- a/arch/arm/configs/mxs_defconfig +++ b/arch/arm/configs/mxs_defconfig @@ -50,7 +50,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_DATAFLASH=y CONFIG_MTD_M25P80=y CONFIG_MTD_SST25L=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_GPMI_NAND=y CONFIG_MTD_SPI_NOR=y CONFIG_MTD_UBI=y diff --git a/arch/arm/configs/nhk8815_defconfig b/arch/arm/configs/nhk8815_defconfig index e0373989b32d..cfc094189d09 100644 --- a/arch/arm/configs/nhk8815_defconfig +++ b/arch/arm/configs/nhk8815_defconfig @@ -54,7 +54,7 @@ CONFIG_MTD_ONENAND=y CONFIG_MTD_ONENAND_VERIFY_WRITE=y CONFIG_MTD_ONENAND_GENERIC=y CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSMC=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_CRYPTOLOOP=y diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig index cfc00b0961ec..40fdf890a292 100644 --- a/arch/arm/configs/omap1_defconfig +++ b/arch/arm/configs/omap1_defconfig @@ -90,7 +90,7 @@ CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=2 diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 7dc2ac9175de..c7bf9c493646 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -143,7 +143,7 @@ CONFIG_MTD_M25P80=m CONFIG_MTD_ONENAND=y CONFIG_MTD_ONENAND_VERIFY_WRITE=y CONFIG_MTD_ONENAND_OMAP2=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_ECC_SW_BCH=y CONFIG_MTD_NAND_OMAP2=y CONFIG_MTD_NAND_OMAP_BCH=y diff --git a/arch/arm/configs/orion5x_defconfig b/arch/arm/configs/orion5x_defconfig index bf9046331f6e..077e0fde1ff9 100644 --- a/arch/arm/configs/orion5x_defconfig +++ b/arch/arm/configs/orion5x_defconfig @@ -70,7 +70,7 @@ CONFIG_MTD_CFI_GEOMETRY=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_PLATFORM=y CONFIG_MTD_NAND_ORION=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/arm/configs/oxnas_v6_defconfig b/arch/arm/configs/oxnas_v6_defconfig index f6ba32c9d173..cae0db6b4eaf 100644 --- a/arch/arm/configs/oxnas_v6_defconfig +++ b/arch/arm/configs/oxnas_v6_defconfig @@ -50,7 +50,7 @@ CONFIG_SIMPLE_PM_BUS=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_OXNAS=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig index 3e0de035ab77..7681eea60127 100644 --- 
a/arch/arm/configs/pxa3xx_defconfig +++ b/arch/arm/configs/pxa3xx_defconfig @@ -31,7 +31,7 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_PREVENT_FIRMWARE_BUILD is not set CONFIG_MTD=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_MARVELL=y CONFIG_MTD_ONENAND=y CONFIG_MTD_ONENAND_VERIFY_WRITE=y diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index d91adb49ae11..f6d24d762a7f 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -185,7 +185,7 @@ CONFIG_MTD_PXA2XX=m CONFIG_MTD_M25P80=m CONFIG_MTD_BLOCK2MTD=y CONFIG_MTD_DOCG3=m -CONFIG_MTD_NAND=m +CONFIG_MTD_RAW_NAND=m CONFIG_MTD_NAND_ECC_SW_BCH=y CONFIG_MTD_NAND_GPIO=m CONFIG_MTD_NAND_DISKONCHIP=m diff --git a/arch/arm/configs/qcom_defconfig b/arch/arm/configs/qcom_defconfig index bd6440f23493..4c50b5337cf6 100644 --- a/arch/arm/configs/qcom_defconfig +++ b/arch/arm/configs/qcom_defconfig @@ -57,7 +57,7 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_MTD=y CONFIG_MTD_BLOCK=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_QCOM=y CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/arm/configs/s3c2410_defconfig b/arch/arm/configs/s3c2410_defconfig index 2afb359f3168..39c648594d93 100644 --- a/arch/arm/configs/s3c2410_defconfig +++ b/arch/arm/configs/s3c2410_defconfig @@ -192,7 +192,7 @@ CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_ROM=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_S3C2410=y CONFIG_PARPORT=y CONFIG_PARPORT_PC=m diff --git a/arch/arm/configs/s3c6400_defconfig b/arch/arm/configs/s3c6400_defconfig index 507d7ad7523a..6e2656567da6 100644 --- a/arch/arm/configs/s3c6400_defconfig +++ b/arch/arm/configs/s3c6400_defconfig @@ -23,7 +23,7 @@ CONFIG_CMDLINE="console=ttySAC0,115200 root=/dev/ram init=/linuxrc initrd=0x5100 CONFIG_VFP=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_S3C2410=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/arm/configs/sama5_defconfig b/arch/arm/configs/sama5_defconfig index b0026f73083d..515cb37eeab6 100644 --- a/arch/arm/configs/sama5_defconfig +++ b/arch/arm/configs/sama5_defconfig @@ -66,7 +66,7 @@ CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_ATMEL=y CONFIG_MTD_SPI_NOR=y CONFIG_MTD_UBI=y diff --git a/arch/arm/configs/socfpga_defconfig b/arch/arm/configs/socfpga_defconfig index 08d1b3e11d68..3b42e0d597bd 100644 --- a/arch/arm/configs/socfpga_defconfig +++ b/arch/arm/configs/socfpga_defconfig @@ -51,7 +51,7 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_MTD=y CONFIG_MTD_BLOCK=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_DENALI_DT=y CONFIG_MTD_SPI_NOR=y # CONFIG_MTD_SPI_NOR_USE_4K_SECTORS is not set diff --git a/arch/arm/configs/spear13xx_defconfig b/arch/arm/configs/spear13xx_defconfig index 7b36eeb928bb..8ee3679ca8b2 100644 --- a/arch/arm/configs/spear13xx_defconfig +++ b/arch/arm/configs/spear13xx_defconfig @@ -32,7 +32,7 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_OF_PARTS=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSMC=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=16384 diff --git a/arch/arm/configs/spear3xx_defconfig b/arch/arm/configs/spear3xx_defconfig index f1b52fb3461b..ddd73b25f75e 100644 --- a/arch/arm/configs/spear3xx_defconfig +++ b/arch/arm/configs/spear3xx_defconfig 
@@ -17,7 +17,7 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_OF_PARTS=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSMC=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=16384 diff --git a/arch/arm/configs/spear6xx_defconfig b/arch/arm/configs/spear6xx_defconfig index 124c244d8df1..5b410f0a365b 100644 --- a/arch/arm/configs/spear6xx_defconfig +++ b/arch/arm/configs/spear6xx_defconfig @@ -14,7 +14,7 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_OF_PARTS=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSMC=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=16384 diff --git a/arch/arm/configs/spitz_defconfig b/arch/arm/configs/spitz_defconfig index 9ea82c118661..f6d2f674517c 100644 --- a/arch/arm/configs/spitz_defconfig +++ b/arch/arm/configs/spitz_defconfig @@ -84,7 +84,7 @@ CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_ROM=y CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_SHARPSL=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_SD=y diff --git a/arch/arm/configs/tango4_defconfig b/arch/arm/configs/tango4_defconfig index 68725d4eae45..68eb16e583ac 100644 --- a/arch/arm/configs/tango4_defconfig +++ b/arch/arm/configs/tango4_defconfig @@ -39,7 +39,7 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_MTD=y CONFIG_MTD_TESTS=m CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_TANGO=y CONFIG_BLK_DEV_LOOP=y CONFIG_SCSI=y diff --git a/arch/arm/configs/trizeps4_defconfig b/arch/arm/configs/trizeps4_defconfig index 2b5a224d2da1..ecad22501b48 100644 --- a/arch/arm/configs/trizeps4_defconfig +++ b/arch/arm/configs/trizeps4_defconfig @@ -76,7 +76,7 @@ CONFIG_MTD_DOC2001PLUS=y CONFIG_MTD_DOCPROBE_ADVANCED=y CONFIG_MTD_DOCPROBE_ADDRESS=0x4000000 CONFIG_MTD_DOCPROBE_HIGH=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_DISKONCHIP=y CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED=y CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0x4000000 diff --git a/arch/arm/configs/u300_defconfig b/arch/arm/configs/u300_defconfig index 36d77406e31b..cfd3622e2c8a 100644 --- a/arch/arm/configs/u300_defconfig +++ b/arch/arm/configs/u300_defconfig @@ -27,7 +27,7 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_PREVENT_FIRMWARE_BUILD is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSMC=y # CONFIG_INPUT_MOUSEDEV is not set CONFIG_INPUT_EVDEV=y diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 2d9c39033c1a..9a8718bbf6ba 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -206,7 +206,7 @@ CONFIG_SIMPLE_PM_BUS=y CONFIG_MTD=y CONFIG_MTD_BLOCK=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_DENALI_DT=y CONFIG_MTD_NAND_MARVELL=y CONFIG_MTD_NAND_QCOM=y diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index 249f5285e343..91ce75edbfb4 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -41,7 +41,7 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_COMPLEX_MAPPINGS=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_BCM47XXSFLASH=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_BCM47XXNFLASH=y CONFIG_NETDEVICES=y CONFIG_B44=y diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig index 412800d5d7e0..50bebce28500 100644 --- a/arch/mips/configs/ci20_defconfig +++ b/arch/mips/configs/ci20_defconfig @@ -51,7 +51,7 @@ CONFIG_DEVTMPFS=y 
CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=32 CONFIG_MTD=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_JZ4780=y CONFIG_MTD_UBI=y CONFIG_MTD_UBI_FASTMAP=y diff --git a/arch/mips/configs/db1xxx_defconfig b/arch/mips/configs/db1xxx_defconfig index c5a2c6f234d4..bc9b6ae046b2 100644 --- a/arch/mips/configs/db1xxx_defconfig +++ b/arch/mips/configs/db1xxx_defconfig @@ -95,7 +95,7 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_M25P80=y CONFIG_MTD_SST25L=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_ECC_SW_BCH=y CONFIG_MTD_NAND_AU1550=y CONFIG_MTD_NAND_PLATFORM=y diff --git a/arch/mips/configs/generic/board-ni169445.config b/arch/mips/configs/generic/board-ni169445.config index e68da2e9dc7c..1ed0d3e8715e 100644 --- a/arch/mips/configs/generic/board-ni169445.config +++ b/arch/mips/configs/generic/board-ni169445.config @@ -17,7 +17,7 @@ CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_NAND_ECC_SW_HAMMING=y CONFIG_MTD_NAND_ECC_SW_BCH=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_GPIO=y CONFIG_MTD_NAND_IDS=y diff --git a/arch/mips/configs/generic/board-ocelot.config b/arch/mips/configs/generic/board-ocelot.config index f607888d2483..2621630f9850 100644 --- a/arch/mips/configs/generic/board-ocelot.config +++ b/arch/mips/configs/generic/board-ocelot.config @@ -6,7 +6,7 @@ CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_M25P80=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_PLATFORM=y CONFIG_MTD_SPI_NOR=y CONFIG_MTD_UBI=y diff --git a/arch/mips/configs/loongson1b_defconfig b/arch/mips/configs/loongson1b_defconfig index b064d68a5424..1628e6fcc405 100644 --- a/arch/mips/configs/loongson1b_defconfig +++ b/arch/mips/configs/loongson1b_defconfig @@ -42,7 +42,7 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=y CONFIG_SCSI=m diff --git a/arch/mips/configs/loongson1c_defconfig b/arch/mips/configs/loongson1c_defconfig index 5d76559b56cd..7aa5527e50b1 100644 --- a/arch/mips/configs/loongson1c_defconfig +++ b/arch/mips/configs/loongson1c_defconfig @@ -43,7 +43,7 @@ CONFIG_DEVTMPFS_MOUNT=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=y CONFIG_SCSI=m diff --git a/arch/mips/configs/qi_lb60_defconfig b/arch/mips/configs/qi_lb60_defconfig index 7671fe6a8042..1a0677d04982 100644 --- a/arch/mips/configs/qi_lb60_defconfig +++ b/arch/mips/configs/qi_lb60_defconfig @@ -44,7 +44,7 @@ CONFIG_TCP_CONG_WESTWOOD=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_BLOCK=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_JZ4740=y CONFIG_MTD_UBI=y CONFIG_NETDEVICES=y diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig index 7befe05fd813..4cbcf015a7ed 100644 --- a/arch/mips/configs/rb532_defconfig +++ b/arch/mips/configs/rb532_defconfig @@ -110,7 +110,7 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_BLOCK=y CONFIG_MTD_BLOCK2MTD=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_PLATFORM=y CONFIG_ATA=y # CONFIG_ATA_VERBOSE_ERROR is not set diff --git a/arch/mips/configs/rbtx49xx_defconfig b/arch/mips/configs/rbtx49xx_defconfig index 50a2c9ad583f..96114ca59016 100644 --- a/arch/mips/configs/rbtx49xx_defconfig +++ b/arch/mips/configs/rbtx49xx_defconfig @@ -40,7 +40,7 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_COMPLEX_MAPPINGS=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_RBTX4939=y 
-CONFIG_MTD_NAND=m +CONFIG_MTD_RAW_NAND=m CONFIG_MTD_NAND_TXX9NDFMC=m CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/mips/configs/xway_defconfig b/arch/mips/configs/xway_defconfig index 2bb02ea9fb4e..203db83c3ee9 100644 --- a/arch/mips/configs/xway_defconfig +++ b/arch/mips/configs/xway_defconfig @@ -81,7 +81,7 @@ CONFIG_MTD_COMPLEX_MAPPINGS=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_LANTIQ=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_XWAY=y CONFIG_EEPROM_93CX6=m CONFIG_SCSI=y diff --git a/arch/powerpc/configs/40x/kilauea_defconfig b/arch/powerpc/configs/40x/kilauea_defconfig index b5cc7426c21f..3da091f651d6 100644 --- a/arch/powerpc/configs/40x/kilauea_defconfig +++ b/arch/powerpc/configs/40x/kilauea_defconfig @@ -33,7 +33,7 @@ CONFIG_MTD_CFI=y CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_NDFC=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=35000 diff --git a/arch/powerpc/configs/40x/obs600_defconfig b/arch/powerpc/configs/40x/obs600_defconfig index aac06d2ad01a..38d3d7769a2f 100644 --- a/arch/powerpc/configs/40x/obs600_defconfig +++ b/arch/powerpc/configs/40x/obs600_defconfig @@ -33,7 +33,7 @@ CONFIG_MTD_CFI=y CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_NDFC=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=35000 diff --git a/arch/powerpc/configs/44x/canyonlands_defconfig b/arch/powerpc/configs/44x/canyonlands_defconfig index c8e6f048a122..d427cee027a6 100644 --- a/arch/powerpc/configs/44x/canyonlands_defconfig +++ b/arch/powerpc/configs/44x/canyonlands_defconfig @@ -32,7 +32,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_NDFC=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=35000 diff --git a/arch/powerpc/configs/44x/eiger_defconfig b/arch/powerpc/configs/44x/eiger_defconfig index f6dc23fef683..f593258806ad 100644 --- a/arch/powerpc/configs/44x/eiger_defconfig +++ b/arch/powerpc/configs/44x/eiger_defconfig @@ -33,7 +33,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_NDFC=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=35000 diff --git a/arch/powerpc/configs/44x/sequoia_defconfig b/arch/powerpc/configs/44x/sequoia_defconfig index 1e04122912f3..f34fee9464e5 100644 --- a/arch/powerpc/configs/44x/sequoia_defconfig +++ b/arch/powerpc/configs/44x/sequoia_defconfig @@ -33,7 +33,7 @@ CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_NDFC=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=35000 diff --git a/arch/powerpc/configs/44x/warp_defconfig b/arch/powerpc/configs/44x/warp_defconfig index 6c02f53271cd..6ae88d4879bf 100644 --- a/arch/powerpc/configs/44x/warp_defconfig +++ b/arch/powerpc/configs/44x/warp_defconfig @@ -34,7 +34,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_NDFC=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig b/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig index 1f69f4edf074..9dffb2e7f735 100644 --- a/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig +++ b/arch/powerpc/configs/83xx/mpc8313_rdb_defconfig @@ -31,7 +31,7 @@ 
CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSL_ELBC=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/powerpc/configs/83xx/mpc8315_rdb_defconfig b/arch/powerpc/configs/83xx/mpc8315_rdb_defconfig index 797fc3ffddee..a42232732c6d 100644 --- a/arch/powerpc/configs/83xx/mpc8315_rdb_defconfig +++ b/arch/powerpc/configs/83xx/mpc8315_rdb_defconfig @@ -31,7 +31,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=32768 diff --git a/arch/powerpc/configs/85xx-hw.config b/arch/powerpc/configs/85xx-hw.config index c03d0fb16665..9575a38c9155 100644 --- a/arch/powerpc/configs/85xx-hw.config +++ b/arch/powerpc/configs/85xx-hw.config @@ -71,7 +71,7 @@ CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_M25P80=y CONFIG_MTD_NAND_FSL_ELBC=y CONFIG_MTD_NAND_FSL_IFC=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PLATRAM=y diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig b/arch/powerpc/configs/85xx/ge_imp3a_defconfig index dd98f43b2fb8..d70b60314dad 100644 --- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig +++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig @@ -73,7 +73,7 @@ CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSL_ELBC=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m diff --git a/arch/powerpc/configs/85xx/socrates_defconfig b/arch/powerpc/configs/85xx/socrates_defconfig index 6106fadbbd8b..7037a6d8018c 100644 --- a/arch/powerpc/configs/85xx/socrates_defconfig +++ b/arch/powerpc/configs/85xx/socrates_defconfig @@ -31,7 +31,7 @@ CONFIG_MTD_CFI=y CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_SOCRATES=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/powerpc/configs/85xx/tqm8548_defconfig b/arch/powerpc/configs/85xx/tqm8548_defconfig index 464ce192bc14..1c63cbdc3211 100644 --- a/arch/powerpc/configs/85xx/tqm8548_defconfig +++ b/arch/powerpc/configs/85xx/tqm8548_defconfig @@ -36,7 +36,7 @@ CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSL_UPM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig index 6531139a8a8d..78f5beb2928c 100644 --- a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig +++ b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig @@ -65,7 +65,7 @@ CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSL_ELBC=y CONFIG_MTD_NAND_FSL_UPM=y CONFIG_BLK_DEV_LOOP=y diff --git a/arch/powerpc/configs/86xx-hw.config b/arch/powerpc/configs/86xx-hw.config index d3dd6b8865c0..151164cf8cb3 100644 --- a/arch/powerpc/configs/86xx-hw.config +++ b/arch/powerpc/configs/86xx-hw.config @@ -47,7 +47,7 @@ CONFIG_MTD_CFI=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_NAND_FSL_ELBC=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_PHYSMAP_OF=y CONFIG_NETDEVICES=y CONFIG_NET_TULIP=y diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig index 
e4bfb1101c0e..e4bf8aa87e60 100644 --- a/arch/powerpc/configs/mpc512x_defconfig +++ b/arch/powerpc/configs/mpc512x_defconfig @@ -46,7 +46,7 @@ CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_ROM=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_MPC5121_NFC=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/powerpc/configs/mpc83xx_defconfig b/arch/powerpc/configs/mpc83xx_defconfig index d1b82035d35f..005d00020fb9 100644 --- a/arch/powerpc/configs/mpc83xx_defconfig +++ b/arch/powerpc/configs/mpc83xx_defconfig @@ -46,7 +46,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_FSL_ELBC=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig index 6daa56f8895c..c0423b2cf7c0 100644 --- a/arch/powerpc/configs/pasemi_defconfig +++ b/arch/powerpc/configs/pasemi_defconfig @@ -51,7 +51,7 @@ CONFIG_MTD=y CONFIG_MTD_BLOCK=y CONFIG_MTD_SLRAM=y CONFIG_MTD_PHRAM=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_PASEMI=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/powerpc/configs/ppc44x_defconfig b/arch/powerpc/configs/ppc44x_defconfig index 66dd6bf45cde..db48039e0b11 100644 --- a/arch/powerpc/configs/ppc44x_defconfig +++ b/arch/powerpc/configs/ppc44x_defconfig @@ -44,7 +44,7 @@ CONFIG_MTD_CFI=y CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_NAND=m +CONFIG_MTD_RAW_NAND=m CONFIG_MTD_NAND_NDFC=m CONFIG_MTD_UBI=m CONFIG_MTD_UBI_GLUEBI=m diff --git a/arch/sh/configs/ap325rxa_defconfig b/arch/sh/configs/ap325rxa_defconfig index 72b72e50a92e..0ef3f1f9de5c 100644 --- a/arch/sh/configs/ap325rxa_defconfig +++ b/arch/sh/configs/ap325rxa_defconfig @@ -35,7 +35,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_SH_FLCTL=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_RAM=y diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig index 3568310c2c2f..ba67e3752938 100644 --- a/arch/sh/configs/ecovec24_defconfig +++ b/arch/sh/configs/ecovec24_defconfig @@ -38,7 +38,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=4 diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig index e04f21be0756..121a75d65fb4 100644 --- a/arch/sh/configs/migor_defconfig +++ b/arch/sh/configs/migor_defconfig @@ -34,7 +34,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_PLATFORM=y CONFIG_BLK_DEV_RAM=y CONFIG_SCSI=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index d16e9334cd98..5209889765ad 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -108,7 +108,7 @@ CONFIG_MTD_ROM=m CONFIG_MTD_ABSENT=m CONFIG_MTD_PLATRAM=y CONFIG_MTD_PHRAM=y -CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_PLATFORM=y CONFIG_MTD_NAND_SH_FLCTL=m CONFIG_MTD_UBI=y diff --git a/arch/sh/configs/se7724_defconfig b/arch/sh/configs/se7724_defconfig index aedb3a2d9a10..9f6d46d58554 100644 --- a/arch/sh/configs/se7724_defconfig +++ b/arch/sh/configs/se7724_defconfig @@ -37,7 +37,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y 
-CONFIG_MTD_NAND=y +CONFIG_MTD_RAW_NAND=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=4 diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index ceb48e9b70f4..822fa9e96f74 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -155,7 +155,7 @@ CONFIG_INFTL=m CONFIG_RFD_FTL=m CONFIG_MTD_CFI=m CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_NAND=m +CONFIG_MTD_RAW_NAND=m CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_RAM=y diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index c3a898a300e0..615d738be411 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -9,7 +9,7 @@ config MTD_NAND_ECC_SW_HAMMING_SMC Software ECC according to the Smart Media Specification. The original Linux implementation had byte 0 and 1 swapped. -menuconfig MTD_NAND +menuconfig MTD_RAW_NAND tristate "Raw/Parallel NAND Device Support" depends on MTD select MTD_NAND_CORE @@ -19,7 +19,7 @@ menuconfig MTD_NAND NAND flash devices. For further information see . -if MTD_NAND +if MTD_RAW_NAND config MTD_NAND_ECC_SW_BCH tristate "Support software BCH ECC" @@ -545,4 +545,4 @@ config MTD_NAND_DISKONCHIP_BBTWRITE load time (assuming you build diskonchip as a module) with the module parameter "inftl_bbt_write=1". -endif # MTD_NAND +endif # MTD_RAW_NAND diff --git a/drivers/mtd/nand/raw/Makefile b/drivers/mtd/nand/raw/Makefile index a022931318ac..8bc6faaa3bc7 100644 --- a/drivers/mtd/nand/raw/Makefile +++ b/drivers/mtd/nand/raw/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_MTD_NAND) += nand.o +obj-$(CONFIG_MTD_RAW_NAND) += nand.o obj-$(CONFIG_MTD_NAND_ECC_SW_HAMMING) += nand_ecc.o obj-$(CONFIG_MTD_NAND_ECC_SW_BCH) += nand_bch.o obj-$(CONFIG_MTD_SM_COMMON) += sm_common.o diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c index 0c0091fb3708..73b06304c975 100644 --- a/drivers/mtd/tests/mtd_nandecctest.c +++ b/drivers/mtd/tests/mtd_nandecctest.c @@ -21,7 +21,7 @@ * or detected. */ -#if IS_ENABLED(CONFIG_MTD_NAND) +#if IS_ENABLED(CONFIG_MTD_RAW_NAND) struct nand_ecc_test { const char *name; -- cgit v1.2.3-59-g8ed1b From 5c14068f87d04adc73ba3f41c2a303d3c3d1fa12 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Apr 2019 16:39:02 -0500 Subject: x86/speculation/mds: Add 'mitigations=' support for MDS Add MDS to the new 'mitigations=' cmdline option. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ arch/x86/kernel/cpu/bugs.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9aa3543a8723..18cad2b0392a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2556,6 +2556,7 @@ spectre_v2_user=off [X86] spec_store_bypass_disable=off [X86,PPC] l1tf=off [X86] + mds=off [X86] auto (default) Mitigate all CPU vulnerabilities, but leave SMT @@ -2570,6 +2571,7 @@ if needed. This is for users who always want to be fully mitigated, even if it means losing SMT. 
Equivalent to: l1tf=flush,nosmt [X86] + mds=full,nosmt [X86] mminit_loglevel= [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3c5c3c3ba734..667c273a66d7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -233,7 +233,7 @@ static const char * const mds_strings[] = { static void __init mds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MDS)) { + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { mds_mitigation = MDS_MITIGATION_OFF; return; } @@ -244,7 +244,8 @@ static void __init mds_select_mitigation(void) static_branch_enable(&mds_user_clear); - if (mds_nosmt && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && + (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } -- cgit v1.2.3-59-g8ed1b From 767f015ea0b7ab9d60432ff6cd06b664fd71f50f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 9 Apr 2019 23:46:31 -0700 Subject: crypto: arm/aes-neonbs - don't access already-freed walk.iv If the user-provided IV needs to be aligned to the algorithm's alignmask, then skcipher_walk_virt() copies the IV into a new aligned buffer walk.iv. But skcipher_walk_virt() can fail afterwards, and then if the caller unconditionally accesses walk.iv, it's a use-after-free. arm32 xts-aes-neonbs doesn't set an alignmask, so currently it isn't affected by this despite unconditionally accessing walk.iv. However this is more subtle than desired, and it was actually broken prior to the alignmask being removed by commit cc477bf64573 ("crypto: arm/aes - replace bit-sliced OpenSSL NEON code"). Thus, update xts-aes-neonbs to start checking the return value of skcipher_walk_virt(). Fixes: e4e7f10bfc40 ("ARM: add support for bit sliced AES using NEON instructions") Cc: # v3.13+ Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-neonbs-glue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index 07e31941dc67..617c2c99ebfb 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -278,6 +278,8 @@ static int __xts_crypt(struct skcipher_request *req, int err; err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv); -- cgit v1.2.3-59-g8ed1b From 4a8108b70508df0b6c4ffa4a3974dab93dcbe851 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 9 Apr 2019 23:46:32 -0700 Subject: crypto: arm64/aes-neonbs - don't access already-freed walk.iv If the user-provided IV needs to be aligned to the algorithm's alignmask, then skcipher_walk_virt() copies the IV into a new aligned buffer walk.iv. But skcipher_walk_virt() can fail afterwards, and then if the caller unconditionally accesses walk.iv, it's a use-after-free. xts-aes-neonbs doesn't set an alignmask, so currently it isn't affected by this despite unconditionally accessing walk.iv. However this is more subtle than desired, and unconditionally accessing walk.iv has caused a real problem in other algorithms. Thus, update xts-aes-neonbs to start checking the return value of skcipher_walk_virt(). 
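To illustrate the calling pattern both of these fixes enforce, here is a minimal sketch; use_iv() is a hypothetical stand-in for the driver's real tweak setup, not the actual driver code:

#include <linux/types.h>
#include <crypto/internal/skcipher.h>

static int use_iv(u8 *iv);	/* hypothetical stand-in for the tweak setup */

/*
 * On failure, skcipher_walk_virt() may already have freed the aligned
 * buffer that walk.iv points at, so the return value must be checked
 * before the IV is dereferenced.
 */
static int xts_crypt_sketch(struct skcipher_request *req)
{
	struct skcipher_walk walk;
	int err;

	err = skcipher_walk_virt(&walk, req, false);
	if (err)
		return err;		/* never touch walk.iv on an error path */

	return use_iv(walk.iv);		/* walk.iv is only now known to be valid */
}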
Fixes: 1abee99eafab ("crypto: arm64/aes - reimplement bit-sliced ARM/NEON implementation for arm64") Cc: # v4.11+ Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-neonbs-glue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 4737b6c6c5cf..514455117733 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -304,6 +304,8 @@ static int __xts_crypt(struct skcipher_request *req, int err; err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; kernel_neon_begin(); neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey, ctx->key.rounds, 1); -- cgit v1.2.3-59-g8ed1b From 55902d8514483bd2c94f375b5259224520a1fd6b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 11 Apr 2019 16:50:57 +0800 Subject: crypto: s390 - Forbid 2-key 3DES in FIPS mode This patch forbids the use of 2-key 3DES (K1 == K3) in FIPS mode. Signed-off-by: Herbert Xu --- arch/s390/crypto/des_s390.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index 0d15383d0ff1..1f9ab24dc048 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -224,24 +224,11 @@ static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); + int err; - if (!(crypto_memneq(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && - crypto_memneq(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], - DES_KEY_SIZE)) && - (tfm->crt_flags & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) { - tfm->crt_flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; - } - - /* in fips mode, ensure k1 != k2 and k2 != k3 and k1 != k3 */ - if (fips_enabled && - !(crypto_memneq(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && - crypto_memneq(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], - DES_KEY_SIZE) && - crypto_memneq(key, &key[DES_KEY_SIZE * 2], DES_KEY_SIZE))) { - tfm->crt_flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; - } + err = __des3_verify_key(&tfm->crt_flags, key); + if (unlikely(err)) + return err; memcpy(ctx->key, key, key_len); return 0; -- cgit v1.2.3-59-g8ed1b From eee25da50baaa32790dbe82d60754936ceb82042 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 11 Apr 2019 16:50:58 +0800 Subject: crypto: sparc - Forbid 2-key 3DES in FIPS mode This patch forbids the use of 2-key 3DES (K1 == K3) in FIPS mode. 
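For reference, a hedged sketch of the rule that __des3_verify_key() centralises in these two patches, reconstructed from the open-coded checks they delete; the helper's real body also handles the non-FIPS weak-key request flag, which is omitted here:

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fips.h>
#include <crypto/des.h>		/* DES_KEY_SIZE */
#include <crypto/algapi.h>	/* crypto_memneq() */

/* In FIPS mode all three DES keys must differ, so 2-key 3DES
 * (K1 == K3) is rejected along with K1 == K2 and K2 == K3. */
static int des3_keys_distinct_sketch(const u8 *key)
{
	if (!fips_enabled)
		return 0;

	if (crypto_memneq(key, key + DES_KEY_SIZE, DES_KEY_SIZE) &&
	    crypto_memneq(key + DES_KEY_SIZE, key + 2 * DES_KEY_SIZE,
			  DES_KEY_SIZE) &&
	    crypto_memneq(key, key + 2 * DES_KEY_SIZE, DES_KEY_SIZE))
		return 0;

	return -EINVAL;
}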
Signed-off-by: Herbert Xu --- arch/sparc/crypto/des_glue.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c index 4884315daff4..453a4cf5492a 100644 --- a/arch/sparc/crypto/des_glue.c +++ b/arch/sparc/crypto/des_glue.c @@ -201,18 +201,15 @@ static int des3_ede_set_key(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct des3_ede_sparc64_ctx *dctx = crypto_tfm_ctx(tfm); - const u32 *K = (const u32 *)key; u32 *flags = &tfm->crt_flags; u64 k1[DES_EXPKEY_WORDS / 2]; u64 k2[DES_EXPKEY_WORDS / 2]; u64 k3[DES_EXPKEY_WORDS / 2]; + int err; - if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) || - !((K[2] ^ K[4]) | (K[3] ^ K[5]))) && - (*flags & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) { - *flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; - } + err = __des3_verify_key(flags, key); + if (unlikely(err)) + return err; des_sparc64_key_expand((const u32 *)key, k1); key += DES_KEY_SIZE; -- cgit v1.2.3-59-g8ed1b From 626ddb2fbe7931a2996bd7fe88bd1ffd5daf7143 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 12 Apr 2019 22:33:12 -0700 Subject: crypto: powerpc - convert to use crypto_simd_usable() Replace all calls to in_interrupt() in the PowerPC crypto code with !crypto_simd_usable(). This causes the crypto self-tests to test the no-SIMD code paths when CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y. The p8_ghash algorithm is currently failing and needs to be fixed, as it produces the wrong digest when no-SIMD updates are mixed with SIMD ones. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/powerpc/crypto/crc32c-vpmsum_glue.c | 4 +++- arch/powerpc/crypto/crct10dif-vpmsum_glue.c | 4 +++- arch/powerpc/include/asm/Kbuild | 1 + drivers/crypto/vmx/aes.c | 7 ++++--- drivers/crypto/vmx/aes_cbc.c | 7 ++++--- drivers/crypto/vmx/aes_ctr.c | 5 +++-- drivers/crypto/vmx/aes_xts.c | 5 +++-- drivers/crypto/vmx/ghash.c | 9 ++++----- 8 files changed, 25 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c index fd1d6c83f0c0..c4fa242dd652 100644 --- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c +++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c @@ -1,10 +1,12 @@ #include #include +#include #include #include #include #include #include +#include #include #define CHKSUM_BLOCK_SIZE 1 @@ -22,7 +24,7 @@ static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len) unsigned int prealign; unsigned int tail; - if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt()) + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable()) return __crc32c_le(crc, p, len); if ((unsigned long)p & VMX_ALIGN_MASK) { diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c index 02ea277863d1..e27ff16573b5 100644 --- a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c +++ b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c @@ -12,11 +12,13 @@ #include #include +#include #include #include #include #include #include +#include #include #define VMX_ALIGN 16 @@ -32,7 +34,7 @@ static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len) unsigned int tail; u32 crc = crci; - if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt()) + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable()) return crc_t10dif_generic(crc, p, len); if ((unsigned long)p & VMX_ALIGN_MASK) { diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 
a0c132bedfae..5ac3dead6952 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += preempt.h generic-y += rwsem.h generic-y += vtime.h generic-y += msi.h +generic-y += simd.h diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c index b00d6947e02f..603a62081994 100644 --- a/drivers/crypto/vmx/aes.c +++ b/drivers/crypto/vmx/aes.c @@ -23,9 +23,10 @@ #include #include #include -#include +#include #include #include +#include #include "aesp8-ppc.h" @@ -92,7 +93,7 @@ static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm); - if (in_interrupt()) { + if (!crypto_simd_usable()) { crypto_cipher_encrypt_one(ctx->fallback, dst, src); } else { preempt_disable(); @@ -109,7 +110,7 @@ static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm); - if (in_interrupt()) { + if (!crypto_simd_usable()) { crypto_cipher_decrypt_one(ctx->fallback, dst, src); } else { preempt_disable(); diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c index fbe882ef1bc5..a1a9a6f0d42c 100644 --- a/drivers/crypto/vmx/aes_cbc.c +++ b/drivers/crypto/vmx/aes_cbc.c @@ -23,9 +23,10 @@ #include #include #include -#include +#include #include #include +#include #include #include @@ -100,7 +101,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc, struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm)); - if (in_interrupt()) { + if (!crypto_simd_usable()) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback); skcipher_request_set_sync_tfm(req, ctx->fallback); skcipher_request_set_callback(req, desc->flags, NULL, NULL); @@ -139,7 +140,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc, struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm)); - if (in_interrupt()) { + if (!crypto_simd_usable()) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback); skcipher_request_set_sync_tfm(req, ctx->fallback); skcipher_request_set_callback(req, desc->flags, NULL, NULL); diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c index 214c69db9ebd..192a53512f5e 100644 --- a/drivers/crypto/vmx/aes_ctr.c +++ b/drivers/crypto/vmx/aes_ctr.c @@ -23,9 +23,10 @@ #include #include #include -#include +#include #include #include +#include #include #include @@ -119,7 +120,7 @@ static int p8_aes_ctr_crypt(struct blkcipher_desc *desc, struct p8_aes_ctr_ctx *ctx = crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm)); - if (in_interrupt()) { + if (!crypto_simd_usable()) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback); skcipher_request_set_sync_tfm(req, ctx->fallback); skcipher_request_set_callback(req, desc->flags, NULL, NULL); diff --git a/drivers/crypto/vmx/aes_xts.c b/drivers/crypto/vmx/aes_xts.c index 5bf4c3856650..00d412d811ae 100644 --- a/drivers/crypto/vmx/aes_xts.c +++ b/drivers/crypto/vmx/aes_xts.c @@ -23,9 +23,10 @@ #include #include #include -#include +#include #include #include +#include #include #include #include @@ -109,7 +110,7 @@ static int p8_aes_xts_crypt(struct blkcipher_desc *desc, struct p8_aes_xts_ctx *ctx = crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm)); - if (in_interrupt()) { + if (!crypto_simd_usable()) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback); skcipher_request_set_sync_tfm(req, ctx->fallback); skcipher_request_set_callback(req, desc->flags, NULL, NULL); diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c index 
dd8b8716467a..611ff591410e 100644 --- a/drivers/crypto/vmx/ghash.c +++ b/drivers/crypto/vmx/ghash.c @@ -23,16 +23,15 @@ #include #include #include -#include +#include #include #include #include #include #include +#include #include -#define IN_INTERRUPT in_interrupt() - void gcm_init_p8(u128 htable[16], const u64 Xi[2]); void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]); void gcm_ghash_p8(u64 Xi[2], const u128 htable[16], @@ -131,7 +130,7 @@ static int p8_ghash_update(struct shash_desc *desc, struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm)); struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc); - if (IN_INTERRUPT) { + if (!crypto_simd_usable()) { return crypto_shash_update(&dctx->fallback_desc, src, srclen); } else { @@ -182,7 +181,7 @@ static int p8_ghash_final(struct shash_desc *desc, u8 *out) struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm)); struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc); - if (IN_INTERRUPT) { + if (!crypto_simd_usable()) { return crypto_shash_final(&dctx->fallback_desc, out); } else { if (dctx->bytes) { -- cgit v1.2.3-59-g8ed1b From 5dd50aaeb1853ee0953b60fa6d1143d95429ae7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Nov 2018 17:40:31 +0000 Subject: Make anon_inodes unconditional Make the anon_inodes facility unconditional so that it can be used by core VFS code and pidfd code. Signed-off-by: David Howells Signed-off-by: Al Viro [christian@brauner.io: adapt commit message to mention pidfds] Signed-off-by: Christian Brauner --- arch/arm/kvm/Kconfig | 1 - arch/arm64/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/kvm/Kconfig | 1 - drivers/base/Kconfig | 1 - drivers/char/tpm/Kconfig | 1 - drivers/dma-buf/Kconfig | 1 - drivers/gpio/Kconfig | 1 - drivers/iio/Kconfig | 1 - drivers/infiniband/Kconfig | 1 - drivers/vfio/Kconfig | 1 - fs/Makefile | 2 +- fs/notify/fanotify/Kconfig | 1 - fs/notify/inotify/Kconfig | 1 - init/Kconfig | 10 ---------- 18 files changed, 1 insertion(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3f5320f46de2..f591026347a5 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on MMU && OF select PREEMPT_NOTIFIERS - select ANON_INODES select ARM_GIC select ARM_GIC_V3 select ARM_GIC_V3_ITS diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a3f85624313e..a67121d419a2 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM depends on OF select MMU_NOTIFIER select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 4528bc9c3cb1..eac25aef21e0 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM depends on MIPS_FP_SUPPORT select EXPORT_UASM select PREEMPT_NOTIFIERS - select ANON_INODES select KVM_GENERIC_DIRTYLOG_READ_PROTECT select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index bfdde04e4905..f53997a8ca62 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select SRCU diff --git 
a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 767453faacfc..1816ee48eadd 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_EVENTFD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5ad92419be19..7a70fb58b2d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -44,7 +44,6 @@ config X86 # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ANON_INODES select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 72fa955f4a15..fc042419e670 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,7 +27,6 @@ config KVM depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER - select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select IRQ_BYPASS_MANAGER diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 059700ea3521..03f067da12ee 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -174,7 +174,6 @@ source "drivers/base/regmap/Kconfig" config DMA_SHARED_BUFFER bool default n - select ANON_INODES select IRQ_WORK help This option enables the framework for buffer-sharing between diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 536e55d3919f..f3e4bc490cf0 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -157,7 +157,6 @@ config TCG_CRB config TCG_VTPM_PROXY tristate "VTPM Proxy Interface" depends on TCG_TPM - select ANON_INODES ---help--- This driver proxies for an emulated TPM (vTPM) running in userspace. A device /dev/vtpmx is provided that creates a device pair diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig index 2e5a0faa2cb1..3fc9c2efc583 100644 --- a/drivers/dma-buf/Kconfig +++ b/drivers/dma-buf/Kconfig @@ -3,7 +3,6 @@ menu "DMABUF options" config SYNC_FILE bool "Explicit Synchronization Framework" default n - select ANON_INODES select DMA_SHARED_BUFFER ---help--- The Sync File Framework adds explicit syncronization via diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3f50526a771f..0f91600c27ae 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -12,7 +12,6 @@ config ARCH_HAVE_CUSTOM_GPIO_H menuconfig GPIOLIB bool "GPIO Support" - select ANON_INODES help This enables GPIO support through the generic GPIO library. You only need to enable this, if you also want to enable diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig index d08aeb41cd07..1dec0fecb6ef 100644 --- a/drivers/iio/Kconfig +++ b/drivers/iio/Kconfig @@ -4,7 +4,6 @@ menuconfig IIO tristate "Industrial I/O support" - select ANON_INODES help The industrial I/O subsystem provides a unified framework for drivers for many different types of embedded sensors using a diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a1fb840de45d..d318bab25860 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -25,7 +25,6 @@ config INFINIBAND_USER_MAD config INFINIBAND_USER_ACCESS tristate "InfiniBand userspace access (verbs and CM)" - select ANON_INODES depends on MMU ---help--- Userspace InfiniBand access support. 
This enables the diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 9de5ed38da83..3798d77d131c 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -22,7 +22,6 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64) - select ANON_INODES help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/fs/Makefile b/fs/Makefile index 427fec226fae..35945f8139e6 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 735bfb2e9190..521dc91d2cb5 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -1,7 +1,6 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY - select ANON_INODES select EXPORTFS default n ---help--- diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index b981fc0c8379..0161c74e76e2 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,6 +1,5 @@ config INOTIFY_USER bool "Inotify support for userspace" - select ANON_INODES select FSNOTIFY default y ---help--- diff --git a/init/Kconfig b/init/Kconfig index 4592bf7997c0..be8f97e37a76 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1171,9 +1171,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION config SYSCTL bool -config ANON_INODES - bool - config HAVE_UID16 bool @@ -1378,14 +1375,12 @@ config HAVE_FUTEX_CMPXCHG config EPOLL bool "Enable eventpoll support" if EXPERT default y - select ANON_INODES help Disabling this option will cause the kernel to be built without support for epoll family of system calls. config SIGNALFD bool "Enable signalfd() system call" if EXPERT - select ANON_INODES default y help Enable the signalfd() system call that allows to receive signals @@ -1395,7 +1390,6 @@ config SIGNALFD config TIMERFD bool "Enable timerfd() system call" if EXPERT - select ANON_INODES default y help Enable the timerfd() system call that allows to receive timer @@ -1405,7 +1399,6 @@ config TIMERFD config EVENTFD bool "Enable eventfd() system call" if EXPERT - select ANON_INODES default y help Enable the eventfd() system call that allows to receive both @@ -1516,7 +1509,6 @@ config KALLSYMS_BASE_RELATIVE # syscall, maps, verifier config BPF_SYSCALL bool "Enable bpf() system call" - select ANON_INODES select BPF select IRQ_WORK default n @@ -1533,7 +1525,6 @@ config BPF_JIT_ALWAYS_ON config USERFAULTFD bool "Enable userfaultfd() system call" - select ANON_INODES depends on MMU help Enable the userfaultfd() system call that allows to intercept and @@ -1600,7 +1591,6 @@ config PERF_EVENTS bool "Kernel performance events and counters" default y if PROFILING depends on HAVE_PERF_EVENTS - select ANON_INODES select IRQ_WORK select SRCU help -- cgit v1.2.3-59-g8ed1b From 5ce5d8a5a4ae64b281bea7ae028c960f12acae21 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 22:51:49 +0200 Subject: asm-generic: generalize asm/sockios.h ia64, parisc and sparc just use a copy of the generic version of asm/sockios.h, and x86 is a redirect to the same file, so we can just let the header file be generated. 
Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- arch/ia64/include/uapi/asm/sockios.h | 21 --------------------- arch/parisc/include/uapi/asm/sockios.h | 14 -------------- arch/sparc/include/uapi/asm/sockios.h | 15 --------------- arch/x86/include/uapi/asm/sockios.h | 1 - 4 files changed, 51 deletions(-) delete mode 100644 arch/ia64/include/uapi/asm/sockios.h delete mode 100644 arch/parisc/include/uapi/asm/sockios.h delete mode 100644 arch/sparc/include/uapi/asm/sockios.h delete mode 100644 arch/x86/include/uapi/asm/sockios.h (limited to 'arch') diff --git a/arch/ia64/include/uapi/asm/sockios.h b/arch/ia64/include/uapi/asm/sockios.h deleted file mode 100644 index f27a12f95d20..000000000000 --- a/arch/ia64/include/uapi/asm/sockios.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_IA64_SOCKIOS_H -#define _ASM_IA64_SOCKIOS_H - -/* - * Socket-level I/O control calls. - * - * Based on . - * - * Modified 1998, 1999 - * David Mosberger-Tang , Hewlett-Packard Co - */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* _ASM_IA64_SOCKIOS_H */ diff --git a/arch/parisc/include/uapi/asm/sockios.h b/arch/parisc/include/uapi/asm/sockios.h deleted file mode 100644 index 66a3ba64d53f..000000000000 --- a/arch/parisc/include/uapi/asm/sockios.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __ARCH_PARISC_SOCKIOS__ -#define __ARCH_PARISC_SOCKIOS__ - -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif diff --git a/arch/sparc/include/uapi/asm/sockios.h b/arch/sparc/include/uapi/asm/sockios.h deleted file mode 100644 index 18a3ec14a847..000000000000 --- a/arch/sparc/include/uapi/asm/sockios.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_SPARC_SOCKIOS_H -#define _ASM_SPARC_SOCKIOS_H - -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* !(_ASM_SPARC_SOCKIOS_H) */ - diff --git a/arch/x86/include/uapi/asm/sockios.h b/arch/x86/include/uapi/asm/sockios.h deleted file mode 100644 index def6d4746ee7..000000000000 --- a/arch/x86/include/uapi/asm/sockios.h +++ /dev/null @@ -1 +0,0 @@ -#include -- cgit v1.2.3-59-g8ed1b From 0768e17073dc527ccd18ed5f96ce85f9985e9115 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 22:56:11 +0200 Subject: net: socket: implement 64-bit timestamps The 'timeval' and 'timespec' data structures used for socket timestamps are going to be redefined in user space based on 64-bit time_t in future versions of the C library to deal with the y2038 overflow problem, which breaks the ABI definition. Unlike many modern ioctl commands, SIOCGSTAMP and SIOCGSTAMPNS do not use the _IOR() macro to encode the size of the transferred data, so it remains ambiguous whether the application uses the old or new layout. 
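To make the size-encoding point concrete, a small userspace illustration, assuming a Linux system where linux/ioctl.h supplies _IOR() and _IOC_SIZE(); the 0x89/0x06 values match the SIOCGSTAMP_NEW definition added below:

#include <stdio.h>
#include <linux/ioctl.h>	/* _IOR(), _IOC_SIZE() */

int main(void)
{
	/* 0x89 is SOCK_IOC_TYPE; two 64-bit words match the new layout */
	unsigned int new_cmd = _IOR(0x89, 0x06, long long[2]);

	/* The encoded size (16 here) is what disambiguates the layouts:
	 * an 8-byte struct timeval would produce a different command
	 * number, so old and new binaries can be told apart. */
	printf("cmd=%#x size=%u\n", new_cmd, (unsigned int)_IOC_SIZE(new_cmd));
	return 0;
}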
The best workaround I could find is rather ugly: we redefine the command code based on the size of the respective data structure with a ternary operator. This lets it get evaluated as late as possible, hopefully after that structure is visible to the caller. We cannot use an #ifdef here, because linux/sockios.h might have been included before any libc header that could determine the size of time_t. The ioctl implementation now interprets the new command codes as always referring to the 64-bit structure on all architectures, while the old architecture specific command code still refers to the old architecture specific layout. The new command number is only used when they are actually different. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/sockios.h | 4 ++-- arch/mips/include/uapi/asm/sockios.h | 4 ++-- arch/sh/include/uapi/asm/sockios.h | 5 +++-- arch/xtensa/include/uapi/asm/sockios.h | 4 ++-- include/uapi/asm-generic/sockios.h | 4 ++-- include/uapi/linux/sockios.h | 21 +++++++++++++++++++++ net/socket.c | 24 ++++++++++++++++++------ 7 files changed, 50 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/uapi/asm/sockios.h b/arch/alpha/include/uapi/asm/sockios.h index ba287e4b01bf..af92bc27c3be 100644 --- a/arch/alpha/include/uapi/asm/sockios.h +++ b/arch/alpha/include/uapi/asm/sockios.h @@ -11,7 +11,7 @@ #define SIOCSPGRP _IOW('s', 8, pid_t) #define SIOCGPGRP _IOR('s', 9, pid_t) -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* _ASM_ALPHA_SOCKIOS_H */ diff --git a/arch/mips/include/uapi/asm/sockios.h b/arch/mips/include/uapi/asm/sockios.h index 5b40a88593fa..66f60234f290 100644 --- a/arch/mips/include/uapi/asm/sockios.h +++ b/arch/mips/include/uapi/asm/sockios.h @@ -21,7 +21,7 @@ #define SIOCSPGRP _IOW('s', 8, pid_t) #define SIOCGPGRP _IOR('s', 9, pid_t) -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* _ASM_SOCKIOS_H */ diff --git a/arch/sh/include/uapi/asm/sockios.h b/arch/sh/include/uapi/asm/sockios.h index 17313d2c3527..ef18a668456d 100644 --- a/arch/sh/include/uapi/asm/sockios.h +++ b/arch/sh/include/uapi/asm/sockios.h @@ -10,6 +10,7 @@ #define SIOCSPGRP _IOW('s', 8, pid_t) #define SIOCGPGRP _IOR('s', 9, pid_t) -#define SIOCGSTAMP _IOR('s', 100, struct timeval) /* Get stamp (timeval) */ -#define SIOCGSTAMPNS _IOR('s', 101, struct timespec) /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD _IOR('s', 100, struct timeval) /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD _IOR('s', 101, struct timespec) /* Get stamp (timespec) */ + #endif /* __ASM_SH_SOCKIOS_H */ diff --git a/arch/xtensa/include/uapi/asm/sockios.h b/arch/xtensa/include/uapi/asm/sockios.h index fb8ac3607189..1a1f58f4b75a 100644 --- a/arch/xtensa/include/uapi/asm/sockios.h +++ b/arch/xtensa/include/uapi/asm/sockios.h @@ -26,7 +26,7 @@ #define SIOCSPGRP _IOW('s', 8, pid_t) #define SIOCGPGRP _IOR('s', 9, pid_t) -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* _XTENSA_SOCKIOS_H */ diff --git
a/include/uapi/asm-generic/sockios.h b/include/uapi/asm-generic/sockios.h index 64f658c7cec2..44fa3ed70483 100644 --- a/include/uapi/asm-generic/sockios.h +++ b/include/uapi/asm-generic/sockios.h @@ -8,7 +8,7 @@ #define FIOGETOWN 0x8903 #define SIOCGPGRP 0x8904 #define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* __ASM_GENERIC_SOCKIOS_H */ diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index d393e9ed3964..7d1bccbbef78 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -19,6 +19,7 @@ #ifndef _LINUX_SOCKIOS_H #define _LINUX_SOCKIOS_H +#include #include /* Linux-specific socket ioctls */ @@ -27,6 +28,26 @@ #define SOCK_IOC_TYPE 0x89 +/* + * the timeval/timespec data structure layout is defined by libc, + * so we need to cover both possible versions on 32-bit. + */ +/* Get stamp (timeval) */ +#define SIOCGSTAMP_NEW _IOR(SOCK_IOC_TYPE, 0x06, long long[2]) +/* Get stamp (timespec) */ +#define SIOCGSTAMPNS_NEW _IOR(SOCK_IOC_TYPE, 0x07, long long[2]) + +#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) +/* on 64-bit and x32, avoid the ?: operator */ +#define SIOCGSTAMP SIOCGSTAMP_OLD +#define SIOCGSTAMPNS SIOCGSTAMPNS_OLD +#else +#define SIOCGSTAMP ((sizeof(struct timeval)) == 8 ? \ + SIOCGSTAMP_OLD : SIOCGSTAMP_NEW) +#define SIOCGSTAMPNS ((sizeof(struct timespec)) == 8 ? \ + SIOCGSTAMPNS_OLD : SIOCGSTAMPNS_NEW) +#endif + /* Routing table calls. */ #define SIOCADDRT 0x890B /* add routing table entry */ #define SIOCDELRT 0x890C /* delete routing table entry */ diff --git a/net/socket.c b/net/socket.c index ab624d42ead5..8d9d4fc7d962 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1164,14 +1164,24 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = open_related_ns(&net->ns, get_net_ns); break; - case SIOCGSTAMP: - case SIOCGSTAMPNS: + case SIOCGSTAMP_OLD: + case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = sock->ops->gettstamp(sock, argp, - cmd == SIOCGSTAMP, false); + cmd == SIOCGSTAMP_OLD, + !IS_ENABLED(CONFIG_64BIT)); + case SIOCGSTAMP_NEW: + case SIOCGSTAMPNS_NEW: + if (!sock->ops->gettstamp) { + err = -ENOIOCTLCMD; + break; + } + err = sock->ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_NEW, + false); break; default: err = sock_do_ioctl(net, sock, cmd, arg); @@ -3324,11 +3334,11 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCADDRT: case SIOCDELRT: return routing_ioctl(net, sock, cmd, argp); - case SIOCGSTAMP: - case SIOCGSTAMPNS: + case SIOCGSTAMP_OLD: + case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) return -ENOIOCTLCMD; - return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP, + return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !COMPAT_USE_64BIT_TIME); case SIOCBONDSLAVEINFOQUERY: @@ -3348,6 +3358,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCADDDLCI: case SIOCDELDLCI: case SIOCGSKNS: + case SIOCGSTAMP_NEW: + case SIOCGSTAMPNS_NEW: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: -- cgit v1.2.3-59-g8ed1b From a273fa386a947612a23b0d56dcfb8823662b8606 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 7 Feb 2019 16:16:51 +1100 Subject: powerpc/32: Add ppc_defconfig Add a generic 32-bit defconfig called ppc_defconfig. 
This means we'll have a defconfig matching "uname -m" for all cases. This config is mostly intended for build testing, but if someone wants to tweak it to get it booting on something, that would be fine too. Signed-off-by: Michael Ellerman Tested-by: Mathieu Malaterre Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 7de49889bd5d..a1b17bcd0b62 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -367,6 +367,10 @@ ppc32_allmodconfig: $(Q)$(MAKE) KCONFIG_ALLCONFIG=$(srctree)/arch/powerpc/configs/book3s_32.config \ -f $(srctree)/Makefile allmodconfig +PHONY += ppc_defconfig +ppc_defconfig: + $(call merge_into_defconfig,book3s_32.config,) + PHONY += ppc64le_allmodconfig ppc64le_allmodconfig: $(Q)$(MAKE) KCONFIG_ALLCONFIG=$(srctree)/arch/powerpc/configs/le.config \ -- cgit v1.2.3-59-g8ed1b From af5cd05de5dd38cf25d14ea4d30ae9b791d2420b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 7 Feb 2019 16:16:52 +1100 Subject: powerpc: Fix defconfig choice logic when cross compiling Our logic for choosing defconfig doesn't work well in some situations. For example, if you're on a ppc64le machine but you specify a non-empty CROSS_COMPILE in order to use a non-default toolchain, then defconfig will give you ppc64_defconfig (big endian): $ make CROSS_COMPILE=~/toolchains/gcc-8/bin/powerpc-linux- defconfig *** Default configuration is based on 'ppc64_defconfig' This is because we assume that CROSS_COMPILE being set means we can't be on a ppc machine and, rather than checking, we just default to ppc64_defconfig. We should just ignore CROSS_COMPILE and instead check the machine with uname; if it's one of ppc, ppc64 or ppc64le, then use that defconfig. If it's none of those, we fall back to ppc64_defconfig. Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index a1b17bcd0b62..64b8a5ae3b75 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -34,11 +34,10 @@ ifdef CONFIG_PPC_BOOK3S_32 KBUILD_CFLAGS += -mcpu=powerpc endif -ifeq ($(CROSS_COMPILE),) -KBUILD_DEFCONFIG := $(shell uname -m)_defconfig -else -KBUILD_DEFCONFIG := ppc64_defconfig -endif +# If we're on a ppc/ppc64/ppc64le machine use that defconfig, otherwise just use +# ppc64_defconfig because we have nothing better to go on. +uname := $(shell uname -m) +KBUILD_DEFCONFIG := $(if $(filter ppc%,$(uname)),$(uname),ppc64)_defconfig ifdef CONFIG_PPC64 new_nm := $(shell if $(NM) --help 2>&1 | grep -- '--synthetic' > /dev/null; then echo y; else echo n; fi) -- cgit v1.2.3-59-g8ed1b From 6c84f8c5cbfb4bf728f88296bc035c4a401c3423 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 7 Mar 2019 09:47:50 +0000 Subject: powerpc/highmem: Change BUG_ON() to WARN_ON() In arch/powerpc/mm/highmem.c, BUG_ON() is called only when CONFIG_DEBUG_HIGHMEM is selected; this means the BUG_ON() is not vital and can be replaced by a WARN_ON(). At the same time, use IS_ENABLED() instead of #ifdef to clean things up a bit.
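The shape of that cleanup in miniature, as an illustrative sketch rather than the patched kmap code itself:

#include <linux/types.h>
#include <linux/bug.h>		/* WARN_ON() */
#include <linux/kconfig.h>	/* IS_ENABLED() */

/* The check is now always compiled (so it cannot bit-rot inside an
 * #ifdef), IS_ENABLED() folds to a constant 0/1 so the compiler still
 * drops the test when CONFIG_DEBUG_HIGHMEM is off, and a failure
 * warns instead of killing the machine. */
static void debug_check_sketch(bool bad)
{
	WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && bad);
}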
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/highmem.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 82a0e37557a5..320c1672b2ae 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -43,9 +43,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte-idx))); -#endif + WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx))); __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); local_flush_tlb_page(NULL, vaddr); @@ -56,7 +54,6 @@ EXPORT_SYMBOL(kmap_atomic_prot); void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type __maybe_unused; if (vaddr < __fix_to_virt(FIX_KMAP_END)) { pagefault_enable(); @@ -64,14 +61,12 @@ void __kunmap_atomic(void *kvaddr) return; } - type = kmap_atomic_idx(); - -#ifdef CONFIG_DEBUG_HIGHMEM - { + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { + int type = kmap_atomic_idx(); unsigned int idx; idx = type + KM_TYPE_NR * smp_processor_id(); - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); /* * force other mappings to Oops if they'll try to access @@ -80,7 +75,6 @@ void __kunmap_atomic(void *kvaddr) pte_clear(&init_mm, vaddr, kmap_pte-idx); local_flush_tlb_page(NULL, vaddr); } -#endif kmap_atomic_idx_pop(); pagefault_enable(); -- cgit v1.2.3-59-g8ed1b From eea86aa4171d4960f0fcdc99dab358c224d53ffe Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 14 Mar 2019 23:54:53 +1100 Subject: powerpc/mm/64: Document the sizes of/sizes mapped by Pxx_INDEX_SIZE Add comments describing the size in bytes of the various levels of the page table tree, and the size of the virtual address space mapped by each level, to make it clear what the sizes are without having to also look up other definitions. The code that calculates the sizes actually uses sizeof(pgd_t) etc., so in theory these comments could skew vs the code, but the size of pgd_t etc. is unlikely to change very often. Signed-off-by: Michael Ellerman Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 8 ++++---- arch/powerpc/include/asm/book3s/64/hash-64k.h | 9 +++++---- arch/powerpc/include/asm/book3s/64/radix-4k.h | 9 +++++---- arch/powerpc/include/asm/book3s/64/radix-64k.h | 8 ++++---- 4 files changed, 18 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index cf5ba5254299..54fab723a8c7 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -2,10 +2,10 @@ #ifndef _ASM_POWERPC_BOOK3S_64_HASH_4K_H #define _ASM_POWERPC_BOOK3S_64_HASH_4K_H -#define H_PTE_INDEX_SIZE 9 -#define H_PMD_INDEX_SIZE 7 -#define H_PUD_INDEX_SIZE 9 -#define H_PGD_INDEX_SIZE 9 +#define H_PTE_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps: 2^9 x 4KB = 2MB +#define H_PMD_INDEX_SIZE 7 // size: 8B << 7 = 1KB, maps: 2^7 x 2MB = 256MB +#define H_PUD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps: 2^9 x 256MB = 128GB +#define H_PGD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps: 2^9 x 128GB = 64TB /* * Each context is 512TB. 
But on 4k we restrict our max TASK size to 64TB diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index f82ee8a3b561..81f4eb6e7da4 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -2,10 +2,11 @@ #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H -#define H_PTE_INDEX_SIZE 8 -#define H_PMD_INDEX_SIZE 10 -#define H_PUD_INDEX_SIZE 10 -#define H_PGD_INDEX_SIZE 8 +#define H_PTE_INDEX_SIZE 8 // size: 8B << 8 = 2KB, maps 2^8 x 64KB = 16MB +#define H_PMD_INDEX_SIZE 10 // size: 8B << 10 = 8KB, maps 2^10 x 16MB = 16GB +#define H_PUD_INDEX_SIZE 10 // size: 8B << 10 = 8KB, maps 2^10 x 16GB = 16TB +#define H_PGD_INDEX_SIZE 8 // size: 8B << 8 = 2KB, maps 2^8 x 16TB = 4PB + /* * Each context is 512TB size. SLB miss for first context/default context diff --git a/arch/powerpc/include/asm/book3s/64/radix-4k.h b/arch/powerpc/include/asm/book3s/64/radix-4k.h index 863c3e8286fb..d5f5ab73dc7f 100644 --- a/arch/powerpc/include/asm/book3s/64/radix-4k.h +++ b/arch/powerpc/include/asm/book3s/64/radix-4k.h @@ -5,10 +5,11 @@ /* * For 4K page size supported index is 13/9/9/9 */ -#define RADIX_PTE_INDEX_SIZE 9 /* 2MB huge page */ -#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ -#define RADIX_PUD_INDEX_SIZE 9 -#define RADIX_PGD_INDEX_SIZE 13 +#define RADIX_PTE_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 4K = 2MB +#define RADIX_PMD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 2MB = 1GB +#define RADIX_PUD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 1GB = 512GB +#define RADIX_PGD_INDEX_SIZE 13 // size: 8B << 13 = 64KB, maps 2^13 x 512GB = 4PB + /* * One fragment per per page */ diff --git a/arch/powerpc/include/asm/book3s/64/radix-64k.h b/arch/powerpc/include/asm/book3s/64/radix-64k.h index ccb78ca9d0c5..54e33828b0fb 100644 --- a/arch/powerpc/include/asm/book3s/64/radix-64k.h +++ b/arch/powerpc/include/asm/book3s/64/radix-64k.h @@ -5,10 +5,10 @@ /* * For 64K page size supported index is 13/9/9/5 */ -#define RADIX_PTE_INDEX_SIZE 5 /* 2MB huge page */ -#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ -#define RADIX_PUD_INDEX_SIZE 9 -#define RADIX_PGD_INDEX_SIZE 13 +#define RADIX_PTE_INDEX_SIZE 5 // size: 8B << 5 = 256B, maps 2^5 x 64K = 2MB +#define RADIX_PMD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 2MB = 1GB +#define RADIX_PUD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 1GB = 512GB +#define RADIX_PGD_INDEX_SIZE 13 // size: 8B << 13 = 64KB, maps 2^13 x 512GB = 4PB /* * We use a 256 byte PTE page fragment in radix -- cgit v1.2.3-59-g8ed1b From ff6d27823f619892ab96f7461764840e0d786b15 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 22 Mar 2019 04:24:37 +0000 Subject: powerpc: vdso: Make vdso32 installation conditional in vdso_install The 32-bit vDSO is not needed and not normally built for 64-bit little-endian configurations. However, the vdso_install target still builds and installs it. Add the same config condition as is normally used for the build. 
Fixes: e0d005916994 ("powerpc/vdso: Disable building the 32-bit VDSO ...") Signed-off-by: Ben Hutchings Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 64b8a5ae3b75..258ea6b2f2e7 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -409,7 +409,9 @@ vdso_install: ifdef CONFIG_PPC64 $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@ endif +ifdef CONFIG_VDSO32 $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso32 $@ +endif archclean: $(Q)$(MAKE) $(clean)=$(boot) -- cgit v1.2.3-59-g8ed1b From 308be6c7817c850b55c3b6ff4bf53c2427e274bc Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 26 Mar 2019 21:47:18 +0100 Subject: powerpc/embedded6xx: Make some functions static In commit cb9e4d10c448 ("[POWERPC] Add support for 750CL Holly board") new functions were added. Since most of these functions can be made static, make it so. Both holly_power_off and holly_halt functions were not changed since they are unused, making them static would have triggered the following warning (treated as error): arch/powerpc/platforms/embedded6xx/holly.c:244:13: error: 'holly_halt' defined but not used Silence the following warnings triggered using W=1: arch/powerpc/platforms/embedded6xx/holly.c:47:5: error: no previous prototype for 'holly_exclude_device' arch/powerpc/platforms/embedded6xx/holly.c:190:6: error: no previous prototype for 'holly_show_cpuinfo' arch/powerpc/platforms/embedded6xx/holly.c:196:17: error: no previous prototype for 'holly_restart' Signed-off-by: Mathieu Malaterre Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/embedded6xx/holly.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 0409714e8070..9d2eefef7b7b 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -44,7 +44,8 @@ #define HOLLY_PCI_CFG_PHYS 0x7c000000 -int holly_exclude_device(struct pci_controller *hose, u_char bus, u_char devfn) +static int holly_exclude_device(struct pci_controller *hose, u_char bus, + u_char devfn) { if (bus == 0 && PCI_SLOT(devfn) == 0) return PCIBIOS_DEVICE_NOT_FOUND; @@ -187,13 +188,13 @@ static void __init holly_init_IRQ(void) tsi108_write_reg(TSI108_MPIC_OFFSET + 0x30c, 0); } -void holly_show_cpuinfo(struct seq_file *m) +static void holly_show_cpuinfo(struct seq_file *m) { seq_printf(m, "vendor\t\t: IBM\n"); seq_printf(m, "machine\t\t: PPC750 GX/CL\n"); } -void __noreturn holly_restart(char *cmd) +static void __noreturn holly_restart(char *cmd) { __be32 __iomem *ocn_bar1 = NULL; unsigned long bar; -- cgit v1.2.3-59-g8ed1b From 62611c1e241878c538211d80cefef9de72030b56 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 26 Mar 2019 21:47:19 +0100 Subject: powerpc/embedded6xx: Remove unused functions holly_power_off and holly_halt Silence the following warnings triggered using W=1: arch/powerpc/platforms/embedded6xx/holly.c:236:6: error: no previous prototype for 'holly_power_off' arch/powerpc/platforms/embedded6xx/holly.c:243:6: error: no previous prototype for 'holly_halt' Signed-off-by: Mathieu Malaterre Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/embedded6xx/holly.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/embedded6xx/holly.c 
b/arch/powerpc/platforms/embedded6xx/holly.c index 9d2eefef7b7b..829bf3697dc9 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -234,18 +234,6 @@ static void __noreturn holly_restart(char *cmd) for (;;) ; } -void holly_power_off(void) -{ - local_irq_disable(); - /* No way to shut power off with software */ - for (;;) ; -} - -void holly_halt(void) -{ - holly_power_off(); -} - /* * Called very early, device-tree isn't unflattened */ -- cgit v1.2.3-59-g8ed1b From 56c46bba9bbfe229b4472a5be313c44c5b714a39 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 27 Mar 2019 14:35:54 +1100 Subject: powerpc/64: Fix booting large kernels with STRICT_KERNEL_RWX With STRICT_KERNEL_RWX enabled anything marked __init is placed at a 16M boundary. This is necessary so that it can be repurposed later with different permissions. However, in kernels with text larger than 16M, this pushes early_setup past 32M, out of range of the branch instruction. Fix this by setting the CTR and branching there instead. Fixes: 1e0fc9d1eb2b ("powerpc/Kconfig: Enable STRICT_KERNEL_RWX for some configs") Signed-off-by: Russell Currey [mpe: Fix it to work on BE by using DOTSYM()] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_64.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 3fad8d499767..5321a11c2835 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -968,7 +968,9 @@ start_here_multiplatform: /* Restore parameters passed from prom_init/kexec */ mr r3,r31 - bl early_setup /* also sets r13 and SPRG_PACA */ + LOAD_REG_ADDR(r12, DOTSYM(early_setup)) + mtctr r12 + bctrl /* also sets r13 and SPRG_PACA */ LOAD_REG_ADDR(r3, start_here_common) ld r4,PACAKMSR(r13) -- cgit v1.2.3-59-g8ed1b From c9d8dda42372dce00ac3a1c653bef7b8d2dbe3ce Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 28 Mar 2019 17:10:33 +0530 Subject: powerpc/pseries/mce: Improve array initialization. This is a follow-up to the patch that fixed the misleading print for TLB multihit due to a wrongly populated mc_err_types[] array. Convert all the static array initialization to '[x] = val' style for better readability of array indexing and to avoid any further confusion.
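For illustration, here is a minimal stand-alone C sketch of the same idiom (the array name and strings are borrowed from the patch below; the demo main() is not part of the kernel change):

#include <stdio.h>

/*
 * Positional initializers depend purely on entry order, so a skipped or
 * reordered value silently shifts every later index. Designated
 * initializers pin each string to its index explicitly.
 */
static const char * const mc_err_types[] = {
	[0] = "UE",
	[1] = "SLB",
	[2] = "ERAT",
	[3] = "Unknown",
	[4] = "TLB",
};

int main(void)
{
	/* Index 4 is unambiguously "TLB", whatever order the entries appear in. */
	printf("%s\n", mc_err_types[4]);
	return 0;
}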
Suggested-by: Michael Ellerman Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/ras.c | 52 ++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 452dcfd7e5dd..a25c2ac0c9c0 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -539,44 +539,44 @@ static void pseries_print_mce_info(struct pt_regs *regs, int disposition = rtas_error_disposition(errp); static const char * const initiators[] = { - "Unknown", - "CPU", - "PCI", - "ISA", - "Memory", - "Power Mgmt", + [0] = "Unknown", + [1] = "CPU", + [2] = "PCI", + [3] = "ISA", + [4] = "Memory", + [5] = "Power Mgmt", }; static const char * const mc_err_types[] = { - "UE", - "SLB", - "ERAT", - "Unknown", - "TLB", - "D-Cache", - "Unknown", - "I-Cache", + [0] = "UE", + [1] = "SLB", + [2] = "ERAT", + [3] = "Unknown", + [4] = "TLB", + [5] = "D-Cache", + [6] = "Unknown", + [7] = "I-Cache", }; static const char * const mc_ue_types[] = { - "Indeterminate", - "Instruction fetch", - "Page table walk ifetch", - "Load/Store", - "Page table walk Load/Store", + [0] = "Indeterminate", + [1] = "Instruction fetch", + [2] = "Page table walk ifetch", + [3] = "Load/Store", + [4] = "Page table walk Load/Store", }; /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ static const char * const mc_slb_types[] = { - "Parity", - "Multihit", - "Indeterminate", + [0] = "Parity", + [1] = "Multihit", + [2] = "Indeterminate", }; /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ static const char * const mc_soft_types[] = { - "Unknown", - "Parity", - "Multihit", - "Indeterminate", + [0] = "Unknown", + [1] = "Parity", + [2] = "Multihit", + [3] = "Indeterminate", }; if (!rtas_error_extended(errp)) { -- cgit v1.2.3-59-g8ed1b From 24c174bb23ebcbe4c3979855b220513f2b3a730f Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 11 Feb 2019 12:37:12 +0100 Subject: powerpc/configs: Enable CONFIG_USB_XHCI_HCD by default Recent versions of QEMU provide an XHCI device by default instead of an old-fashioned OHCI device: https://git.qemu.org/?p=qemu.git;a=commitdiff;h=57040d451315320b7d27 So to get the keyboard working in the graphical console again, we should now include XHCI support in the kernel by default, too. Signed-off-by: Thomas Huth Reviewed-by: David Gibson Acked-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/configs/pseries_defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index ea79c519863d..62e12f61a3b2 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -217,6 +217,7 @@ CONFIG_USB_MON=m CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_HCD_PPC_OF is not set CONFIG_USB_OHCI_HCD=y +CONFIG_USB_XHCI_HCD=y CONFIG_USB_STORAGE=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=m -- cgit v1.2.3-59-g8ed1b From f89bd8ba834e392ff614a7be9ee68c5679675122 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 9 Apr 2019 09:33:28 +0530 Subject: powerpc/mm/radix: Don't do SLB preload when using the radix MMU Add a radix_enabled() check to avoid SLB preload with radix translation.
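The shape of the change can be sketched in stand-alone C as follows; the stub bodies are assumptions for illustration only, and only the if (!radix_enabled()) guard mirrors the patch below:

#include <stdbool.h>

/* Illustrative stubs -- in the kernel these are real MMU helpers. */
static bool radix_enabled(void) { return true; }
static void preload_new_slb_context(unsigned long start, unsigned long sp)
{
	(void)start; (void)sp;	/* would warm up the SLB for the new thread */
}

static void start_thread_sketch(unsigned long start, unsigned long sp)
{
	/* SLBs only exist under the hash MMU, so skip the preload on radix. */
	if (!radix_enabled())
		preload_new_slb_context(start, sp);
}

int main(void)
{
	start_thread_sketch(0x10000000UL, 0x7ffff000UL);
	return 0;
}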
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index dd9e0d5386ee..f7b2e3b3db28 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1729,7 +1729,8 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ #ifdef CONFIG_PPC_BOOK3S_64 - preload_new_slb_context(start, sp); + if (!radix_enabled()) + preload_new_slb_context(start, sp); #endif #endif -- cgit v1.2.3-59-g8ed1b From f172acbfae1a78b1a3c775f78e8d0dcd15b9d768 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Wed, 13 Mar 2019 11:25:28 +0100 Subject: powerpc/mm: move warning from resize_hpt_for_hotplug() resize_hpt_for_hotplug() reports a warning when it cannot resize the hash page table ("Unable to resize hash page table to target order") but in some cases it's not a problem, and the message can make the user think something has not worked properly. This patch moves the warning to arch_remove_memory() to only report the problem when it is needed. Reviewed-by: David Gibson Signed-off-by: Laurent Vivier Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/sparsemem.h | 4 ++-- arch/powerpc/mm/hash_utils_64.c | 19 +++++++------------ arch/powerpc/mm/mem.c | 3 ++- arch/powerpc/platforms/pseries/lpar.c | 3 ++- 4 files changed, 13 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 68da49320592..3192d454a733 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -17,9 +17,9 @@ extern int create_section_mapping(unsigned long start, unsigned long end, int ni extern int remove_section_mapping(unsigned long start, unsigned long end); #ifdef CONFIG_PPC_BOOK3S_64 -extern void resize_hpt_for_hotplug(unsigned long new_mem_size); +extern int resize_hpt_for_hotplug(unsigned long new_mem_size); #else -static inline void resize_hpt_for_hotplug(unsigned long new_mem_size) { } +static inline int resize_hpt_for_hotplug(unsigned long new_mem_size) { return 0; } #endif #ifdef CONFIG_NUMA diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0a4f939a8161..c4c9610ce6e3 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -755,12 +755,12 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -void resize_hpt_for_hotplug(unsigned long new_mem_size) +int resize_hpt_for_hotplug(unsigned long new_mem_size) { unsigned target_hpt_shift; if (!mmu_hash_ops.resize_hpt) - return; + return 0; target_hpt_shift = htab_shift_for_mem_size(new_mem_size); @@ -772,16 +772,11 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size) * reduce unless the target shift is at least 2 below the * current shift */ - if ((target_hpt_shift > ppc64_pft_size) - || (target_hpt_shift < (ppc64_pft_size - 1))) { - int rc; - - rc = mmu_hash_ops.resize_hpt(target_hpt_shift); - if (rc && (rc != -ENODEV)) - printk(KERN_WARNING - "Unable to resize hash page table to target order %d: %d\n", - target_hpt_shift, rc); - } + if (target_hpt_shift > ppc64_pft_size || + target_hpt_shift < ppc64_pft_size - 1) + return mmu_hash_ops.resize_hpt(target_hpt_shift); + + return 0; } int hash__create_section_mapping(unsigned long start, unsigned
long end, int nid) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index f6787f90e158..3665602a9dfa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -161,7 +161,8 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size, */ vm_unmap_aliases(); - resize_hpt_for_hotplug(memblock_phys_mem_size()); + if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) + pr_warn("Hash collision while resizing HPT\n"); return ret; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f2a9f0adc2d3..1034ef1fe2b4 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -901,8 +901,10 @@ static int pseries_lpar_resize_hpt(unsigned long shift) break; case H_PARAMETER: + pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n"); return -EINVAL; case H_RESOURCE: + pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n"); return -EPERM; default: pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc); @@ -918,7 +920,6 @@ static int pseries_lpar_resize_hpt(unsigned long shift) if (rc != 0) { switch (state.commit_rc) { case H_PTEG_FULL: - pr_warn("Hash collision while resizing HPT\n"); return -ENOSPC; default: -- cgit v1.2.3-59-g8ed1b From bff25143da0d623a1765bf78dbc82044e46da5a4 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 7 Mar 2019 09:40:31 -0500 Subject: powerpc/mm: Silence unused-but-set-variable warnings pte_unmap() compiles away on some powerpc platforms, so silence the warnings below by making it a static inline function. mm/memory.c: In function 'copy_pte_range': mm/memory.c:820:24: warning: variable 'orig_dst_pte' set but not used mm/memory.c:820:9: warning: variable 'orig_src_pte' set but not used mm/madvise.c: In function 'madvise_free_pte_range': mm/madvise.c:318:9: warning: variable 'orig_pte' set but not used mm/swap_state.c: In function 'swap_ra_info': mm/swap_state.c:634:15: warning: variable 'orig_pte' set but not used Suggested-by: Christophe Leroy Signed-off-by: Qian Cai Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgtable.h | 3 ++- arch/powerpc/include/asm/nohash/64/pgtable.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 581f91be9dd4..e3d18b3f6e5d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -992,7 +992,8 @@ extern struct page *pgd_page(pgd_t pgd); (((pte_t *) pmd_page_vaddr(*(dir))) + pte_index(addr)) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while(0) + +static inline void pte_unmap(pte_t *pte) { } /* to find an entry in a kernel page-table-directory */ /* This now only contains the vmalloc pages */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index e77ed9761632..0384a3302fb6 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -205,7 +205,8 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val) (((pte_t *) pmd_page_vaddr(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while(0) + +static inline void pte_unmap(pte_t *pte) { } /* to find an entry in a kernel page-table-directory */ /* This now only contains the 
vmalloc pages */ -- cgit v1.2.3-59-g8ed1b From c05f57fdc34a3d00c9ee28a35772e9d11b5ce100 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 6 Apr 2019 22:48:08 -0400 Subject: powerpc/pseries/iommu: Fix set but not used values The commit b7d6bf4fdd47 ("powerpc/pseries/pci: Remove obsolete SW invalidate") left two variables unused. arch/powerpc/platforms/pseries/iommu.c:108:17: warning: variable 'tces' set but not used __be64 *tcep, *tces; ^~~~ arch/powerpc/platforms/pseries/iommu.c:132:17: warning: variable 'tces' set but not used __be64 *tcep, *tces; ^~~~ Also, the commit 68c0449ea16d ("powerpc/pseries/iommu: Use memory@ nodes in max RAM address calculation") set "ranges" in ddw_memory_hotplug_max() but never uses it. arch/powerpc/platforms/pseries/iommu.c: In function 'ddw_memory_hotplug_max': arch/powerpc/platforms/pseries/iommu.c:948:7: warning: variable 'ranges' set but not used int ranges, n_mem_addr_cells, n_mem_size_cells, len; ^~~~~~ Signed-off-by: Qian Cai Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/iommu.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 36eb1ddbac69..03bbb299320e 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -105,7 +105,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, unsigned long attrs) { u64 proto_tce; - __be64 *tcep, *tces; + __be64 *tcep; u64 rpn; proto_tce = TCE_PCI_READ; // Read allowed @@ -113,7 +113,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; - tces = tcep = ((__be64 *)tbl->it_base) + index; + tcep = ((__be64 *)tbl->it_base) + index; while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ @@ -129,9 +129,9 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) { - __be64 *tcep, *tces; + __be64 *tcep; - tces = tcep = ((__be64 *)tbl->it_base) + index; + tcep = ((__be64 *)tbl->it_base) + index; while (npages--) *(tcep++) = 0; @@ -945,7 +945,7 @@ static phys_addr_t ddw_memory_hotplug_max(void) for_each_node_by_type(memory, "memory") { unsigned long start, size; - int ranges, n_mem_addr_cells, n_mem_size_cells, len; + int n_mem_addr_cells, n_mem_size_cells, len; const __be32 *memcell_buf; memcell_buf = of_get_property(memory, "reg", &len); @@ -955,9 +955,6 @@ static phys_addr_t ddw_memory_hotplug_max(void) n_mem_addr_cells = of_n_addr_cells(memory); n_mem_size_cells = of_n_size_cells(memory); - /* ranges in cell */ - ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); - start = of_read_number(memcell_buf, n_mem_addr_cells); memcell_buf += n_mem_addr_cells; size = of_read_number(memcell_buf, n_mem_size_cells); -- cgit v1.2.3-59-g8ed1b From e663e1e06089773cdab03023563aead65cfed042 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 6 Apr 2019 21:54:47 -0400 Subject: powerpc/pseries/pmem: Fix a set but not used value The commit 4c5d87db4978 ("powerpc/pseries: PAPR persistent memory support") set a local variable "count" in dlpar_hp_pmem() but never uses it.
arch/powerpc/platforms/pseries/pmem.c: In function 'dlpar_hp_pmem': arch/powerpc/platforms/pseries/pmem.c:109:6: warning: variable 'count' set but not used Signed-off-by: Qian Cai Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/pmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c index 27f0a915c8a9..f860a897a9e0 100644 --- a/arch/powerpc/platforms/pseries/pmem.c +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -106,7 +106,7 @@ static ssize_t pmem_drc_remove_node(u32 drc_index) int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) { - u32 count, drc_index; + u32 drc_index; int rc; /* slim chance, but we might get a hotplug event while booting */ @@ -123,7 +123,6 @@ int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) return -EINVAL; } - count = hp_elog->_drc_u.drc_count; drc_index = hp_elog->_drc_u.drc_index; lock_device_hotplug(); -- cgit v1.2.3-59-g8ed1b From 4df2cb633b5b22ba152511f1a55e718efca6c0d9 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 23 Feb 2019 14:20:34 +0100 Subject: powerpc/83xx: Add missing of_node_put() after of_device_is_available() Add an of_node_put() when a tested device node is not available. Fixes: c026c98739c7e ("powerpc/83xx: Do not configure or probe disabled FSL DR USB controllers") Signed-off-by: Julia Lawall Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/83xx/usb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/83xx/usb.c b/arch/powerpc/platforms/83xx/usb.c index 5c31d8292d3b..e7c2c3fb011a 100644 --- a/arch/powerpc/platforms/83xx/usb.c +++ b/arch/powerpc/platforms/83xx/usb.c @@ -221,8 +221,10 @@ int mpc837x_usb_cfg(void) int ret = 0; np = of_find_compatible_node(NULL, NULL, "fsl-usb2-dr"); - if (!np || !of_device_is_available(np)) + if (!np || !of_device_is_available(np)) { + of_node_put(np); return -ENODEV; + } prop = of_get_property(np, "phy_type", NULL); if (!prop || (strcmp(prop, "ulpi") && strcmp(prop, "serial"))) { -- cgit v1.2.3-59-g8ed1b From 7f177f9810ada8ec2e8b378eddbe2d91fda79c9b Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Mon, 15 Apr 2019 15:35:44 +0530 Subject: powerpc/pseries: hwpoison the pages upon hitting UE Add support to hwpoison the pages upon hitting a machine check exception. This patch queues the address where the UE was hit to a per-CPU array and schedules work to plumb it into the memory poisoning infrastructure.
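A rough stand-alone sketch of that queueing scheme (single-CPU simplification; MAX_MC_EVT is reused from the patch below, while handle_page() is a stand-in for memory_failure()):

#include <stdio.h>

#define MAX_MC_EVT 10

static unsigned long ue_paddr[MAX_MC_EVT];
static int ue_count;

/* Called from machine check context: only record the failing address. */
static void queue_ue_paddr(unsigned long paddr)
{
	if (ue_count >= MAX_MC_EVT)
		return;				/* on overflow, drop the event */
	ue_paddr[ue_count++] = paddr;
}

/* Stand-in for memory_failure(paddr >> PAGE_SHIFT, 0). */
static void handle_page(unsigned long paddr)
{
	printf("poisoning page at 0x%lx\n", paddr);
}

/* Deferred work: drain the queue outside the exception path. */
static void hwpoison_work_fn(void)
{
	while (ue_count > 0)
		handle_page(ue_paddr[--ue_count]);
}

int main(void)
{
	queue_ue_paddr(0x2000);
	queue_ue_paddr(0x3000);
	hwpoison_work_fn();
	return 0;
}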
Reviewed-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar [mpe: Combine #ifdefs, drop PPC_BIT8(), and empty inline stub] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 1 + arch/powerpc/kernel/mce_power.c | 2 +- arch/powerpc/platforms/pseries/ras.c | 83 ++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 17996bc9382b..ad47fa865324 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -210,6 +210,7 @@ extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest); +unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 6b800eec31f2..367fbfa2e835 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -36,7 +36,7 @@ * Convert an address related to an mm to a PFN. NOTE: we are in real * mode, we could potentially race with page table updates. */ -static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) +unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { pte_t *ptep; unsigned long flags; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index a25c2ac0c9c0..c97d15352f9f 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -707,6 +707,87 @@ out: return disposition; } +#ifdef CONFIG_MEMORY_FAILURE + +static DEFINE_PER_CPU(int, rtas_ue_count); +static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]); + +#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 +#define UE_LOGICAL_ADDR_PROVIDED 0x20 + + +static void pseries_hwpoison_work_fn(struct work_struct *work) +{ + unsigned long paddr; + int index; + + while (__this_cpu_read(rtas_ue_count) > 0) { + index = __this_cpu_read(rtas_ue_count) - 1; + paddr = __this_cpu_read(rtas_ue_paddr[index]); + memory_failure(paddr >> PAGE_SHIFT, 0); + __this_cpu_dec(rtas_ue_count); + } +} + +static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn); + +static void queue_ue_paddr(unsigned long paddr) +{ + int index; + + index = __this_cpu_inc_return(rtas_ue_count) - 1; + if (index >= MAX_MC_EVT) { + __this_cpu_dec(rtas_ue_count); + return; + } + this_cpu_write(rtas_ue_paddr[index], paddr); + schedule_work(&hwpoison_work); +} + +static void pseries_do_memory_failure(struct pt_regs *regs, + struct pseries_mc_errorlog *mce_log) +{ + unsigned long paddr; + + if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { + paddr = be64_to_cpu(mce_log->logical_address); + } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { + unsigned long pfn; + + pfn = addr_to_pfn(regs, + be64_to_cpu(mce_log->effective_address)); + if (pfn == ULONG_MAX) + return; + paddr = pfn << PAGE_SHIFT; + } else { + return; + } + queue_ue_paddr(paddr); +} + +static void pseries_process_ue(struct pt_regs *regs, + struct rtas_error_log *errp) +{ + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + + if (!rtas_error_extended(errp)) + return; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + return; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + + if 
(mce_log->error_type == MC_ERROR_TYPE_UE) + pseries_do_memory_failure(regs, mce_log); +} +#else +static inline void pseries_process_ue(struct pt_regs *regs, + struct rtas_error_log *errp) { } +#endif /*CONFIG_MEMORY_FAILURE */ + /* * Process MCE rtas errlog event. */ @@ -765,6 +846,8 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) recovered = 1; } + pseries_process_ue(regs, err); + /* Queue irq work to log this rtas event later. */ irq_work_queue(&mce_errlog_process_work); -- cgit v1.2.3-59-g8ed1b From cc76404feaed597bb4f5234d34d3f49e2d1139bf Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Tue, 26 Mar 2019 18:29:51 +0800 Subject: powerpc/8xx: Fix possible device node reference leak The call to of_find_compatible_node() returns a node pointer with its refcount incremented, so it must be explicitly decremented after the last usage. irq_domain_add_linear() also calls of_node_get() to increase the refcount, so the irq_domain will not be affected when the node is released. Detected by Coccinelle. Fixes: a8db8cf0d894 ("irq_domain: Replace irq_alloc_host() with revmap-specific initializers") Signed-off-by: Wen Yang Suggested-by: Christophe Leroy Suggested-by: Michael Ellerman Reviewed-by: Peng Hao Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/8xx/pic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/8xx/pic.c b/arch/powerpc/platforms/8xx/pic.c index 8d5a25d43ef3..e9617d35fd1f 100644 --- a/arch/powerpc/platforms/8xx/pic.c +++ b/arch/powerpc/platforms/8xx/pic.c @@ -153,10 +153,9 @@ int mpc8xx_pic_init(void) if (mpc8xx_pic_host == NULL) { printk(KERN_ERR "MPC8xx PIC: failed to allocate irq host!\n"); ret = -ENOMEM; - goto out; } - return 0; + ret = 0; out: of_node_put(np); return ret; -- cgit v1.2.3-59-g8ed1b From 6917735e8f905da1f62ccdf62830b185524835c7 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Sat, 23 Mar 2019 18:20:55 +0530 Subject: powerpc: Remove duplicate headers Remove duplicate header inclusions. Signed-off-by: Jagadeesh Pagadala Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 1 - arch/powerpc/lib/code-patching.c | 1 - arch/powerpc/mm/numa.c | 1 - 3 files changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc0503ef9c9c..6ef32472ee1d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 506413a2c25e..587ff9788ab0 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f976676004ad..f3ee0e18edd6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 80d04b7fabe161a23d143b3bfcfca1b002c23da1 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 21 Mar 2019 10:42:22 +0000 Subject: powerpc/crypto: Use cheaper random numbers for crc-vpmsum self-test This code was filling a 64K buffer from /dev/urandom in order to compute a CRC over (on average half of) it by two different methods, comparing the CRCs, and repeating.
This is not a remotely security-critical application, so use the far faster and cheaper prandom_u32() generator. And, while we're at it, only fill as much of the buffer as we plan to use. Signed-off-by: George Spelvin Acked-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/crypto/crc-vpmsum_test.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c index 0153a9c6f4af..98ea4f4d3dde 100644 --- a/arch/powerpc/crypto/crc-vpmsum_test.c +++ b/arch/powerpc/crypto/crc-vpmsum_test.c @@ -78,16 +78,12 @@ static int __init crc_test_init(void) pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations); for (i=0; i Date: Thu, 14 Mar 2019 15:27:27 +1100 Subject: powerpc/powernv: Squash sparse warnings in opal-call.c sparse complains a lot about opal-call.c: arch/powerpc/platforms/powernv/opal-call.c:128:1: warning: symbol 'opal_invalid_call' was not declared. Should it be static? arch/powerpc/platforms/powernv/opal-call.c:129:1: warning: symbol 'opal_console_write' was not declared. Should it be static? arch/powerpc/platforms/powernv/opal-call.c:130:1: warning: symbol 'opal_console_read' was not declared. Should it be static? Those symbols are forward declared in opal.h, but we can't include that because the function signatures in opal.h are different. So instead, just add an extra forward declaration to the OPAL_CALL macro to shut sparse up. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-call.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index daad8c45c8e7..c53773a149e0 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -120,6 +120,8 @@ static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, } #define OPAL_CALL(name, opcode) \ +int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ + int64_t a4, int64_t a5, int64_t a6, int64_t a7); \ int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ int64_t a4, int64_t a5, int64_t a6, int64_t a7) \ { \ -- cgit v1.2.3-59-g8ed1b From 2d4d9b308f8f8dec68f6dbbff18c68ec7c6bd26f Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 18 Apr 2019 13:56:57 -0500 Subject: powerpc/numa: improve control of topology updates When booted with "topology_updates=no", or when "off" is written to /proc/powerpc/topology_updates, NUMA reassignments are inhibited for PRRN and VPHN events. However, migration and suspend unconditionally re-enable reassignments via start_topology_update(). This is incoherent. Check the topology_updates_enabled flag in start/stop_topology_update() so that callers of those APIs need not be aware of whether reassignments are enabled. This allows the administrative decision on reassignments to remain in force across migrations and suspensions. 
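A hedged C sketch of the resulting invariant (the flag name matches the patch; the function bodies are simplified placeholders):

#include <stdbool.h>
#include <stdio.h>

static bool topology_updates_enabled;	/* administrative policy, off here */

static int start_topology_update(void)
{
	/* Policy is enforced here, so migration/suspend callers need no checks. */
	if (!topology_updates_enabled)
		return 0;
	puts("arming PRRN/VPHN updates");
	return 0;
}

static int stop_topology_update(void)
{
	if (!topology_updates_enabled)
		return 0;
	puts("disarming PRRN/VPHN updates");
	return 0;
}

int main(void)
{
	start_topology_update();	/* no-op while disabled */
	topology_updates_enabled = true;
	start_topology_update();	/* now takes effect */
	stop_topology_update();
	return 0;
}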
Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f3ee0e18edd6..952ada44df62 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1497,6 +1497,9 @@ int start_topology_update(void) { int rc = 0; + if (!topology_updates_enabled) + return 0; + if (firmware_has_feature(FW_FEATURE_PRRN)) { if (!prrn_enabled) { prrn_enabled = 1; @@ -1530,6 +1533,9 @@ int stop_topology_update(void) { int rc = 0; + if (!topology_updates_enabled) + return 0; + if (prrn_enabled) { prrn_enabled = 0; #ifdef CONFIG_SMP @@ -1587,11 +1593,13 @@ static ssize_t topology_write(struct file *file, const char __user *buf, kbuf[read_len] = '\0'; - if (!strncmp(kbuf, "on", 2)) + if (!strncmp(kbuf, "on", 2)) { + topology_updates_enabled = true; start_topology_update(); - else if (!strncmp(kbuf, "off", 3)) + } else if (!strncmp(kbuf, "off", 3)) { stop_topology_update(); - else + topology_updates_enabled = false; + } else return -EINVAL; return count; @@ -1606,9 +1614,7 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { - /* Do not poll for changes if disabled at boot */ - if (topology_updates_enabled) - start_topology_update(); + start_topology_update(); if (vphn_enabled) topology_schedule_update(); -- cgit v1.2.3-59-g8ed1b From 558f86493df09f68f79fe056d9028d317a3ce8ab Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 18 Apr 2019 13:56:58 -0500 Subject: powerpc/numa: document topology_updates_enabled, disable by default Changing the NUMA associations for CPUs and memory at runtime is basically unsupported by the core mm, scheduler etc. We see all manner of crashes, warnings and instability when the pseries code tries to do this. Disable this behavior by default, and document the switch a bit. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 952ada44df62..6ef36d553cde 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -907,16 +907,22 @@ static int __init early_numa(char *p) } early_param("numa", early_numa); -static bool topology_updates_enabled = true; +/* + * The platform can inform us through one of several mechanisms + * (post-migration device tree updates, PRRN or VPHN) that the NUMA + * assignment of a resource has changed. This controls whether we act + * on that. Disabled by default. + */ +static bool topology_updates_enabled; static int __init early_topology_updates(char *p) { if (!p) return 0; - if (!strcmp(p, "off")) { - pr_info("Disabling topology updates\n"); - topology_updates_enabled = false; + if (!strcmp(p, "on")) { + pr_warn("Caution: enabling topology updates\n"); + topology_updates_enabled = true; } return 0; -- cgit v1.2.3-59-g8ed1b From c1fe190c06723322f2dfac31d3b982c581e434ef Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 1 Apr 2019 17:03:12 +1100 Subject: powerpc: Add force enable of DAWR on P9 option This adds a flag so that the DAWR can be enabled on P9 via: echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous The DAWR was previously force disabled on POWER9 in: 9654153158 powerpc: Disable DAWR in the base POWER9 CPU features Also see Documentation/powerpc/DAWR-POWER9.txt This is a dangerous setting, USE AT YOUR OWN RISK. 
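For completeness, the same toggle driven from C rather than the shell, as a small user-space sketch (the debugfs path is the one introduced by this patch; error handling is trimmed):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/powerpc/dawr_enable_dangerous", O_WRONLY);

	if (fd < 0)
		return 1;		/* debugfs not mounted, or file absent */
	if (write(fd, "Y", 1) != 1) {	/* 'Y' force-enables, 'N' disables */
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}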
Some users may not care about a bad user crashing their box (ie. single user/desktop systems) and really want the DAWR. This allows them to force enable DAWR. This flag can also be used to disable DAWR access. Once this is cleared, all DAWR access should be cleared immediately and your machine once again safe from crashing. Userspace may get confused by toggling this. If DAWR is force enabled/disabled between getting the number of breakpoints (via PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an inconsistent view of what's available. Similarly for guests. For the DAWR to be enabled in a KVM guest, the DAWR needs to be force enabled in the host AND the guest. For this reason, this won't work on POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the dawr_enable_dangerous file will fail if the hypervisor doesn't support writing the DAWR. To double check the DAWR is working, run this kernel selftest: tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c Any errors/failures/skips mean something is wrong. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- Documentation/powerpc/DAWR-POWER9.txt | 32 +++++++++++++++++ arch/powerpc/include/asm/hw_breakpoint.h | 8 +++++ arch/powerpc/kernel/hw_breakpoint.c | 62 +++++++++++++++++++++++++++++++- arch/powerpc/kernel/process.c | 9 ++--- arch/powerpc/kernel/ptrace.c | 3 +- arch/powerpc/kvm/book3s_hv.c | 3 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 23 ++++++------ 7 files changed, 123 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/Documentation/powerpc/DAWR-POWER9.txt b/Documentation/powerpc/DAWR-POWER9.txt index 2feaa6619658..bdec03650941 100644 --- a/Documentation/powerpc/DAWR-POWER9.txt +++ b/Documentation/powerpc/DAWR-POWER9.txt @@ -56,3 +56,35 @@ POWER9. Loads and stores to the watchpoint locations will not be trapped in GDB. The watchpoint is remembered, so if the guest is migrated back to the POWER8 host, it will start working again. +Force enabling the DAWR +============================= +Kernels (since ~v5.2) have an option to force enable the DAWR via: + + echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous + +This enables the DAWR even on POWER9. + +This is a dangerous setting, USE AT YOUR OWN RISK. + +Some users may not care about a bad user crashing their box +(ie. single user/desktop systems) and really want the DAWR. This +allows them to force enable DAWR. + +This flag can also be used to disable DAWR access. Once this is +cleared, all DAWR access should be cleared immediately and your +machine once again safe from crashing. + +Userspace may get confused by toggling this. If DAWR is force +enabled/disabled between getting the number of breakpoints (via +PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an +inconsistent view of what's available. Similarly for guests. + +For the DAWR to be enabled in a KVM guest, the DAWR needs to be force +enabled in the host AND the guest. For this reason, this won't work on +POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the +dawr_enable_dangerous file will fail if the hypervisor doesn't support +writing the DAWR. + +To double check the DAWR is working, run this kernel selftest: + tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c +Any errors/failures/skips mean something is wrong. 
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index ece4dc89c90b..0fe8c1e46bbc 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -90,10 +90,18 @@ static inline void hw_breakpoint_disable(void) extern void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs); int hw_breakpoint_handler(struct die_args *args); +extern int set_dawr(struct arch_hw_breakpoint *brk); +extern bool dawr_force_enable; +static inline bool dawr_enabled(void) +{ + return dawr_force_enable; +} + #else /* CONFIG_HAVE_HW_BREAKPOINT */ static inline void hw_breakpoint_disable(void) { } static inline void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) { } +static inline bool dawr_enabled(void) { return false; } #endif /* CONFIG_HAVE_HW_BREAKPOINT */ #endif /* __KERNEL__ */ #endif /* _PPC_BOOK3S_64_HW_BREAKPOINT_H */ diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index fec8a6773119..da307dd93ee3 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -29,11 +29,15 @@ #include #include #include +#include +#include #include #include #include #include +#include +#include #include /* @@ -174,7 +178,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, if (!ppc_breakpoint_available()) return -ENODEV; length_max = 8; /* DABR */ - if (cpu_has_feature(CPU_FTR_DAWR)) { + if (dawr_enabled()) { length_max = 512 ; /* 64 doublewords */ /* DAWR region can't cross 512 boundary */ if ((attr->bp_addr >> 9) != @@ -376,3 +380,59 @@ void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } + +bool dawr_force_enable; +EXPORT_SYMBOL_GPL(dawr_force_enable); + +static ssize_t dawr_write_file_bool(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct arch_hw_breakpoint null_brk = {0, 0, 0}; + size_t rc; + + /* Send error to user if they hypervisor won't allow us to write DAWR */ + if ((!dawr_force_enable) && + (firmware_has_feature(FW_FEATURE_LPAR)) && + (set_dawr(&null_brk) != H_SUCCESS)) + return -1; + + rc = debugfs_write_file_bool(file, user_buf, count, ppos); + if (rc) + return rc; + + /* If we are clearing, make sure all CPUs have the DAWR cleared */ + if (!dawr_force_enable) + smp_call_function((smp_call_func_t)set_dawr, &null_brk, 0); + + return rc; +} + +static const struct file_operations dawr_enable_fops = { + .read = debugfs_read_file_bool, + .write = dawr_write_file_bool, + .open = simple_open, + .llseek = default_llseek, +}; + +static int __init dawr_force_setup(void) +{ + dawr_force_enable = false; + + if (cpu_has_feature(CPU_FTR_DAWR)) { + /* Don't setup sysfs file for user control on P8 */ + dawr_force_enable = true; + return 0; + } + + if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) { + /* Turn DAWR off by default, but allow admin to turn it on */ + dawr_force_enable = false; + debugfs_create_file_unsafe("dawr_enable_dangerous", 0600, + powerpc_debugfs_root, + &dawr_force_enable, + &dawr_enable_fops); + } + return 0; +} +arch_initcall(dawr_force_setup); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index dd9e0d5386ee..225705aac814 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -784,7 +785,7 @@ static inline int set_dabr(struct arch_hw_breakpoint *brk) return __set_dabr(dabr, dabrx); } -static inline int set_dawr(struct arch_hw_breakpoint *brk) 
+int set_dawr(struct arch_hw_breakpoint *brk) { unsigned long dawr, dawrx, mrd; @@ -816,7 +817,7 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk) { memcpy(this_cpu_ptr(¤t_brk), brk, sizeof(*brk)); - if (cpu_has_feature(CPU_FTR_DAWR)) + if (dawr_enabled()) // Power8 or later set_dawr(brk); else if (!cpu_has_feature(CPU_FTR_ARCH_207S)) @@ -830,8 +831,8 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk) /* Check if we have DAWR or DABR hardware */ bool ppc_breakpoint_available(void) { - if (cpu_has_feature(CPU_FTR_DAWR)) - return true; /* POWER8 DAWR */ + if (dawr_enabled()) + return true; /* POWER8 DAWR or POWER9 forced DAWR */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) return false; /* POWER9 with DAWR disabled */ /* DABR: Everything but POWER8 and POWER9 */ diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index d9ac7d94656e..684b0b315c32 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -43,6 +43,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -3088,7 +3089,7 @@ long arch_ptrace(struct task_struct *child, long request, dbginfo.sizeof_condition = 0; #ifdef CONFIG_HAVE_HW_BREAKPOINT dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE; - if (cpu_has_feature(CPU_FTR_DAWR)) + if (dawr_enabled()) dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR; #else dbginfo.features = 0; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 06964350b97a..0fab0a201027 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -74,6 +74,7 @@ #include #include #include +#include #include "book3s.h" @@ -3374,7 +3375,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_PURR, vcpu->arch.purr); mtspr(SPRN_SPURR, vcpu->arch.spurr); - if (cpu_has_feature(CPU_FTR_DAWR)) { + if (dawr_enabled()) { mtspr(SPRN_DAWR, vcpu->arch.dawr); mtspr(SPRN_DAWRX, vcpu->arch.dawrx); } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 3a5e719ef032..139027c62dc2 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -822,18 +822,21 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_IAMR, r5 mtspr SPRN_PSPB, r6 mtspr SPRN_FSCR, r7 - ld r5, VCPU_DAWR(r4) - ld r6, VCPU_DAWRX(r4) - ld r7, VCPU_CIABR(r4) - ld r8, VCPU_TAR(r4) /* * Handle broken DAWR case by not writing it. This means we * can still store the DAWR register for migration. 
*/ -BEGIN_FTR_SECTION + LOAD_REG_ADDR(r5, dawr_force_enable) + lbz r5, 0(r5) + cmpdi r5, 0 + beq 1f + ld r5, VCPU_DAWR(r4) + ld r6, VCPU_DAWRX(r4) mtspr SPRN_DAWR, r5 mtspr SPRN_DAWRX, r6 -END_FTR_SECTION_IFSET(CPU_FTR_DAWR) +1: + ld r7, VCPU_CIABR(r4) + ld r8, VCPU_TAR(r4) mtspr SPRN_CIABR, r7 mtspr SPRN_TAR, r8 ld r5, VCPU_IC(r4) @@ -2513,11 +2516,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) blr 2: -BEGIN_FTR_SECTION - /* POWER9 with disabled DAWR */ + LOAD_REG_ADDR(r11, dawr_force_enable) + lbz r11, 0(r11) + cmpdi r11, 0 li r3, H_HARDWARE - blr -END_FTR_SECTION_IFCLR(CPU_FTR_DAWR) + beqlr /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW rlwimi r5, r4, 2, DAWRX_WT -- cgit v1.2.3-59-g8ed1b From a3f3072db6cad40895c585dce65e36aab997f042 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:16 +1000 Subject: powerpc/powernv/idle: Restore IAMR after idle Without restoring the IAMR after idle, execution prevention on POWER9 with Radix MMU is overwritten and the kernel can freely execute userspace without faulting. This is necessary when returning from any stop state that modifies user state, as well as hypervisor state. To test how this fails without this patch, load the lkdtm driver and do the following: $ echo EXEC_USERSPACE > /sys/kernel/debug/provoke-crash/DIRECT which won't fault, then boot the kernel with powersave=off, where it will fault. Applying this patch will fix this. Fixes: 3b10d0095a1e ("powerpc/mm/radix: Prevent kernel execution of user space") Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Russell Currey Reviewed-by: Akshay Adiga Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 7f5ac2e8581b..36178000a2f2 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -170,6 +170,9 @@ core_idle_lock_held: bne- core_idle_lock_held blr +/* Reuse an unused pt_regs slot for IAMR */ +#define PNV_POWERSAVE_IAMR _DAR + /* * Pass requested state in r3: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 @@ -200,6 +203,12 @@ pnv_powersave_common: /* Continue saving state */ SAVE_GPR(2, r1) SAVE_NVGPRS(r1) + +BEGIN_FTR_SECTION + mfspr r5, SPRN_IAMR + std r5, PNV_POWERSAVE_IAMR(r1) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfcr r5 std r5,_CCR(r1) std r1,PACAR1(r13) @@ -924,6 +933,17 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) REST_NVGPRS(r1) REST_GPR(2, r1) + +BEGIN_FTR_SECTION + /* IAMR was saved in pnv_powersave_common() */ + ld r5, PNV_POWERSAVE_IAMR(r1) + mtspr SPRN_IAMR, r5 + /* + * We don't need an isync here because the upcoming mtmsrd is + * execution synchronizing. + */ +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + ld r4,PACAKMSR(r13) ld r5,_LINK(r1) ld r6,_CCR(r1) -- cgit v1.2.3-59-g8ed1b From 53a712bae5dd919521a58d7bad773b949358add0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Apr 2019 16:51:17 +1000 Subject: powerpc/powernv/idle: Restore AMR/UAMOR/AMOR after idle In order to implement KUAP (Kernel Userspace Access Protection) on Power9 we will be using the AMR, and therefore indirectly the UAMOR/AMOR. So save/restore these regs in the idle code. 
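The save/restore pairing can be sketched in plain C as below, rather than the assembly that follows; read_spr()/write_spr() and the SPR_* constants are illustrative stand-ins for mfspr/mtspr on the real SPR numbers:

enum { SPR_AMR, SPR_IAMR, SPR_UAMOR, SPR_AMOR };

static unsigned long spr[4];	/* fake register file for the sketch */
static unsigned long read_spr(int n) { return spr[n]; }
static void write_spr(int n, unsigned long v) { spr[n] = v; }

struct kup_regs {
	unsigned long amr, iamr, uamor, amor;
};

static void save_kup(struct kup_regs *s)
{
	s->amr   = read_spr(SPR_AMR);
	s->iamr  = read_spr(SPR_IAMR);
	s->uamor = read_spr(SPR_UAMOR);
	s->amor  = read_spr(SPR_AMOR);	/* hypervisor mode only, in reality */
}

static void restore_kup(const struct kup_regs *s)
{
	write_spr(SPR_AMR, s->amr);
	write_spr(SPR_IAMR, s->iamr);
	write_spr(SPR_UAMOR, s->uamor);
	write_spr(SPR_AMOR, s->amor);
}

int main(void)
{
	struct kup_regs saved;

	save_kup(&saved);
	/* ... a stop state may clobber the SPRs here ... */
	restore_kup(&saved);
	return 0;
}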
Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 36178000a2f2..4a860d3b9229 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -170,8 +170,11 @@ core_idle_lock_held: bne- core_idle_lock_held blr -/* Reuse an unused pt_regs slot for IAMR */ +/* Reuse some unused pt_regs slots for AMR/IAMR/UAMOR/UAMOR */ +#define PNV_POWERSAVE_AMR _TRAP #define PNV_POWERSAVE_IAMR _DAR +#define PNV_POWERSAVE_UAMOR _DSISR +#define PNV_POWERSAVE_AMOR RESULT /* * Pass requested state in r3: @@ -205,8 +208,16 @@ pnv_powersave_common: SAVE_NVGPRS(r1) BEGIN_FTR_SECTION + mfspr r4, SPRN_AMR mfspr r5, SPRN_IAMR + mfspr r6, SPRN_UAMOR + std r4, PNV_POWERSAVE_AMR(r1) std r5, PNV_POWERSAVE_IAMR(r1) + std r6, PNV_POWERSAVE_UAMOR(r1) +BEGIN_FTR_SECTION_NESTED(42) + mfspr r7, SPRN_AMOR + std r7, PNV_POWERSAVE_AMOR(r1) +END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mfcr r5 @@ -935,12 +946,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) REST_GPR(2, r1) BEGIN_FTR_SECTION - /* IAMR was saved in pnv_powersave_common() */ + /* These regs were saved in pnv_powersave_common() */ + ld r4, PNV_POWERSAVE_AMR(r1) ld r5, PNV_POWERSAVE_IAMR(r1) + ld r6, PNV_POWERSAVE_UAMOR(r1) + mtspr SPRN_AMR, r4 mtspr SPRN_IAMR, r5 + mtspr SPRN_UAMOR, r6 +BEGIN_FTR_SECTION_NESTED(42) + ld r7, PNV_POWERSAVE_AMOR(r1) + mtspr SPRN_AMOR, r7 +END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42) /* - * We don't need an isync here because the upcoming mtmsrd is - * execution synchronizing. + * We don't need an isync here after restoring IAMR because the upcoming + * mtmsrd is execution synchronizing. */ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) -- cgit v1.2.3-59-g8ed1b From 69795cabe4cfe5122438d50010ad5310c113a013 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Apr 2019 16:51:18 +1000 Subject: powerpc: Add framework for Kernel Userspace Protection This patch adds a skeleton for Kernel Userspace Protection functionalities like Kernel Userspace Access Protection and Kernel Userspace Execution Prevention. The subsequent implementation of KUAP for radix makes use of an MMU feature in order to patch out assembly when KUAP is disabled or unsupported. This won't work unless there's an entry point for KUP support before the feature magic happens, so for PPC64 setup_kup() is called early in setup. On PPC32, feature_fixup() is done too early to allow the same.
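The skeleton pattern reads like this as a minimal compile-time sketch (CONFIG_DEMO_KUEP is a stand-in for the real Kconfig symbol; the kernel version of this is kup.h in the diff below):

#include <stdbool.h>

/*
 * A subarch that implements the feature defines the config symbol and
 * supplies a real setup_kuep(); otherwise the stub compiles away.
 */
#ifdef CONFIG_DEMO_KUEP
void setup_kuep(bool disabled);
#else
static inline void setup_kuep(bool disabled) { (void)disabled; }
#endif

/* Single early entry point, called before the feature fixups run. */
void setup_kup(void)
{
	setup_kuep(false);
}

int main(void)
{
	setup_kup();
	return 0;
}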
Suggested-by: Russell Currey Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kup.h | 11 +++++++++++ arch/powerpc/kernel/setup_64.c | 7 +++++++ arch/powerpc/mm/init-common.c | 5 +++++ arch/powerpc/mm/init_32.c | 3 +++ 4 files changed, 26 insertions(+) create mode 100644 arch/powerpc/include/asm/kup.h (limited to 'arch') diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h new file mode 100644 index 000000000000..7a88b8b9b54d --- /dev/null +++ b/arch/powerpc/include/asm/kup.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_KUP_H_ +#define _ASM_POWERPC_KUP_H_ + +#ifndef __ASSEMBLY__ + +void setup_kup(void); + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_KUP_H_ */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index ba404dd9ce1d..6179c4200339 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -68,6 +68,7 @@ #include #include #include +#include #include "setup.h" @@ -331,6 +332,12 @@ void __init early_setup(unsigned long dt_ptr) */ configure_exceptions(); + /* + * Configure Kernel Userspace Protection. This needs to happen before + * feature fixups for platforms that implement this using features. + */ + setup_kup(); + /* Apply all the dynamic patching */ apply_feature_fixups(); setup_feature_keys(); diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 1e6910eb70ed..36d28e872289 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -24,6 +24,11 @@ #include #include #include +#include + +void __init setup_kup(void) +{ +} #define CTOR(shift) static void ctor_##shift(void *addr) \ { \ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 41a3513cadc9..80cc97cd8878 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "mmu_decl.h" @@ -178,6 +179,8 @@ void __init MMU_init(void) btext_unmap(); #endif + setup_kup(); + /* Shortly after that, the entire linear mapping will be available */ memblock_set_current_limit(lowmem_end_addr); } -- cgit v1.2.3-59-g8ed1b From 0fb1c25ab523614b056ace11be67aac8f8ccabb1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Apr 2019 16:51:19 +1000 Subject: powerpc: Add skeleton for Kernel Userspace Execution Prevention This patch adds a skeleton for Kernel Userspace Execution Prevention. Then subarches implementing it have to define CONFIG_PPC_HAVE_KUEP and provide setup_kuep() function. Signed-off-by: Christophe Leroy [mpe: Don't split strings, use pr_crit_ratelimited()] Signed-off-by: Michael Ellerman --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/include/asm/kup.h | 6 ++++++ arch/powerpc/mm/fault.c | 9 ++++----- arch/powerpc/mm/init-common.c | 11 +++++++++++ arch/powerpc/platforms/Kconfig.cputype | 12 ++++++++++++ 5 files changed, 34 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b8ee90bb644..a53df74589e5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2843,7 +2843,7 @@ Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. - nosmep [X86] + nosmep [X86,PPC] Disable SMEP (Supervisor Mode Execution Prevention) even if it is supported by processor. 
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 7a88b8b9b54d..a2a959cb4e36 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -6,6 +6,12 @@ void setup_kup(void); +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled); +#else +static inline void setup_kuep(bool disabled) { } +#endif /* CONFIG_PPC_KUEP */ + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_KUP_H_ */ diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 887f11bcf330..3384354abc1d 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -229,11 +229,10 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code, /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | DSISR_PROTFAULT))) { - printk_ratelimited(KERN_CRIT "kernel tried to execute" - " exec-protected page (%lx) -" - "exploit attempt? (uid: %d)\n", - address, from_kuid(&init_user_ns, - current_uid())); + pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n", + address >= TASK_SIZE ? "exec-protected" : "user", + address, + from_kuid(&init_user_ns, current_uid())); } return is_exec || (address >= TASK_SIZE); } diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 36d28e872289..83f95a5565d6 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -26,8 +26,19 @@ #include #include +static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); + +static int __init parse_nosmep(char *p) +{ + disable_kuep = true; + pr_warn("Disabling Kernel Userspace Execution Prevention\n"); + return 0; +} +early_param("nosmep", parse_nosmep); + void __init setup_kup(void) { + setup_kuep(disable_kuep); } #define CTOR(shift) static void ctor_##shift(void *addr) \ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 842b2c7e156a..7d30bbbaa3c1 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -345,6 +345,18 @@ config PPC_RADIX_MMU_DEFAULT If you're unsure, say Y. +config PPC_HAVE_KUEP + bool + +config PPC_KUEP + bool "Kernel Userspace Execution Prevention" + depends on PPC_HAVE_KUEP + default y + help + Enable support for Kernel Userspace Execution Prevention (KUEP) + + If you're unsure, say Y. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit v1.2.3-59-g8ed1b From de78a9c42a790011f179bc94a7da3f5d8721f4cc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Apr 2019 16:51:20 +1000 Subject: powerpc: Add a framework for Kernel Userspace Access Protection This patch implements a framework for Kernel Userspace Access Protection. Then subarches will have the possibility to provide their own implementation by providing setup_kuap() and allow/prevent_user_access(). Some platforms will need to know the area accessed and whether it is accessed from read, write or both. Therefore source, destination and size are handed over to the two functions. mpe: Rename to allow/prevent rather than unlock/lock, and add read/write wrappers. Drop the 32-bit code for now until we have an implementation for it. Add kuap to pt_regs for 64-bit as well as 32-bit. Don't split strings, use pr_crit_ratelimited().
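The access-window pattern at the heart of the patch, as a hedged stand-alone sketch (the allow/prevent stubs only log here; in the kernel they flip the relevant MMU state, and the real copy routine can fail partway):

#include <stdio.h>
#include <string.h>

/* Illustrative stubs -- real implementations adjust user-access state. */
static void allow_write_to_user(void *to, unsigned long n)
{
	(void)to; printf("open window: %lu bytes\n", n);
}
static void prevent_write_to_user(void *to, unsigned long n)
{
	(void)to; printf("close window: %lu bytes\n", n);
}

static unsigned long copy_to_user_sketch(void *to, const void *from,
					 unsigned long n)
{
	unsigned long ret;

	allow_write_to_user(to, n);	/* open the window just for this copy */
	memcpy(to, from, n);		/* the only access inside the window */
	ret = 0;			/* bytes left uncopied */
	prevent_write_to_user(to, n);	/* close it again on every path */
	return ret;
}

int main(void)
{
	char dst[8];

	return (int)copy_to_user_sketch(dst, "hi", 3);
}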
Signed-off-by: Christophe Leroy Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/include/asm/futex.h | 4 +++ arch/powerpc/include/asm/kup.h | 32 +++++++++++++++++++++ arch/powerpc/include/asm/ptrace.h | 11 +++++-- arch/powerpc/include/asm/uaccess.h | 38 +++++++++++++++++++------ arch/powerpc/kernel/asm-offsets.c | 4 +++ arch/powerpc/lib/checksum_wrappers.c | 4 +++ arch/powerpc/mm/fault.c | 19 ++++++++++--- arch/powerpc/mm/init-common.c | 10 +++++++ arch/powerpc/platforms/Kconfig.cputype | 12 ++++++++ 10 files changed, 121 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a53df74589e5..c45a19d654f3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2839,7 +2839,7 @@ noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings - nosmap [X86] + nosmap [X86,PPC] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 88b38b37c21b..3a6aa57b9d90 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -35,6 +35,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, { int oldval = 0, ret; + allow_write_to_user(uaddr, sizeof(*uaddr)); pagefault_disable(); switch (op) { @@ -62,6 +63,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, if (!ret) *oval = oldval; + prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; } @@ -75,6 +77,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; + allow_write_to_user(uaddr, sizeof(*uaddr)); __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER "1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\ @@ -95,6 +98,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "cc", "memory"); *uval = prev; + prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; } diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index a2a959cb4e36..4d78b9d8c99c 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -4,6 +4,8 @@ #ifndef __ASSEMBLY__ +#include + void setup_kup(void); #ifdef CONFIG_PPC_KUEP @@ -12,6 +14,36 @@ void setup_kuep(bool disabled); static inline void setup_kuep(bool disabled) { } #endif /* CONFIG_PPC_KUEP */ +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled); +#else +static inline void setup_kuap(bool disabled) { } +static inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size) { } +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) { } +#endif /* CONFIG_PPC_KUAP */ + +static inline void allow_read_from_user(const void __user *from, unsigned long size) +{ + allow_user_access(NULL, from, size); +} + +static inline void allow_write_to_user(void __user *to, unsigned long size) +{ + allow_user_access(to, NULL, size); +} + +static inline void prevent_read_from_user(const void __user *from, unsigned long size) +{ + prevent_user_access(NULL, from, size); +} + +static inline void prevent_write_to_user(void __user *to, unsigned long size) +{ + prevent_user_access(to, NULL, size); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_KUP_H_ */ diff --git 
a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 64271e562fed..6f047730e642 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -52,10 +52,17 @@ struct pt_regs }; }; + union { + struct { #ifdef CONFIG_PPC64 - unsigned long ppr; - unsigned long __pad; /* Maintain 16 byte interrupt stack alignment */ + unsigned long ppr; +#endif +#ifdef CONFIG_PPC_KUAP + unsigned long kuap; #endif + }; + unsigned long __pad[2]; /* Maintain 16 byte interrupt stack alignment */ + }; }; #endif diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 4d6d905e9138..76f34346b642 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -6,6 +6,7 @@ #include #include #include +#include /* * The fs value determines whether argument validity checking should be @@ -140,6 +141,7 @@ extern long __put_user_bad(void); #define __put_user_size(x, ptr, size, retval) \ do { \ retval = 0; \ + allow_write_to_user(ptr, size); \ switch (size) { \ case 1: __put_user_asm(x, ptr, retval, "stb"); break; \ case 2: __put_user_asm(x, ptr, retval, "sth"); break; \ @@ -147,6 +149,7 @@ do { \ case 8: __put_user_asm2(x, ptr, retval); break; \ default: __put_user_bad(); \ } \ + prevent_write_to_user(ptr, size); \ } while (0) #define __put_user_nocheck(x, ptr, size) \ @@ -239,6 +242,7 @@ do { \ __chk_user_ptr(ptr); \ if (size > sizeof(x)) \ (x) = __get_user_bad(); \ + allow_read_from_user(ptr, size); \ switch (size) { \ case 1: __get_user_asm(x, ptr, retval, "lbz"); break; \ case 2: __get_user_asm(x, ptr, retval, "lhz"); break; \ @@ -246,6 +250,7 @@ do { \ case 8: __get_user_asm2(x, ptr, retval); break; \ default: (x) = __get_user_bad(); \ } \ + prevent_read_from_user(ptr, size); \ } while (0) /* @@ -305,15 +310,21 @@ extern unsigned long __copy_tofrom_user(void __user *to, static inline unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) { - return __copy_tofrom_user(to, from, n); + unsigned long ret; + + allow_user_access(to, from, n); + ret = __copy_tofrom_user(to, from, n); + prevent_user_access(to, from, n); + return ret; } #endif /* __powerpc64__ */ static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { + unsigned long ret; if (__builtin_constant_p(n) && (n <= 8)) { - unsigned long ret = 1; + ret = 1; switch (n) { case 1: @@ -338,14 +349,18 @@ static inline unsigned long raw_copy_from_user(void *to, } barrier_nospec(); - return __copy_tofrom_user((__force void __user *)to, from, n); + allow_read_from_user(from, n); + ret = __copy_tofrom_user((__force void __user *)to, from, n); + prevent_read_from_user(from, n); + return ret; } static inline unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { + unsigned long ret; if (__builtin_constant_p(n) && (n <= 8)) { - unsigned long ret = 1; + ret = 1; switch (n) { case 1: @@ -365,17 +380,24 @@ static inline unsigned long raw_copy_to_user(void __user *to, return 0; } - return __copy_tofrom_user(to, (__force const void __user *)from, n); + allow_write_to_user(to, n); + ret = __copy_tofrom_user(to, (__force const void __user *)from, n); + prevent_write_to_user(to, n); + return ret; } extern unsigned long __clear_user(void __user *addr, unsigned long size); static inline unsigned long clear_user(void __user *addr, unsigned long size) { + unsigned long ret = size; might_fault(); - if (likely(access_ok(addr, size))) - return __clear_user(addr, size); - 
return size; + if (likely(access_ok(addr, size))) { + allow_write_to_user(addr, size); + ret = __clear_user(addr, size); + prevent_write_to_user(addr, size); + } + return ret; } extern long strncpy_from_user(char *dst, const char __user *src, long count); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 86a61e5f8285..66202e02fee2 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -332,6 +332,10 @@ int main(void) STACK_PT_REGS_OFFSET(_PPR, ppr); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_KUAP + STACK_PT_REGS_OFFSET(STACK_REGS_KUAP, kuap); +#endif + #if defined(CONFIG_PPC32) #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index 890d4ddd91d6..bb9307ce2440 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -29,6 +29,7 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, unsigned int csum; might_sleep(); + allow_read_from_user(src, len); *err_ptr = 0; @@ -60,6 +61,7 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, } out: + prevent_read_from_user(src, len); return (__force __wsum)csum; } EXPORT_SYMBOL(csum_and_copy_from_user); @@ -70,6 +72,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, unsigned int csum; might_sleep(); + allow_write_to_user(dst, len); *err_ptr = 0; @@ -97,6 +100,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, } out: + prevent_write_to_user(dst, len); return (__force __wsum)csum; } EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 3384354abc1d..463d1e9d026e 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -223,9 +223,11 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, } /* Is this a bad kernel fault ? */ -static bool bad_kernel_fault(bool is_exec, unsigned long error_code, +static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { + int is_exec = TRAP(regs) == 0x400; + /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | DSISR_PROTFAULT))) { @@ -234,7 +236,15 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code, address, from_kuid(&init_user_ns, current_uid())); } - return is_exec || (address >= TASK_SIZE); + + if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) && + !search_exception_tables(regs->nip)) { + pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n", + address, + from_kuid(&init_user_ns, current_uid())); + } + + return is_exec || (address >= TASK_SIZE) || !search_exception_tables(regs->nip); } static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, @@ -454,9 +464,10 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, /* * The kernel should never take an execute fault nor should it - * take a page fault to a kernel address. 
+ * take a page fault to a kernel address or a page fault to a user + * address outside of dedicated places */ - if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address))) + if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address))) return SIGSEGV; /* diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 83f95a5565d6..ecaedfff9992 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -27,6 +27,7 @@ #include static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); +static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); static int __init parse_nosmep(char *p) { @@ -36,9 +37,18 @@ static int __init parse_nosmep(char *p) } early_param("nosmep", parse_nosmep); +static int __init parse_nosmap(char *p) +{ + disable_kuap = true; + pr_warn("Disabling Kernel Userspace Access Protection\n"); + return 0; +} +early_param("nosmap", parse_nosmap); + void __init setup_kup(void) { setup_kuep(disable_kuep); + setup_kuap(disable_kuap); } #define CTOR(shift) static void ctor_##shift(void *addr) \ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7d30bbbaa3c1..457fc3a5ed93 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -357,6 +357,18 @@ config PPC_KUEP If you're unsure, say Y. +config PPC_HAVE_KUAP + bool + +config PPC_KUAP + bool "Kernel Userspace Access Protection" + depends on PPC_HAVE_KUAP + default y + help + Enable support for Kernel Userspace Access Protection (KUAP) + + If you're unsure, say Y. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit v1.2.3-59-g8ed1b From b28c97505eb1a5265e367c398c3406be6ce5e313 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:21 +1000 Subject: powerpc/64: Setup KUP on secondary CPUs Some platforms (i.e. Radix MMU) need per-CPU initialisation for KUP. Any platforms that only want to do KUP initialisation once globally can just check to see if they're running on the boot CPU, or check if whatever setup they need has already been performed. Note that this is only for 64-bit. 
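As a toy model of the boot-CPU check described above (a hedged sketch in plain userspace C, not kernel code; BOOT_CPUID and the loop driving it are stand-ins for boot_cpuid and the real secondary bring-up path):

#include <stdio.h>
#include <stdbool.h>

#define BOOT_CPUID 0	/* assumption: the boot CPU is CPU 0 in this model */

/* setup_kup() is now called once per CPU; a platform with purely global
 * KUP state does its one-time work only on the boot CPU, while per-CPU
 * work (e.g. programming a per-CPU SPR) runs everywhere. */
static void setup_kup_model(int cpu, bool disabled)
{
	if (disabled)
		return;

	if (cpu == BOOT_CPUID)
		printf("cpu %d: one-time global KUP setup\n", cpu);

	printf("cpu %d: per-cpu KUP setup\n", cpu);
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)	/* boot CPU first, then secondaries */
		setup_kup_model(cpu, false);
	return 0;
}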
Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 3 +++ arch/powerpc/mm/init-common.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6179c4200339..684e34493bf5 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -390,6 +390,9 @@ void early_setup_secondary(void) /* Initialize the hash table or TLB handling */ early_init_mmu_secondary(); + /* Perform any KUP setup that is per-cpu */ + setup_kup(); + /* * At this point, we can let interrupts switch to virtual mode * (the MMU has been setup), so adjust the MSR in the PACA to diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index ecaedfff9992..6ea5607fc564 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -45,7 +45,7 @@ static int __init parse_nosmap(char *p) } early_param("nosmap", parse_nosmap); -void __init setup_kup(void) +void setup_kup(void) { setup_kuep(disable_kuep); setup_kuap(disable_kuap); -- cgit v1.2.3-59-g8ed1b From 1bb2bae2e6c7d94e3bc1fdce06baf31b8d811ed6 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:22 +1000 Subject: powerpc/mm/radix: Use KUEP API for Radix MMU Execution protection already exists on radix, this just refactors the radix init to provide the KUEP setup function instead. Thus, the only functional change is that it can now be disabled. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 12 +++++++++--- arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 154472a28c77..8616b291bcec 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -531,8 +531,15 @@ static void radix_init_amor(void) mtspr(SPRN_AMOR, (3ul << 62)); } -static void radix_init_iamr(void) +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) { + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) + pr_info("Activating Kernel Userspace Execution Prevention\n"); + /* * Radix always uses key0 of the IAMR to determine if an access is * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction @@ -540,6 +547,7 @@ static void radix_init_iamr(void) */ mtspr(SPRN_IAMR, (1ul << 62)); } +#endif void __init radix__early_init_mmu(void) { @@ -601,7 +609,6 @@ void __init radix__early_init_mmu(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - radix_init_iamr(); radix_init_pgtable(); /* Switch to the guard PID before turning on MMU */ radix__switch_mmu_context(NULL, &init_mm); @@ -623,7 +630,6 @@ void radix__early_init_mmu_secondary(void) __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); radix_init_amor(); } - radix_init_iamr(); radix__switch_mmu_context(NULL, &init_mm); if (cpu_has_feature(CPU_FTR_HVMODE)) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 457fc3a5ed93..60371784c9f1 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -326,6 +326,7 @@ config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select PPC_HAVE_KUEP default y help Enable support for the Power ISA 3.0 Radix style MMU. 
Currently this -- cgit v1.2.3-59-g8ed1b From ef296729b735e083d8919e76ac213b8ff237eb78 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:23 +1000 Subject: powerpc/lib: Refactor __patch_instruction() to use __put_user_asm() __patch_instruction() is called in early boot, and uses __put_user_size(), which includes the allow/prevent calls to enforce KUAP, which could either be called too early, or in the Radix case, forced to use "early_" versions of functions just to safely handle this one case. __put_user_asm() does not do this, and thus is safe to use both in early boot, and later on since in this case it should only ever be touching kernel memory. __patch_instruction() was previously refactored to use __put_user_size() in order to be able to return -EFAULT, which would allow the kernel to patch instructions in userspace, which should never happen. This has the functional change of causing faults on userspace addresses if KUAP is turned on, which should never happen in practice. A future enhancement could be to double check the patch address is definitely allowed to be tampered with by the kernel. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/lib/code-patching.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 587ff9788ab0..90c9d4a1e36f 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -25,9 +25,9 @@ static int __patch_instruction(unsigned int *exec_addr, unsigned int instr, unsigned int *patch_addr) { - int err; + int err = 0; - __put_user_size(instr, patch_addr, 4, err); + __put_user_asm(instr, patch_addr, err, "stw"); if (err) return err; -- cgit v1.2.3-59-g8ed1b From 890274c2dc4c0a57ae5a12d6a76fa6d05b599d98 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Apr 2019 16:51:24 +1000 Subject: powerpc/64s: Implement KUAP for Radix MMU Kernel Userspace Access Prevention utilises a feature of the Radix MMU which disallows read and write access to userspace addresses. By utilising this, the kernel is prevented from accessing user data from outside of trusted paths that perform proper safety checks, such as copy_{to/from}_user() and friends. Userspace access is disabled from early boot and is only enabled when performing an operation like copy_{to/from}_user(). The register that controls this (AMR) does not prevent userspace from accessing itself, so there is no need to save and restore when entering and exiting userspace. When entering the kernel from the kernel we save AMR and if it is not blocking user access (because eg. we faulted doing a user access) we reblock user access for the duration of the exception (ie. the page fault) and then restore the AMR when returning back to the kernel. This feature can be tested by using the lkdtm driver (CONFIG_LKDTM=y) and performing the following: # (echo ACCESS_USERSPACE) > [debugfs]/provoke-crash/DIRECT If enabled, this should send SIGSEGV to the thread. We also add paranoid checking of AMR in switch and syscall return under CONFIG_PPC_KUAP_DEBUG. 
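The AMR encoding this feature relies on can be checked with a small standalone sketch (plain userspace C, compile with cc -o amr amr.c; the two block constants mirror AMR_KUAP_BLOCK_READ/WRITE from this patch, everything else is illustrative):

#include <stdio.h>
#include <stdint.h>

#define AMR_KUAP_BLOCK_READ	0x4000000000000000ULL
#define AMR_KUAP_BLOCK_WRITE	0x8000000000000000ULL
#define AMR_KUAP_BLOCKED	(AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE)

static uint64_t amr = AMR_KUAP_BLOCKED;	/* boot default: all user access blocked */

/* Mirrors set_kuap(): key 0 sits in the two MSBs of the AMR, 0b01
 * blocking reads from user and 0b10 blocking writes to user. */
static void allow_write(void) { amr = AMR_KUAP_BLOCK_READ; }	/* writes open, reads blocked */
static void prevent_all(void) { amr = AMR_KUAP_BLOCKED; }

int main(void)
{
	allow_write();		/* what copy_to_user() does around the copy */
	printf("during copy_to_user: AMR=%016llx, reads still blocked: %d\n",
	       (unsigned long long)amr, !!(amr & AMR_KUAP_BLOCK_READ));
	prevent_all();
	printf("after:               AMR=%016llx\n", (unsigned long long)amr);
	return 0;
}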
Co-authored-by: Michael Ellerman Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/kup-radix.h | 102 +++++++++++++++++++++++++ arch/powerpc/include/asm/exception-64s.h | 2 + arch/powerpc/include/asm/feature-fixups.h | 3 + arch/powerpc/include/asm/kup.h | 4 + arch/powerpc/include/asm/mmu.h | 10 ++- arch/powerpc/kernel/entry_64.S | 27 ++++++- arch/powerpc/kernel/exceptions-64s.S | 3 + arch/powerpc/mm/pgtable-radix.c | 19 +++++ arch/powerpc/mm/pkeys.c | 1 + arch/powerpc/platforms/Kconfig.cputype | 8 ++ 10 files changed, 176 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/include/asm/book3s/64/kup-radix.h (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h new file mode 100644 index 000000000000..6d6628424134 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H +#define _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H + +#include + +#define AMR_KUAP_BLOCK_READ UL(0x4000000000000000) +#define AMR_KUAP_BLOCK_WRITE UL(0x8000000000000000) +#define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE) +#define AMR_KUAP_SHIFT 62 + +#ifdef __ASSEMBLY__ + +.macro kuap_restore_amr gpr +#ifdef CONFIG_PPC_KUAP + BEGIN_MMU_FTR_SECTION_NESTED(67) + ld \gpr, STACK_REGS_KUAP(r1) + mtspr SPRN_AMR, \gpr + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +.macro kuap_check_amr gpr1, gpr2 +#ifdef CONFIG_PPC_KUAP_DEBUG + BEGIN_MMU_FTR_SECTION_NESTED(67) + mfspr \gpr1, SPRN_AMR + li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT) + sldi \gpr2, \gpr2, AMR_KUAP_SHIFT +999: tdne \gpr1, \gpr2 + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +.macro kuap_save_amr_and_lock gpr1, gpr2, use_cr, msr_pr_cr +#ifdef CONFIG_PPC_KUAP + BEGIN_MMU_FTR_SECTION_NESTED(67) + .ifnb \msr_pr_cr + bne \msr_pr_cr, 99f + .endif + mfspr \gpr1, SPRN_AMR + std \gpr1, STACK_REGS_KUAP(r1) + li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT) + sldi \gpr2, \gpr2, AMR_KUAP_SHIFT + cmpd \use_cr, \gpr1, \gpr2 + beq \use_cr, 99f + // We don't isync here because we very recently entered via rfid + mtspr SPRN_AMR, \gpr2 + isync +99: + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_PPC_KUAP + +#include + +/* + * We support individually allowing read or write, but we don't support nesting + * because that would require an expensive read/modify write of the AMR. + */ + +static inline void set_kuap(unsigned long value) +{ + if (!mmu_has_feature(MMU_FTR_RADIX_KUAP)) + return; + + /* + * ISA v3.0B says we need a CSI (Context Synchronising Instruction) both + * before and after the move to AMR. See table 6 on page 1134. 
+ */ + isync(); + mtspr(SPRN_AMR, value); + isync(); +} + +static inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + // This is written so we can resolve to a single case at build time + if (__builtin_constant_p(to) && to == NULL) + set_kuap(AMR_KUAP_BLOCK_WRITE); + else if (__builtin_constant_p(from) && from == NULL) + set_kuap(AMR_KUAP_BLOCK_READ); + else + set_kuap(0); +} + +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + set_kuap(AMR_KUAP_BLOCKED); +} + +#endif /* CONFIG_PPC_KUAP */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 937bb630093f..bef4e05a6823 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -497,6 +497,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) RESTORE_CTR(r1, area); \ b bad_stack; \ 3: EXCEPTION_PROLOG_COMMON_1(); \ + kuap_save_amr_and_lock r9, r10, cr1, cr0; \ beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ SAVE_PPR(area, r9); \ @@ -691,6 +692,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL) */ #define EXCEPTION_COMMON_NORET_STACK(area, trap, label, hdlr, additions) \ EXCEPTION_PROLOG_COMMON_1(); \ + kuap_save_amr_and_lock r9, r10, cr1; \ EXCEPTION_PROLOG_COMMON_2(area); \ EXCEPTION_PROLOG_COMMON_3(trap); \ /* Volatile regs are potentially clobbered here */ \ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 40a6c9261a6b..f6fc31f8baff 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -100,6 +100,9 @@ label##5: \ #define END_MMU_FTR_SECTION(msk, val) \ END_MMU_FTR_SECTION_NESTED(msk, val, 97) +#define END_MMU_FTR_SECTION_NESTED_IFSET(msk, label) \ + END_MMU_FTR_SECTION_NESTED((msk), (msk), label) + #define END_MMU_FTR_SECTION_IFSET(msk) END_MMU_FTR_SECTION((msk), (msk)) #define END_MMU_FTR_SECTION_IFCLR(msk) END_MMU_FTR_SECTION((msk), 0) diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 4d78b9d8c99c..d7312defbe1c 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -2,6 +2,10 @@ #ifndef _ASM_POWERPC_KUP_H_ #define _ASM_POWERPC_KUP_H_ +#ifdef CONFIG_PPC64 +#include +#endif + #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 8ddd4a91bdc1..38d21adfde40 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -107,6 +107,11 @@ */ #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) +/* + * Supports KUAP (key 0 controlling userspace addresses) on radix + */ +#define MMU_FTR_RADIX_KUAP ASM_CONST(0x80000000) + /* MMU feature bit sets for various CPUs */ #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \ MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2 @@ -164,7 +169,10 @@ enum { #endif #ifdef CONFIG_PPC_RADIX_MMU MMU_FTR_TYPE_RADIX | -#endif +#ifdef CONFIG_PPC_KUAP + MMU_FTR_RADIX_KUAP | +#endif /* CONFIG_PPC_KUAP */ +#endif /* CONFIG_PPC_RADIX_MMU */ 0, }; diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 15c67d2c0534..7cc25389c6bd 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -46,6 +46,7 @@ #include #endif #include +#include /* * System calls. 
@@ -120,6 +121,9 @@ END_BTB_FLUSH_SECTION addi r9,r1,STACK_FRAME_OVERHEAD ld r11,exception_marker@toc(r2) std r11,-16(r9) /* "regshere" marker */ + + kuap_check_amr r10, r11 + #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) BEGIN_FW_FTR_SECTION beq 33f @@ -275,6 +279,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) andi. r6,r8,MSR_PR ld r4,_LINK(r1) + kuap_check_amr r10, r11 + #ifdef CONFIG_PPC_BOOK3S /* * Clear MSR_RI, MSR_EE is already and remains disabled. We could do @@ -296,6 +302,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r8, PACATMSCRATCH(r13) #endif + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ ld r2,GPR2(r1) ld r1,GPR1(r1) @@ -306,8 +316,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI_TO_USER b . /* prevent speculative execution */ - /* exit to kernel */ -1: ld r2,GPR2(r1) +1: /* exit to kernel */ + kuap_restore_amr r2 + + ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 @@ -594,6 +606,8 @@ _GLOBAL(_switch) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ + kuap_check_amr r9, r10 + FLUSH_COUNT_CACHE /* @@ -942,6 +956,8 @@ fast_exception_return: ld r4,_XER(r1) mtspr SPRN_XER,r4 + kuap_check_amr r5, r6 + REST_8GPRS(5, r1) andi. r0,r3,MSR_RI @@ -974,6 +990,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -1006,6 +1026,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r0,GPR0(r1) ld r2,GPR2(r1) ld r3,GPR3(r1) + + kuap_restore_amr r4 + ld r4,GPR4(r1) ld r1,GPR1(r1) RFI_TO_KERNEL diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9481a117e242..bedd89438827 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -19,6 +19,7 @@ #include #include #include +#include /* * There are a few constraints to be concerned with. @@ -309,6 +310,7 @@ TRAMP_REAL_BEGIN(machine_check_common_early) mfspr r11,SPRN_DSISR /* Save DSISR */ std r11,_DSISR(r1) std r9,_CCR(r1) /* Save CR in stackframe */ + kuap_save_amr_and_lock r9, r10, cr1 /* Save r9 through r13 from EXMC save area to stack frame. 
*/ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) mfmsr r11 /* get MSR value */ @@ -1109,6 +1111,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early) mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ mfspr r12,SPRN_HSRR1 /* Save HSRR1 */ EXCEPTION_PROLOG_COMMON_1() + /* We don't touch AMR here, we never go to virtual mode */ EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 8616b291bcec..45869fd698a0 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -549,6 +550,24 @@ void setup_kuep(bool disabled) } #endif +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; + } + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif + void __init radix__early_init_mmu(void) { unsigned long lpcr; diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c index 587807763737..ae7fca40e5b3 100644 --- a/arch/powerpc/mm/pkeys.c +++ b/arch/powerpc/mm/pkeys.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 60371784c9f1..5e53b9fd62aa 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -327,6 +327,7 @@ config PPC_RADIX_MMU depends on PPC_BOOK3S_64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select PPC_HAVE_KUEP + select PPC_HAVE_KUAP default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this @@ -370,6 +371,13 @@ config PPC_KUAP If you're unsure, say Y. +config PPC_KUAP_DEBUG + bool "Extra debugging for Kernel Userspace Access Protection" + depends on PPC_HAVE_KUAP && PPC_RADIX_MMU + help + Add extra debugging for Kernel Userspace Access Protection (KUAP) + If you're unsure, say N. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit v1.2.3-59-g8ed1b From 5e5be3aed23032d40d5ab7407f344f1a74f2765b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Apr 2019 16:51:25 +1000 Subject: powerpc/mm: Detect bad KUAP faults When KUAP is enabled we have logic to detect page faults that occur outside of a valid user access region and are blocked by the AMR. What we don't have at the moment is logic to detect a fault *within* a valid user access region, that has been incorrectly blocked by AMR. This is not meant to ever happen, but it can if we incorrectly save/restore the AMR, or if the AMR was overwritten for some other reason. Currently if that happens we assume it's just a regular fault that will be corrected by handling the fault normally, so we just return. But there is nothing the fault handling code can do to fix it, so the fault just happens again and we spin forever, leading to soft lockups. So add some logic to detect that case and WARN() if we ever see it. Arguably it should be a BUG(), but it's more polite to fail the access and let the kernel continue, rather than taking down the box. 
There should be no data integrity issue with failing the fault rather than BUG'ing, as we're just going to disallow an access that should have been allowed. To make the code a little easier to follow, unroll the condition at the end of bad_kernel_fault() and comment each case, before adding the call to bad_kuap_fault(). Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/kup-radix.h | 6 ++++++ arch/powerpc/include/asm/kup.h | 1 + arch/powerpc/mm/fault.c | 25 ++++++++++++++++++++++--- 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index 6d6628424134..7679bd0c5af0 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -95,6 +95,12 @@ static inline void prevent_user_access(void __user *to, const void __user *from, set_kuap(AMR_KUAP_BLOCKED); } +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) && + (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)), + "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read"); +} #endif /* CONFIG_PPC_KUAP */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index d7312defbe1c..28ad4654eed2 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -26,6 +26,7 @@ static inline void allow_user_access(void __user *to, const void __user *from, unsigned long size) { } static inline void prevent_user_access(void __user *to, const void __user *from, unsigned long size) { } +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) { return false; } #endif /* CONFIG_PPC_KUAP */ static inline void allow_read_from_user(const void __user *from, unsigned long size) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 463d1e9d026e..b5d3578d9f65 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -44,6 +44,7 @@ #include #include #include +#include static inline bool notify_page_fault(struct pt_regs *regs) { @@ -224,7 +225,7 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, /* Is this a bad kernel fault ? */ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, bool is_write) { int is_exec = TRAP(regs) == 0x400; @@ -235,6 +236,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, address >= TASK_SIZE ? "exec-protected" : "user", address, from_kuid(&init_user_ns, current_uid())); + + // Kernel exec fault is always bad + return true; } if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) && @@ -244,7 +248,22 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, from_kuid(&init_user_ns, current_uid())); } - return is_exec || (address >= TASK_SIZE) || !search_exception_tables(regs->nip); + // Kernel fault on kernel address is bad + if (address >= TASK_SIZE) + return true; + + // Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad + if (!search_exception_tables(regs->nip)) + return true; + + // Read/write fault in a valid region (the exception table search passed + // above), but blocked by KUAP is bad, it can never succeed. + if (bad_kuap_fault(regs, is_write)) + return true; + + // What's left? 
Kernel fault on user in well defined regions (extable + // matched), and allowed by KUAP in the faulting context. + return false; } static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, @@ -467,7 +486,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, * take a page fault to a kernel address or a page fault to a user * address outside of dedicated places */ - if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address))) + if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) return SIGSEGV; /* -- cgit v1.2.3-59-g8ed1b From e291b6d575bc6e4d1d36961b081be521a6c800d6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:30 +0000 Subject: powerpc/32: Remove MSR_PR test when returning from syscall syscalls are from user only, so we can account time without checking whether returning to kernel or user as it will only be user. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index b61cfd29c76f..aaf7c5f44823 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -422,12 +422,7 @@ BEGIN_FTR_SECTION lwarx r7,0,r1 END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. r0,0,r1 /* to clear the reservation */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - andi. r4,r8,MSR_PR - beq 3f ACCOUNT_CPU_USER_EXIT(r2, r5, r7) -3: -#endif lwz r4,_LINK(r1) lwz r5,_CCR(r1) mtlr r4 -- cgit v1.2.3-59-g8ed1b From e2fb9f5444312fd01627c84a3e018c1fe8ac6ebb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:31 +0000 Subject: powerpc/32: Prepare for Kernel Userspace Access Protection This patch adds ASM macros for saving, restoring and checking the KUAP state, and modifies setup_32 to call them on exceptions from kernel. The macros are defined as empty by default for when CONFIG_PPC_KUAP is not selected and/or for platforms which don't handle (yet) KUAP. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kup.h | 15 ++++++++++++++- arch/powerpc/kernel/entry_32.S | 16 ++++++++++++---- arch/powerpc/platforms/Kconfig.cputype | 2 +- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 28ad4654eed2..7d8ad3d6729d 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -6,7 +6,20 @@ #include #endif -#ifndef __ASSEMBLY__ +#ifdef __ASSEMBLY__ +#ifndef CONFIG_PPC_KUAP +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 +.endm + +.macro kuap_restore sp, current, gpr1, gpr2, gpr3 +.endm + +.macro kuap_check current, gpr +.endm + +#endif + +#else /* !__ASSEMBLY__ */ #include diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index aaf7c5f44823..1182bf603d3c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -36,6 +36,7 @@ #include #include #include +#include /* * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it include MSR_CE. 
@@ -150,8 +151,8 @@ transfer_to_handler: stw r12,_CTR(r11) stw r2,_XER(r11) mfspr r12,SPRN_SPRG_THREAD - addi r2,r12,-THREAD beq 2f /* if from user, fix up THREAD.regs */ + addi r2, r12, -THREAD addi r11,r1,STACK_FRAME_OVERHEAD stw r11,PT_REGS(r12) #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) @@ -186,6 +187,8 @@ transfer_to_handler: 2: /* if from kernel, check interrupted DOZE/NAP mode and * check for stack overflow */ + kuap_save_and_lock r11, r12, r9, r2, r0 + addi r2, r12, -THREAD lwz r9,KSP_LIMIT(r12) cmplw r1,r9 /* if r1 <= ksp_limit */ ble- stack_ovf /* then the kernel stack overflowed */ @@ -272,6 +275,7 @@ reenable_mmu: /* re-enable mmu so we can */ lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */ rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ + kuap_restore r11, r2, r3, r4, r5 b fast_exception_return #endif @@ -423,6 +427,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. r0,0,r1 /* to clear the reservation */ ACCOUNT_CPU_USER_EXIT(r2, r5, r7) + kuap_check r2, r4 lwz r4,_LINK(r1) lwz r5,_CCR(r1) mtlr r4 @@ -673,6 +678,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE) stw r10,_CCR(r1) stw r1,KSP(r3) /* Set old stack pointer */ + kuap_check r2, r4 #ifdef CONFIG_SMP /* We need a sync somewhere here to make sure that if the * previous task gets rescheduled on another CPU, it sees all @@ -861,12 +867,12 @@ resume_kernel: /* check current_thread_info->preempt_count */ lwz r0,TI_PREEMPT(r2) cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ - bne restore + bne restore_kuap andi. r8,r8,_TIF_NEED_RESCHED - beq+ restore + beq+ restore_kuap lwz r3,_MSR(r1) andi. r0,r3,MSR_EE /* interrupts off? */ - beq restore /* don't schedule if so */ + beq restore_kuap /* don't schedule if so */ #ifdef CONFIG_TRACE_IRQFLAGS /* Lockdep thinks irqs are enabled, we need to call * preempt_schedule_irq with IRQs off, so we inform lockdep @@ -885,6 +891,8 @@ resume_kernel: bl trace_hardirqs_on #endif #endif /* CONFIG_PREEMPT */ +restore_kuap: + kuap_restore r1, r2, r9, r10, r0 /* interrupts are hard-disabled at this point */ restore: diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 5e53b9fd62aa..2e45a6e2bc99 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -373,7 +373,7 @@ config PPC_KUAP config PPC_KUAP_DEBUG bool "Extra debugging for Kernel Userspace Access Protection" - depends on PPC_HAVE_KUAP && PPC_RADIX_MMU + depends on PPC_HAVE_KUAP && (PPC_RADIX_MMU || PPC_32) help Add extra debugging for Kernel Userspace Access Protection (KUAP) If you're unsure, say N. -- cgit v1.2.3-59-g8ed1b From c341a108a58100b4d0774ddb1dacbd67dfa749b3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:32 +0000 Subject: powerpc/8xx: Only define APG0 and APG1 Since the 8xx implements hardware page table walk assistance, the PGD entries always point to a 4k aligned page, so the 2 upper bits of the APG are not clobbered anymore and remain 0. Therefore only APG0 and APG1 are used and need a definition. We set the other APG to the lowest permission level. 
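To make that layout concrete, here is a hedged standalone sketch (plain userspace C, not kernel code) that decodes a 32-bit APG register into its sixteen 2-bit groups; run on 0x4fffffff it prints APG0 = 01, APG1 = 00 and APG2-15 = 11, matching the comments in the diff below:

#include <stdio.h>
#include <stdint.h>

/* Each APG value occupies 2 bits, APG0 in the two most significant. */
static void decode_apg(uint32_t val)
{
	for (int g = 0; g < 16; g++) {
		unsigned int bits = (val >> (30 - 2 * g)) & 0x3;
		printf("APG%-2d = %u%u\n", g, bits >> 1, bits & 1);
	}
}

int main(void)
{
	decode_apg(0x4fffffff);	/* MI_APG_INIT / MD_APG_INIT after this patch */
	return 0;
}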
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 0a1a3fc54e54..fc5a653d5dd2 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -35,11 +35,11 @@ * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => No user => 01 (all accesses performed according to page definition) + * 0 => Kernel => 01 (all accesses performed according to page definition) * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * We define all 16 groups so that all other bits of APG can take any value + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) */ -#define MI_APG_INIT 0x44444444 +#define MI_APG_INIT 0x4fffffff /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MI_RPN is written, bits in @@ -108,11 +108,11 @@ * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => No user => 01 (all accesses performed according to page definition) + * 0 => Kernel => 01 (all accesses performed according to page definition) * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * We define all 16 groups so that all other bits of APG can take any value + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) */ -#define MD_APG_INIT 0x44444444 +#define MD_APG_INIT 0x4fffffff /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in -- cgit v1.2.3-59-g8ed1b From 06fbe81b5909847aa13f9c86c2b6f9bbc5c2795b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:33 +0000 Subject: powerpc/8xx: Add Kernel Userspace Execution Prevention This patch adds Kernel Userspace Execution Prevention on the 8xx. When a page is Executable, it is set Executable for Key 0 and NX for Key 1. Up to now, the User group is defined with Key 0 for both User and Supervisor. By changing the group to Key 0 for User and Key 1 for Supervisor, this patch prevents the Kernel from being able to execute user code. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 7 +++++++ arch/powerpc/mm/8xx_mmu.c | 12 ++++++++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 20 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index fc5a653d5dd2..3cb743284e09 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -41,6 +41,13 @@ */ #define MI_APG_INIT 0x4fffffff +/* + * 0 => Kernel => 01 (all accesses performed according to page definition) + * 1 => User => 10 (all accesses performed according to swaped page definition) + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) + */ +#define MI_APG_KUEP 0x6fffffff + /* The effective page number register. When read, contains the information * about the last instruction TLB miss. 
When MI_RPN is written, bits in * this register are used to create the TLB entry. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index fe1f6443d57f..e257a0c9bd08 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -213,3 +213,15 @@ void flush_instruction_cache(void) mtspr(SPRN_IC_CST, IDC_INVALL); isync(); } + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + if (disabled) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + mtspr(SPRN_MI_AP, MI_APG_KUEP); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 2e45a6e2bc99..00fa0d110dcb 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -34,6 +34,7 @@ config PPC_8xx bool "Freescale 8xx" select FSL_SOC select SYS_SUPPORTS_HUGETLBFS + select PPC_HAVE_KUEP config 40x bool "AMCC 40x" -- cgit v1.2.3-59-g8ed1b From 2679f9bd0abafb3044bcbaac0600b32159ac8bf2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:34 +0000 Subject: powerpc/8xx: Add Kernel Userspace Access Protection This patch adds Kernel Userspace Access Protection on the 8xx. When a page is RO or RW, it is set RO or RW for Key 0 and NA for Key 1. Up to now, the User group is defined with Key 0 for both User and Supervisor. By changing the group to Key 0 for User and Key 1 for Supervisor, this patch prevents the Kernel from being able to access user data. At exception entry, the kernel saves SPRN_MD_AP in the regs struct, and reapply the protection. At exception exit it restores SPRN_MD_AP with the value saved on exception entry. Signed-off-by: Christophe Leroy [mpe: Drop allow_read/write_to/from_user() as they're now in kup.h] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kup.h | 3 ++ arch/powerpc/include/asm/nohash/32/kup-8xx.h | 58 ++++++++++++++++++++++++++++ arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 7 ++++ arch/powerpc/mm/8xx_mmu.c | 12 ++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 5 files changed, 81 insertions(+) create mode 100644 arch/powerpc/include/asm/nohash/32/kup-8xx.h (limited to 'arch') diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 7d8ad3d6729d..043c800ec5fb 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -5,6 +5,9 @@ #ifdef CONFIG_PPC64 #include #endif +#ifdef CONFIG_PPC_8xx +#include +#endif #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h new file mode 100644 index 000000000000..1c3133b5f86a --- /dev/null +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_KUP_8XX_H_ +#define _ASM_POWERPC_KUP_8XX_H_ + +#include + +#ifdef CONFIG_PPC_KUAP + +#ifdef __ASSEMBLY__ + +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 + lis \gpr2, MD_APG_KUAP@h /* only APG0 and APG1 are used */ + mfspr \gpr1, SPRN_MD_AP + mtspr SPRN_MD_AP, \gpr2 + stw \gpr1, STACK_REGS_KUAP(\sp) +.endm + +.macro kuap_restore sp, current, gpr1, gpr2, gpr3 + lwz \gpr1, STACK_REGS_KUAP(\sp) + mtspr SPRN_MD_AP, \gpr1 +.endm + +.macro kuap_check current, gpr +#ifdef CONFIG_PPC_KUAP_DEBUG + mfspr \gpr, SPRN_MD_AP + rlwinm \gpr, \gpr, 16, 0xffff +999: twnei \gpr, MD_APG_KUAP@h + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif +.endm + +#else /* !__ASSEMBLY__ */ + +#include + +static inline 
void allow_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + mtspr(SPRN_MD_AP, MD_APG_INIT); +} + +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} + +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xf0000000), + "Bug: fault blocked by AP register !"); +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* CONFIG_PPC_KUAP */ + +#endif /* _ASM_POWERPC_KUP_8XX_H_ */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 3cb743284e09..f620adef54fc 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -121,6 +121,13 @@ */ #define MD_APG_INIT 0x4fffffff +/* + * 0 => No user => 01 (all accesses performed according to page definition) + * 1 => User => 10 (all accesses performed according to swaped page definition) + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) + */ +#define MD_APG_KUAP 0x6fffffff + /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in * this register are used to create the TLB entry. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index e257a0c9bd08..87648b58d295 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -225,3 +225,15 @@ void __init setup_kuep(bool disabled) mtspr(SPRN_MI_AP, MI_APG_KUEP); } #endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); + + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 00fa0d110dcb..ab586963893a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -35,6 +35,7 @@ config PPC_8xx select FSL_SOC select SYS_SUPPORTS_HUGETLBFS select PPC_HAVE_KUEP + select PPC_HAVE_KUAP config 40x bool "AMCC 40x" -- cgit v1.2.3-59-g8ed1b From 31ed2b13c48d779efc838ad54e30121e088a62af Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:35 +0000 Subject: powerpc/32s: Implement Kernel Userspace Execution Prevention. To implement Kernel Userspace Execution Prevention, this patch sets NX bit on all user segments on kernel entry and clears NX bit on all user segments on kernel exit. Note that powerpc 601 doesn't have the NX bit, so KUEP will not work on it. A warning is displayed at startup. 
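The segment walk this entails can be modelled with a standalone sketch (illustrative userspace C only; SR_NX matches the patch, NUM_USER_SEGMENTS = 12 is an assumption based on a 3GB user address space in 256MB segments, and the array stands in for the real segment registers written by mtsrin):

#include <stdio.h>
#include <stdint.h>

#define SR_NX			0x10000000u	/* No Execute */
#define NUM_USER_SEGMENTS	12		/* assumed: 3GB / 256MB */

static uint32_t segs[16];	/* stand-in for the 16 segment registers */

/* Mirrors kuep_lock()/kuep_unlock(): toggle NX on every user segment,
 * stepping the VSID by 0x111 per segment and clearing the VSID
 * overflow nibble, as the assembly does. */
static void kuep_set(int lock)
{
	uint32_t sr = segs[0];

	for (int i = 0; i < NUM_USER_SEGMENTS; i++) {
		sr = lock ? (sr | SR_NX) : (sr & ~SR_NX);
		segs[i] = sr;			/* mtsrin equivalent */
		sr = (sr + 0x111) & 0xf0ffffff;	/* next VSID */
	}
}

int main(void)
{
	kuep_set(1);
	printf("seg 0 locked:   %08x (NX=%d)\n", (unsigned)segs[0], !!(segs[0] & SR_NX));
	kuep_set(0);
	printf("seg 0 unlocked: %08x (NX=%d)\n", (unsigned)segs[0], !!(segs[0] & SR_NX));
	return 0;
}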
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/kup.h | 42 +++++++++++++++++++++++++++ arch/powerpc/include/asm/book3s/32/mmu-hash.h | 3 ++ arch/powerpc/include/asm/kup.h | 3 ++ arch/powerpc/kernel/entry_32.S | 9 ++++++ arch/powerpc/kernel/head_32.S | 15 +++++++++- arch/powerpc/mm/ppc_mmu_32.c | 13 +++++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 7 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/book3s/32/kup.h (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h new file mode 100644 index 000000000000..5f97c742ca71 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BOOK3S_32_KUP_H +#define _ASM_POWERPC_BOOK3S_32_KUP_H + +#include + +#ifdef __ASSEMBLY__ + +.macro kuep_update_sr gpr1, gpr2 /* NEVER use r0 as gpr2 due to addis */ +101: mtsrin \gpr1, \gpr2 + addi \gpr1, \gpr1, 0x111 /* next VSID */ + rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */ + addis \gpr2, \gpr2, 0x1000 /* address of next segment */ + bdnz 101b + isync +.endm + +.macro kuep_lock gpr1, gpr2 +#ifdef CONFIG_PPC_KUEP + li \gpr1, NUM_USER_SEGMENTS + li \gpr2, 0 + mtctr \gpr1 + mfsrin \gpr1, \gpr2 + oris \gpr1, \gpr1, SR_NX@h /* set Nx */ + kuep_update_sr \gpr1, \gpr2 +#endif +.endm + +.macro kuep_unlock gpr1, gpr2 +#ifdef CONFIG_PPC_KUEP + li \gpr1, NUM_USER_SEGMENTS + li \gpr2, 0 + mtctr \gpr1 + mfsrin \gpr1, \gpr2 + rlwinm \gpr1, \gpr1, 0, ~SR_NX /* Clear Nx */ + kuep_update_sr \gpr1, \gpr2 +#endif +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 5cb588395fdc..8c5727a322b1 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -63,6 +63,9 @@ typedef pte_t *pgtable_t; #define PP_RWRW 2 /* Supervisor read/write, User read/write */ #define PP_RXRX 3 /* Supervisor read, User read */ +/* Values for Segment Registers */ +#define SR_NX 0x10000000 /* No Execute */ + #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 043c800ec5fb..5b5e39643a27 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -8,6 +8,9 @@ #ifdef CONFIG_PPC_8xx #include #endif +#ifdef CONFIG_PPC_BOOK3S_32 +#include +#endif #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 1182bf603d3c..2f3d159c11d7 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -162,6 +162,9 @@ transfer_to_handler: andis. r12,r12,DBCR0_IDM@h #endif ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_lock r11, r12 +#endif #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ @@ -427,6 +430,9 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. 
r0,0,r1 /* to clear the reservation */ ACCOUNT_CPU_USER_EXIT(r2, r5, r7) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_unlock r5, r7 +#endif kuap_check r2, r4 lwz r4,_LINK(r1) lwz r5,_CCR(r1) @@ -821,6 +827,9 @@ restore_user: bnel- load_dbcr0 #endif ACCOUNT_CPU_USER_EXIT(r2, r10, r11) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_unlock r10, r11 +#endif b restore diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e25b615e9f9e..19b46cb9f623 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -896,14 +896,24 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0,16 /* load up segment register values */ + li r0, NUM_USER_SEGMENTS /* load up segment register values */ mtctr r0 /* for context 0 */ lis r3,0x2000 /* Ku = 1, VSID = 0 */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif li r4,0 3: mtsrin r3,r4 addi r3,r3,0x111 /* increment VSID */ addis r4,r4,0x1000 /* address of next segment */ bdnz 3b + li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ + mtctr r0 /* for context 0 */ + rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b /* Load the BAT registers with the values set up by MMU_init. MMU_init takes care of whether we're on a 601 or not. */ @@ -1007,6 +1017,9 @@ _ENTRY(switch_mmu_context) mulli r3,r3,897 /* multiply context by skew factor */ rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ addis r3,r3,0x6000 /* Set Ks, Ku bits */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif li r0,NUM_USER_SEGMENTS mtctr r0 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index f29d2f118b44..baa75673b1f5 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -394,3 +394,16 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, else /* Anything else has 256M mapped */ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); } + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + if (cpu_has_feature(CPU_FTR_601)) + pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); + + if (disabled) + pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index ab586963893a..6bc0a4c08c1c 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -25,6 +25,7 @@ config PPC_BOOK3S_32 bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" select PPC_FPU select PPC_HAVE_PMU_SUPPORT + select PPC_HAVE_KUEP config PPC_85xx bool "Freescale 85xx" -- cgit v1.2.3-59-g8ed1b From f342adca3afc84c4ef648352440ed6331518d72d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:36 +0000 Subject: powerpc/32s: Prepare Kernel Userspace Access Protection This patch prepares Kernel Userspace Access Protection for book3s/32. Due to limitations of the processor page protection capabilities, the protection is only against writing. read protection cannot be achieved using page protection. 
book3s/32 provides the following values for PP bits: PP00 provides RW for Key 0 and NA for Key 1 PP01 provides RW for Key 0 and RO for Key 1 PP10 provides RW for all PP11 provides RO for all Today PP10 is used for RW pages and PP11 for RO pages, and user segment register's Kp and Ks are set to 1. This patch modifies page protection to use PP01 for RW pages and sets user segment registers to Kp 0 and Ks 0. This will allow to setup Userspace write access protection by settng Ks to 1 in the following patch. Kernel space segment registers remain unchanged. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 2 ++ arch/powerpc/kernel/head_32.S | 22 +++++++++++----------- arch/powerpc/mm/hash_low_32.S | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 8c5727a322b1..f9eae105a9f4 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -65,6 +65,8 @@ typedef pte_t *pgtable_t; /* Values for Segment Registers */ #define SR_NX 0x10000000 /* No Execute */ +#define SR_KP 0x20000000 /* User key */ +#define SR_KS 0x40000000 /* Supervisor key */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 19b46cb9f623..69b97cc7079f 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -522,9 +522,9 @@ InstructionTLBMiss: andc. r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - ori r1, r1, 0xe05 /* clear out reserved bits */ - andc r1, r0, r1 /* PP = user? 2 : 0 */ + rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ + ori r1, r1, 0xe06 /* clear out reserved bits */ + andc r1, r0, r1 /* PP = user? 1 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -590,11 +590,11 @@ DataLoadTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwinm r1,r0,32-9,30,30 /* _PAGE_RW -> PP msb */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? rw? 2: 3: 0 */ + andc r1,r0,r1 /* PP = user? rw? 1: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -670,9 +670,9 @@ DataStoreTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - li r1,0xe05 /* clear out reserved bits & PP lsb */ - andc r1,r0,r1 /* PP = user? 2: 0 */ + rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ + li r1,0xe06 /* clear out reserved bits & PP msb */ + andc r1,r0,r1 /* PP = user? 
1: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -896,9 +896,9 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0, NUM_USER_SEGMENTS /* load up segment register values */ + li r0, NUM_USER_SEGMENTS /* load up user segment register values */ mtctr r0 /* for context 0 */ - lis r3,0x2000 /* Ku = 1, VSID = 0 */ + li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ #endif @@ -910,6 +910,7 @@ load_up_mmu: li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ mtctr r0 /* for context 0 */ rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + oris r3, r3, SR_KP@h /* Kp = 1 */ 3: mtsrin r3, r4 addi r3, r3, 0x111 /* increment VSID */ addis r4, r4, 0x1000 /* address of next segment */ bdnz 3b @@ -1016,7 +1017,6 @@ _ENTRY(switch_mmu_context) blt- 4f mulli r3,r3,897 /* multiply context by skew factor */ rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ - addis r3,r3,0x6000 /* Set Ks, Ku bits */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ #endif diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index a6c491f18a04..e27792d0b744 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -309,13 +309,13 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ ori r8,r8,0xe04 /* clear out reserved bits */ - andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) -- cgit v1.2.3-59-g8ed1b From a68c31fc01ef7863acc0fc74694bf279456a58c4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:38 +0000 Subject: powerpc/32s: Implement Kernel Userspace Access Protection This patch implements Kernel Userspace Access Protection for book3s/32. Due to limitations of the processor page protection capabilities, the protection is only against writing. Read protection cannot be achieved using page protection. The previous patch modifies the page protection so that RW user pages are RW for Key 0 and RO for Key 1, and it sets Key 0 for both user and kernel. With this patch, userspace segment registers are set to Ku 0 and Ks 1. When the kernel needs to write to RW pages, the associated segment register is then changed to Ks 0 in order to allow write access to the kernel. In order to avoid having to read all segment registers when locking/unlocking the access, some data is kept in the thread_struct and saved on the stack on exceptions. The field identifies both the first unlocked segment and the first segment following the last unlocked one. When no segment is unlocked, it contains the value 0. As the hash_page() function is not able to easily determine if a protfault is due to a bad kernel access to userspace, protfaults need to be handled by handle_page_fault() when KUAP is set.
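To make the thread.kuap encoding concrete, here is a minimal stand-alone sketch in plain user-space C, not kernel code: it reproduces the expression used by allow_user_access() in the diff below, with a made-up sample address and size.

#include <stdio.h>
#include <stdint.h>

/* Mirror of the encoding in allow_user_access() below: the top nibble
 * keeps the base of the first unlocked 256MB segment, the low nibble
 * the number of the segment following the last unlocked one.
 * 0 means no segment is unlocked. */
static uint32_t kuap_encode(uint32_t addr, uint32_t size)
{
	if (!addr || !size)
		return 0;
	return (addr & 0xf0000000) | ((((addr + size - 1) >> 28) + 1) & 0xf);
}

int main(void)
{
	/* hypothetical example: unlock 0x1000 bytes at user address 0x30000000 */
	uint32_t kuap = kuap_encode(0x30000000, 0x1000);

	printf("kuap = 0x%08x\n", (unsigned)kuap);                       /* 0x30000004 */
	printf("first unlocked segment = %u\n", (unsigned)(kuap >> 28)); /* 3 */
	printf("first locked segment   = %u\n", (unsigned)(kuap & 0xf)); /* 4 */
	return 0;
}

With a single word carrying the whole window, bad_kuap_fault() below only has to test regs->kuap to decide whether a write protfault hit a locked segment.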
Signed-off-by: Christophe Leroy [mpe: Drop allow_read/write_to/from_user() as they're now in kup.h, and adapt allow_user_access() to do nothing when to == NULL] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/kup.h | 103 +++++++++++++++++++++++++++++++ arch/powerpc/include/asm/processor.h | 3 + arch/powerpc/kernel/asm-offsets.c | 3 + arch/powerpc/kernel/head_32.S | 11 ++++ arch/powerpc/mm/ppc_mmu_32.c | 10 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + 6 files changed, 131 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 5f97c742ca71..677e9babef80 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -37,6 +37,109 @@ #endif .endm +#ifdef CONFIG_PPC_KUAP + +.macro kuap_update_sr gpr1, gpr2, gpr3 /* NEVER use r0 as gpr2 due to addis */ +101: mtsrin \gpr1, \gpr2 + addi \gpr1, \gpr1, 0x111 /* next VSID */ + rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */ + addis \gpr2, \gpr2, 0x1000 /* address of next segment */ + cmplw \gpr2, \gpr3 + blt- 101b + isync +.endm + +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 + lwz \gpr2, KUAP(\thread) + rlwinm. \gpr3, \gpr2, 28, 0xf0000000 + stw \gpr2, STACK_REGS_KUAP(\sp) + beq+ 102f + li \gpr1, 0 + stw \gpr1, KUAP(\thread) + mfsrin \gpr1, \gpr2 + oris \gpr1, \gpr1, SR_KS@h /* set Ks */ + kuap_update_sr \gpr1, \gpr2, \gpr3 +102: +.endm + +.macro kuap_restore sp, current, gpr1, gpr2, gpr3 + lwz \gpr2, STACK_REGS_KUAP(\sp) + rlwinm. \gpr3, \gpr2, 28, 0xf0000000 + stw \gpr2, THREAD + KUAP(\current) + beq+ 102f + mfsrin \gpr1, \gpr2 + rlwinm \gpr1, \gpr1, 0, ~SR_KS /* Clear Ks */ + kuap_update_sr \gpr1, \gpr2, \gpr3 +102: +.endm + +.macro kuap_check current, gpr +#ifdef CONFIG_PPC_KUAP_DEBUG + lwz \gpr2, KUAP(thread) +999: twnei \gpr, 0 + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif +.endm + +#endif /* CONFIG_PPC_KUAP */ + +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_PPC_KUAP + +#include + +static inline void kuap_update_sr(u32 sr, u32 addr, u32 end) +{ + barrier(); /* make sure thread.kuap is updated before playing with SRs */ + while (addr < end) { + mtsrin(sr, addr); + sr += 0x111; /* next VSID */ + sr &= 0xf0ffffff; /* clear VSID overflow */ + addr += 0x10000000; /* address of next segment */ + } + isync(); /* Context sync required after mtsrin() */ +} + +static inline void allow_user_access(void __user *to, const void __user *from, u32 size) +{ + u32 addr, end; + + if (__builtin_constant_p(to) && to == NULL) + return; + + addr = (__force u32)to; + + if (!addr || addr >= TASK_SIZE || !size) + return; + + end = min(addr + size, TASK_SIZE); + current->thread.kuap = (addr & 0xf0000000) | ((((end - 1) >> 28) + 1) & 0xf); + kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end); /* Clear Ks */ +} + +static inline void prevent_user_access(void __user *to, const void __user *from, u32 size) +{ + u32 addr = (__force u32)to; + u32 end = min(addr + size, TASK_SIZE); + + if (!addr || addr >= TASK_SIZE || !size) + return; + + current->thread.kuap = 0; + kuap_update_sr(mfsrin(addr) | SR_KS, addr, end); /* set Ks */ +} + +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + if (!is_write) + return false; + + return WARN(!regs->kuap, "Bug: write fault blocked by segment registers !"); +} + +#endif /* CONFIG_PPC_KUAP */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h index 3351bcf42f2d..540949b397d4 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -163,6 +163,9 @@ struct thread_struct { #ifdef CONFIG_PPC_RTAS unsigned long rtas_sp; /* stack pointer for when in RTAS */ #endif +#endif +#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) + unsigned long kuap; /* opened segments for user access */ #endif /* Debug Registers */ struct debug_reg debug; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 66202e02fee2..60b82198de7c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -147,6 +147,9 @@ int main(void) #if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) OFFSET(THREAD_KVM_VCPU, thread_struct, kvm_vcpu); #endif +#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) + OFFSET(KUAP, thread_struct, kuap); +#endif #ifdef CONFIG_PPC_TRANSACTIONAL_MEM OFFSET(PACATMSCRATCH, paca_struct, tm_scratch); diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 69b97cc7079f..40aec3f00a05 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -387,7 +387,11 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) +#ifdef CONFIG_PPC_KUAP + andis. r0,r10,(DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h +#else andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h +#endif bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ @@ -901,6 +905,9 @@ load_up_mmu: li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ #endif li r4,0 3: mtsrin r3,r4 @@ -910,6 +917,7 @@ load_up_mmu: li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ mtctr r0 /* for context 0 */ rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + rlwinm r3, r3, 0, ~SR_KS /* Ks = 0 */ oris r3, r3, SR_KP@h /* Kp = 1 */ 3: mtsrin r3, r4 addi r3, r3, 0x111 /* increment VSID */ @@ -1019,6 +1027,9 @@ _ENTRY(switch_mmu_context) rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ #endif li r0,NUM_USER_SEGMENTS mtctr r0 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index baa75673b1f5..bf1de3ca39bc 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -407,3 +407,13 @@ void __init setup_kuep(bool disabled) pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); } #endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 6bc0a4c08c1c..60a7c7095b05 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -26,6 +26,7 @@ config PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP + select PPC_HAVE_KUAP config PPC_85xx bool "Freescale 85xx" -- cgit v1.2.3-59-g8ed1b From 6161a37307f3320808b5a7549593b991500f2656 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 3 Apr 2019 11:35:14 +0530 Subject: powerpc/mm: Fix build error with FLATMEM book3s64 
config The current value of MAX_PHYSMEM_BITS cannot work with 32 bit configs. We used to have MAX_PHYSMEM_BITS not defined without SPARSEMEM and 32 bit configs never expected a value to be set for MAX_PHYSMEM_BITS. Dependent code such as zsmalloc derived the right values based on other fields. Instead of finding a value that works with different configs, use new values only for book3s_64. For 64 bit booke, use the definition of MAX_PHYSMEM_BITS as per commit a7df61a0e2b6 ("[PATCH] ppc64: Increase sparsemem defaults") That change was done in 2005 and hopefully will work with book3e 64. Fixes: 8bc086899816 ("powerpc/mm: Only define MAX_PHYSMEM_BITS in SPARSEMEM configurations") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 15 +++++++++++++++ arch/powerpc/include/asm/mmu.h | 15 --------------- arch/powerpc/include/asm/nohash/64/mmu.h | 2 ++ 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 1ceee000c18d..a809bdd77322 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -35,6 +35,21 @@ typedef pte_t *pgtable_t; #endif /* __ASSEMBLY__ */ +/* + * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS + * if we increase SECTIONS_WIDTH we will not store node details in page->flags and + * page_to_nid does a page->section->node lookup + * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce + * memory requirements with large number of sections. + * 51 bits is the max physical real address on POWER9 + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ + defined(CONFIG_PPC_64K_PAGES) +#define MAX_PHYSMEM_BITS 51 +#else +#define MAX_PHYSMEM_BITS 46 +#endif + /* 64-bit classic hash table MMU */ #include diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 38d21adfde40..d86c5641bd97 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -349,21 +349,6 @@ static inline bool strict_kernel_rwx_enabled(void) */ #define MMU_PAGE_COUNT 16 -/* - * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS - * if we increase SECTIONS_WIDTH we will not store node details in page->flags and - * page_to_nid does a page->section->node lookup - * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce - * memory requirements with large number of sections. 
- * 51 bits is the max physical real address on POWER9 - */ -#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ - defined (CONFIG_PPC_64K_PAGES) -#define MAX_PHYSMEM_BITS 51 -#elif defined(CONFIG_PPC64) -#define MAX_PHYSMEM_BITS 46 -#endif - #ifdef CONFIG_PPC_BOOK3S_64 #include #else /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h index e6585480dfc4..81cf30c370e5 100644 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ b/arch/powerpc/include/asm/nohash/64/mmu.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_NOHASH_64_MMU_H_ #define _ASM_POWERPC_NOHASH_64_MMU_H_ +#define MAX_PHYSMEM_BITS 44 + /* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ #include -- cgit v1.2.3-59-g8ed1b From 4f40b15f339d896f5726714842947c9339742494 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:47 +0530 Subject: powerpc/mm: Remove PPC_MM_SLICES #ifdef for book3s64 Book3s64 always has PPC_MM_SLICES enabled, so remove the unnecessary #ifdef. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 4 ---- arch/powerpc/include/asm/book3s/64/slice.h | 13 ------------- 2 files changed, 17 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index a809bdd77322..afe10dd11c68 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -138,7 +138,6 @@ typedef struct { /* NPU NMMU context */ struct npu_context *npu_context; -#ifdef CONFIG_PPC_MM_SLICES /* SLB page size encodings*/ unsigned char low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; @@ -151,9 +150,6 @@ typedef struct { struct slice_mask mask_16m; struct slice_mask mask_16g; # endif -#else - u16 sllp; /* SLB page size encoding */ -#endif unsigned long vdso_base; #ifdef CONFIG_PPC_SUBPAGE_PROT struct subpage_prot_table spt; diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index db0dedab65ee..062e11136e9c 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -2,8 +2,6 @@ #ifndef _ASM_POWERPC_BOOK3S_64_SLICE_H #define _ASM_POWERPC_BOOK3S_64_SLICE_H -#ifdef CONFIG_PPC_MM_SLICES - #define SLICE_LOW_SHIFT 28 #define SLICE_LOW_TOP (0x100000000ul) #define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT) @@ -13,15 +11,4 @@ #define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) -#else /* CONFIG_PPC_MM_SLICES */ - -#define get_slice_psize(mm, addr) ((mm)->context.user_psize) -#define slice_set_user_psize(mm, psize) \ -do { \ - (mm)->context.user_psize = (psize); \ - (mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \ -} while (0) - -#endif /* CONFIG_PPC_MM_SLICES */ - #endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */ -- cgit v1.2.3-59-g8ed1b From 60458fba469a695a026334b364cf8adbcd5807e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:48 +0530 Subject: powerpc/mm: Add helpers for accessing hash translation related variables We want to switch to allocating them at runtime, only when hash translation is enabled. Add helpers so that both book3s and nohash can be adapted to the upcoming change easily.
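A simplified sketch of the accessor pattern this patch introduces may help; the struct below is a stand-in with made-up fields, not the kernel's real mm_context_t. The point is that call sites stop dereferencing context fields directly, so a later patch can move the storage behind a pointer without touching every caller.

#include <stdio.h>

/* Stand-in context type; the real mm_context_t carries many more fields. */
typedef struct {
	unsigned short user_psize;    /* page size index */
	unsigned long slb_addr_limit; /* user address space limit */
} mm_context_t;

static inline unsigned short mm_ctx_user_psize(mm_context_t *ctx)
{
	return ctx->user_psize;
}

static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit)
{
	ctx->slb_addr_limit = limit;
}

int main(void)
{
	mm_context_t ctx = { .user_psize = 2 }; /* 2 is an arbitrary index */

	/* before: ctx.slb_addr_limit = 1UL << 48; after, via the helper: */
	mm_ctx_set_slb_addr_limit(&ctx, 1UL << 48);
	printf("psize=%u limit=0x%lx\n", mm_ctx_user_psize(&ctx), ctx.slb_addr_limit);
	return 0;
}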
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 4 +- arch/powerpc/include/asm/book3s/64/mmu.h | 63 ++++++++++++++++++++++++++- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 50 +++++++++++++++++++++ arch/powerpc/kernel/paca.c | 12 ++--- arch/powerpc/mm/hash_utils_64.c | 10 ++--- arch/powerpc/mm/slb.c | 2 +- arch/powerpc/mm/slice.c | 49 ++++++++++----------- arch/powerpc/mm/subpage-prot.c | 8 ++-- 8 files changed, 154 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index a28a28079edb..eb36fbfe4ef5 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -657,8 +657,8 @@ extern void slb_set_size(u16 size); /* 4 bits per slice and we have one slice per 1TB */ #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) -#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.slb_addr_limit >> 41) - +#define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->slb_addr_limit >> 41) #ifndef __ASSEMBLY__ #ifdef CONFIG_PPC_SUBPAGE_PROT diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index afe10dd11c68..c9f317090620 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -139,7 +139,7 @@ typedef struct { struct npu_context *npu_context; /* SLB page size encodings*/ - unsigned char low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; + unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; unsigned long slb_addr_limit; # ifdef CONFIG_PPC_64K_PAGES @@ -174,6 +174,67 @@ typedef struct { #endif } mm_context_t; +static inline u16 mm_ctx_user_psize(mm_context_t *ctx) +{ + return ctx->user_psize; +} + +static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) +{ + ctx->user_psize = user_psize; +} + +static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) +{ + return ctx->low_slices_psize; +} + +static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) +{ + return ctx->high_slices_psize; +} + +static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) +{ + return ctx->slb_addr_limit; +} + +static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) +{ + ctx->slb_addr_limit = limit; +} + +#ifdef CONFIG_PPC_64K_PAGES +static inline struct slice_mask *mm_ctx_slice_mask_64k(mm_context_t *ctx) +{ + return &ctx->mask_64k; +} +#endif + +static inline struct slice_mask *mm_ctx_slice_mask_4k(mm_context_t *ctx) +{ + return &ctx->mask_4k; +} + +#ifdef CONFIG_HUGETLB_PAGE +static inline struct slice_mask *mm_ctx_slice_mask_16m(mm_context_t *ctx) +{ + return &ctx->mask_16m; +} + +static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) +{ + return &ctx->mask_16g; +} +#endif + +#ifdef CONFIG_PPC_SUBPAGE_PROT +static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) +{ + return &ctx->spt; +} +#endif + /* * The current system page and segment sizes */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index f620adef54fc..c503e2f05e61 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -181,6 +181,7 @@ #ifdef CONFIG_PPC_MM_SLICES #include #define SLICE_ARRAY_SIZE (1 << (32 - SLICE_LOW_SHIFT - 1)) +#define 
LOW_SLICE_ARRAY_SZ SLICE_ARRAY_SIZE #endif #ifndef __ASSEMBLY__ @@ -207,6 +208,55 @@ typedef struct { void *pte_frag; } mm_context_t; +#ifdef CONFIG_PPC_MM_SLICES +static inline u16 mm_ctx_user_psize(mm_context_t *ctx) +{ + return ctx->user_psize; +} + +static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) +{ + ctx->user_psize = user_psize; +} + +static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) +{ + return ctx->low_slices_psize; +} + +static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) +{ + return ctx->high_slices_psize; +} + +static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) +{ + return ctx->slb_addr_limit; +} + +static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) +{ + ctx->slb_addr_limit = limit; +} + +static inline struct slice_mask *mm_ctx_slice_mask_base(mm_context_t *ctx) +{ + return &ctx->mask_base_psize; +} + +#ifdef CONFIG_HUGETLB_PAGE +static inline struct slice_mask *mm_ctx_slice_mask_512k(mm_context_t *ctx) +{ + return &ctx->mask_512k; +} + +static inline struct slice_mask *mm_ctx_slice_mask_8m(mm_context_t *ctx) +{ + return &ctx->mask_8m; +} +#endif +#endif /* CONFIG_PPC_MM_SLICE */ + #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000) #define VIRT_IMMR_BASE (__fix_to_virt(FIX_IMMR_BASE)) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index e7382abee868..9cc91d03ab62 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -267,12 +267,12 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_id = context->id; #ifdef CONFIG_PPC_MM_SLICES - VM_BUG_ON(!mm->context.slb_addr_limit); - get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; - memcpy(&get_paca()->mm_ctx_low_slices_psize, - &context->low_slices_psize, sizeof(context->low_slices_psize)); - memcpy(&get_paca()->mm_ctx_high_slices_psize, - &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); + VM_BUG_ON(!mm_ctx_slb_addr_limit(context)); + get_paca()->mm_ctx_slb_addr_limit = mm_ctx_slb_addr_limit(context); + memcpy(&get_paca()->mm_ctx_low_slices_psize, mm_ctx_low_slices(context), + LOW_SLICE_ARRAY_SZ); + memcpy(&get_paca()->mm_ctx_high_slices_psize, mm_ctx_high_slices(context), + TASK_SLICE_ARRAY_SZ(context)); #else /* CONFIG_PPC_MM_SLICES */ get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index c4c9610ce6e3..fee0270618ac 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1142,7 +1142,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) */ static int subpage_protection(struct mm_struct *mm, unsigned long ea) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 spp = 0; u32 **sbpm, *sbpp; @@ -1465,7 +1465,7 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) int psize = get_slice_psize(mm, ea); /* We only prefault standard pages for now */ - if (unlikely(psize != mm->context.user_psize)) + if (unlikely(psize != mm_ctx_user_psize(&mm->context))) return false; /* @@ -1544,7 +1544,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* Hash it in */ #ifdef CONFIG_PPC_64K_PAGES - if (mm->context.user_psize == MMU_PAGE_64K) + if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) rc = __hash_page_64K(ea, access, vsid, ptep, trap, update_flags, ssize); else @@ -1557,8 +1557,8 @@ 
void hash_preload(struct mm_struct *mm, unsigned long ea, */ if (rc == -1) hash_failure_debug(ea, access, vsid, trap, ssize, - mm->context.user_psize, - mm->context.user_psize, + mm_ctx_user_psize(&mm->context), + mm_ctx_user_psize(&mm->context), pte_val(*ptep)); out_exit: local_irq_restore(flags); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5986df48359b..78c0c0a0e355 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -739,7 +739,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) * consider this as bad access if we take a SLB miss * on an address above addr limit. */ - if (ea >= mm->context.slb_addr_limit) + if (ea >= mm_ctx_slb_addr_limit(&mm->context)) return -EFAULT; context = get_user_context(&mm->context, ea); diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index aec91dbcdc0b..35b278082391 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -101,7 +101,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *vma; - if ((mm->context.slb_addr_limit - len) < addr) + if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr) return 0; vma = find_vma(mm, addr); return (!vma || (addr + len) <= vm_start_gap(vma)); @@ -155,15 +155,15 @@ static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) { #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) - return &mm->context.mask_64k; + return mm_ctx_slice_mask_64k(&mm->context); #endif if (psize == MMU_PAGE_4K) - return &mm->context.mask_4k; + return mm_ctx_slice_mask_4k(&mm->context); #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_16M) - return &mm->context.mask_16m; + return mm_ctx_slice_mask_16m(&mm->context); if (psize == MMU_PAGE_16G) - return &mm->context.mask_16g; + return mm_ctx_slice_mask_16g(&mm->context); #endif BUG(); } @@ -253,7 +253,7 @@ static void slice_convert(struct mm_struct *mm, */ spin_lock_irqsave(&slice_convert_lock, flags); - lpsizes = mm->context.low_slices_psize; + lpsizes = mm_ctx_low_slices(&mm->context); for (i = 0; i < SLICE_NUM_LOW; i++) { if (!(mask->low_slices & (1u << i))) continue; @@ -272,8 +272,8 @@ static void slice_convert(struct mm_struct *mm, (((unsigned long)psize) << (mask_index * 4)); } - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { + hpsizes = mm_ctx_high_slices(&mm->context); + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) { if (!test_bit(i, mask->high_slices)) continue; @@ -292,8 +292,8 @@ static void slice_convert(struct mm_struct *mm, } slice_dbg(" lsps=%lx, hsps=%lx\n", - (unsigned long)mm->context.low_slices_psize, - (unsigned long)mm->context.high_slices_psize); + (unsigned long)mm_ctx_low_slices(&mm->context), + (unsigned long)mm_ctx_high_slices(&mm->context)); spin_unlock_irqrestore(&slice_convert_lock, flags); @@ -393,7 +393,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * DEFAULT_MAP_WINDOW we should apply this. 
*/ if (high_limit > DEFAULT_MAP_WINDOW) - addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; + addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW; while (addr > min_addr) { info.high_limit = addr; @@ -505,20 +505,20 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, return -ENOMEM; } - if (high_limit > mm->context.slb_addr_limit) { + if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) { /* * Increasing the slb_addr_limit does not require * slice mask cache to be recalculated because it should * be already initialised beyond the old address limit. */ - mm->context.slb_addr_limit = high_limit; + mm_ctx_set_slb_addr_limit(&mm->context, high_limit); on_each_cpu(slice_flush_segments, mm, 1); } /* Sanity checks */ BUG_ON(mm->task_size == 0); - BUG_ON(mm->context.slb_addr_limit == 0); + BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0); VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); @@ -696,7 +696,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags) { return slice_get_unmapped_area(addr, len, flags, - current->mm->context.user_psize, 0); + mm_ctx_user_psize(¤t->mm->context), 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -706,7 +706,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags) { return slice_get_unmapped_area(addr0, len, flags, - current->mm->context.user_psize, 1); + mm_ctx_user_psize(¤t->mm->context), 1); } unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) @@ -717,10 +717,10 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) VM_BUG_ON(radix_enabled()); if (slice_addr_is_low(addr)) { - psizes = mm->context.low_slices_psize; + psizes = mm_ctx_low_slices(&mm->context); index = GET_LOW_SLICE_INDEX(addr); } else { - psizes = mm->context.high_slices_psize; + psizes = mm_ctx_high_slices(&mm->context); index = GET_HIGH_SLICE_INDEX(addr); } mask_index = index & 0x1; @@ -742,20 +742,19 @@ void slice_init_new_context_exec(struct mm_struct *mm) * duplicated. */ #ifdef CONFIG_PPC64 - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW_USER64); #else mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; #endif - - mm->context.user_psize = psize; + mm_ctx_set_user_psize(&mm->context, psize); /* * Set all slice psizes to the default. 
*/ - lpsizes = mm->context.low_slices_psize; + lpsizes = mm_ctx_low_slices(&mm->context); memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); - hpsizes = mm->context.high_slices_psize; + hpsizes = mm_ctx_high_slices(&mm->context); memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); /* @@ -777,7 +776,7 @@ void slice_setup_new_exec(void) if (!is_32bit_task()) return; - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); } #endif @@ -816,7 +815,7 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { const struct slice_mask *maskp; - unsigned int psize = mm->context.user_psize; + unsigned int psize = mm_ctx_user_psize(&mm->context); VM_BUG_ON(radix_enabled()); diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 5e4178790dee..c72252542210 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -25,7 +25,7 @@ */ void subpage_prot_free(struct mm_struct *mm) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); unsigned long i, j, addr; u32 **p; @@ -52,7 +52,7 @@ void subpage_prot_free(struct mm_struct *mm) void subpage_prot_init_new_context(struct mm_struct *mm) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); memset(spt, 0, sizeof(*spt)); } @@ -93,7 +93,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 **spm, *spp; unsigned long i; size_t nw; @@ -189,7 +189,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 **spm, *spp; unsigned long i; size_t nw; -- cgit v1.2.3-59-g8ed1b From 67fda38f0d688580a916a8f829afd8edaffadfcf Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:49 +0530 Subject: powerpc/mm: Move slb_addr_linit to early_init_mmu Avoid #ifdef in generic code. Also enables us to do this specific to MMU translation mode on book3s64 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 11 ----------- arch/powerpc/mm/hash_utils_64.c | 2 ++ arch/powerpc/mm/tlb_nohash.c | 6 ++++++ 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2e5dfb6e0823..a07de8608484 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -947,17 +947,6 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; -#ifdef CONFIG_PPC_MM_SLICES -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; -#elif defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#else -#error "context.addr_limit not initialized." 
-#endif -#endif - #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fee0270618ac..fe2fc43a8664 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1036,6 +1036,8 @@ void __init hash__early_init_mmu(void) */ htab_initialize(); + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ slb_initialize(); diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ac23dc1c6535..088e0a6b5ade 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -800,5 +800,11 @@ void __init early_init_mmu(void) #ifdef CONFIG_PPC_47x early_init_mmu_47x(); #endif + +#ifdef CONFIG_PPC_MM_SLICES +#if defined(CONFIG_PPC_8xx) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; +#endif +#endif } #endif /* CONFIG_PPC64 */ -- cgit v1.2.3-59-g8ed1b From 701101865f5d3e268281ce7a254eb4a97d16cbdc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:50 +0530 Subject: powerpc/mm: Reduce memory usage for mm_context_t for radix Currently, our mm_context_t on book3s64 include all hash specific context details like slice mask and subpage protection details. We can skip allocating these with radix translation. This will help us to save 8K per mm_context with radix translation. With the patch applied we have sizeof(mm_context_t) = 136 sizeof(struct hash_mm_context) = 8288 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 33 +++++++++++++++++- arch/powerpc/include/asm/book3s/64/mmu.h | 49 +++++++-------------------- arch/powerpc/kernel/setup-common.c | 6 ++++ arch/powerpc/mm/hash_utils_64.c | 4 ++- arch/powerpc/mm/mmu_context_book3s64.c | 16 ++++++++- 5 files changed, 68 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index eb36fbfe4ef5..4481bedbb5be 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -658,7 +658,7 @@ extern void slb_set_size(u16 size); /* 4 bits per slice and we have one slice per 1TB */ #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) #define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE) -#define TASK_SLICE_ARRAY_SZ(x) ((x)->slb_addr_limit >> 41) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->hash_context->slb_addr_limit >> 41) #ifndef __ASSEMBLY__ #ifdef CONFIG_PPC_SUBPAGE_PROT @@ -693,6 +693,37 @@ static inline void subpage_prot_free(struct mm_struct *mm) {} static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ +/* + * One bit per slice. We have lower slices which cover 256MB segments + * upto 4G range. That gets us 16 low slices. For the rest we track slices + * in 1TB size. 
+ */ +struct slice_mask { + u64 low_slices; + DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); +}; + +struct hash_mm_context { + u16 user_psize; /* page size index */ + + /* SLB page size encodings*/ + unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; + unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; + unsigned long slb_addr_limit; +#ifdef CONFIG_PPC_64K_PAGES + struct slice_mask mask_64k; +#endif + struct slice_mask mask_4k; +#ifdef CONFIG_HUGETLB_PAGE + struct slice_mask mask_16m; + struct slice_mask mask_16g; +#endif + +#ifdef CONFIG_PPC_SUBPAGE_PROT + struct subpage_prot_table spt; +#endif /* CONFIG_PPC_SUBPAGE_PROT */ +}; + #if 0 /* * The code below is equivalent to this function for arguments diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index c9f317090620..e510e46b07ce 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -104,16 +104,6 @@ struct spinlock; /* Maximum possible number of NPUs in a system. */ #define NV_MAX_NPUS 8 -/* - * One bit per slice. We have lower slices which cover 256MB segments - * upto 4G range. That gets us 16 low slices. For the rest we track slices - * in 1TB size. - */ -struct slice_mask { - u64 low_slices; - DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); -}; - typedef struct { union { /* @@ -127,7 +117,6 @@ typedef struct { mm_context_id_t id; mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE]; }; - u16 user_psize; /* page size index */ /* Number of bits in the mm_cpumask */ atomic_t active_cpus; @@ -137,23 +126,9 @@ typedef struct { /* NPU NMMU context */ struct npu_context *npu_context; + struct hash_mm_context *hash_context; - /* SLB page size encodings*/ - unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; - unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; - unsigned long slb_addr_limit; -# ifdef CONFIG_PPC_64K_PAGES - struct slice_mask mask_64k; -# endif - struct slice_mask mask_4k; -# ifdef CONFIG_HUGETLB_PAGE - struct slice_mask mask_16m; - struct slice_mask mask_16g; -# endif unsigned long vdso_base; -#ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; -#endif /* CONFIG_PPC_SUBPAGE_PROT */ /* * pagetable fragment support */ @@ -176,62 +151,62 @@ typedef struct { static inline u16 mm_ctx_user_psize(mm_context_t *ctx) { - return ctx->user_psize; + return ctx->hash_context->user_psize; } static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) { - ctx->user_psize = user_psize; + ctx->hash_context->user_psize = user_psize; } static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) { - return ctx->low_slices_psize; + return ctx->hash_context->low_slices_psize; } static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) { - return ctx->high_slices_psize; + return ctx->hash_context->high_slices_psize; } static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) { - return ctx->slb_addr_limit; + return ctx->hash_context->slb_addr_limit; } static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) { - ctx->slb_addr_limit = limit; + ctx->hash_context->slb_addr_limit = limit; } #ifdef CONFIG_PPC_64K_PAGES static inline struct slice_mask *mm_ctx_slice_mask_64k(mm_context_t *ctx) { - return &ctx->mask_64k; + return &ctx->hash_context->mask_64k; } #endif static inline struct slice_mask *mm_ctx_slice_mask_4k(mm_context_t *ctx) { - return &ctx->mask_4k; + return &ctx->hash_context->mask_4k; } #ifdef CONFIG_HUGETLB_PAGE static inline struct slice_mask 
*mm_ctx_slice_mask_16m(mm_context_t *ctx) { - return &ctx->mask_16m; + return &ctx->hash_context->mask_16m; } static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) { - return &ctx->mask_16g; + return &ctx->hash_context->mask_16g; } #endif #ifdef CONFIG_PPC_SUBPAGE_PROT static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { - return &ctx->spt; + return &ctx->hash_context->spt; } #endif diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index a07de8608484..21b1ce200b22 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -947,6 +947,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; +#ifdef CONFIG_PPC_MM_SLICES +#if defined(CONFIG_PPC_8xx) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; +#endif +#endif + #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fe2fc43a8664..27239a076773 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -963,6 +963,7 @@ void __init hash__early_init_devtree(void) htab_scan_page_sizes(); } +struct hash_mm_context init_hash_mm_context; void __init hash__early_init_mmu(void) { #ifndef CONFIG_PPC_64K_PAGES @@ -1036,7 +1037,8 @@ void __init hash__early_init_mmu(void) */ htab_initialize(); - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + init_mm.context.hash_context = &init_hash_mm_context; + init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index f720c5cc0b5e..6eef5a36b2e9 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -63,6 +63,12 @@ static int hash__init_new_context(struct mm_struct *mm) if (index < 0) return index; + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), GFP_KERNEL); + if (!mm->context.hash_context) { + ida_free(&mmu_context_ida, index); + return -ENOMEM; + } + /* * The old code would re-promote on fork, we don't do that when using * slices as it could cause problem promoting slices that have been @@ -77,8 +83,14 @@ static int hash__init_new_context(struct mm_struct *mm) * We should not be calling init_new_context() on init_mm. Hence a * check against 0 is OK. */ - if (mm->context.id == 0) + if (mm->context.id == 0) { + memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); slice_init_new_context_exec(mm); + } else { + /* This is fork. 
Copy hash_context details from current->mm */ + memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); + + } subpage_prot_init_new_context(mm); @@ -118,6 +130,7 @@ static int radix__init_new_context(struct mm_struct *mm) asm volatile("ptesync;isync" : : : "memory"); mm->context.npu_context = NULL; + mm->context.hash_context = NULL; return index; } @@ -162,6 +175,7 @@ static void destroy_contexts(mm_context_t *ctx) if (context_id) ida_free(&mmu_context_ida, context_id); } + kfree(ctx->hash_context); } static void pmd_frag_destroy(void *pmd_frag) -- cgit v1.2.3-59-g8ed1b From ef629cc5bf0543eb57d6e344ba776880ac35fd00 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:51 +0530 Subject: powerc/mm/hash: Reduce hash_mm_context size Allocate subpage protect related variables only if we use the feature. This helps in reducing the hash related mm context struct by around 4K Before the patch sizeof(struct hash_mm_context) = 8288 After the patch sizeof(struct hash_mm_context) = 4160 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 4 +--- arch/powerpc/include/asm/book3s/64/mmu.h | 2 +- arch/powerpc/mm/hash_utils_64.c | 3 +++ arch/powerpc/mm/mmu_context_book3s64.c | 17 +++++++++++++--- arch/powerpc/mm/subpage-prot.c | 28 ++++++++++++++++++++------- 5 files changed, 40 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 4481bedbb5be..eeb40091b46b 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -687,10 +687,8 @@ struct subpage_prot_table { #define SBP_L3_SHIFT (SBP_L2_SHIFT + SBP_L2_BITS) extern void subpage_prot_free(struct mm_struct *mm); -extern void subpage_prot_init_new_context(struct mm_struct *mm); #else static inline void subpage_prot_free(struct mm_struct *mm) {} -static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ /* @@ -720,7 +718,7 @@ struct hash_mm_context { #endif #ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; + struct subpage_prot_table *spt; #endif /* CONFIG_PPC_SUBPAGE_PROT */ }; diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index e510e46b07ce..230a9dec7677 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -206,7 +206,7 @@ static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) #ifdef CONFIG_PPC_SUBPAGE_PROT static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { - return &ctx->hash_context->spt; + return ctx->hash_context->spt; } #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 27239a076773..6a2d315495a3 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1150,6 +1150,9 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) u32 spp = 0; u32 **sbpm, *sbpp; + if (!spt) + return 0; + if (ea >= spt->maxaddr) return 0; if (ea < 0x100000000UL) { diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 6eef5a36b2e9..cb2b08635508 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -63,7 +63,8 @@ static int hash__init_new_context(struct mm_struct *mm) if (index < 0) return index; 
- mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), GFP_KERNEL); + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), + GFP_KERNEL); if (!mm->context.hash_context) { ida_free(&mmu_context_ida, index); return -ENOMEM; @@ -89,11 +90,21 @@ static int hash__init_new_context(struct mm_struct *mm) } else { /* This is fork. Copy hash_context details from current->mm */ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); +#ifdef CONFIG_PPC_SUBPAGE_PROT + /* inherit subpage prot detalis if we have one. */ + if (current->mm->context.hash_context->spt) { + mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), + GFP_KERNEL); + if (!mm->context.hash_context->spt) { + ida_free(&mmu_context_ida, index); + kfree(mm->context.hash_context); + return -ENOMEM; + } + } +#endif } - subpage_prot_init_new_context(mm); - pkey_mm_init(mm); return index; } diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index c72252542210..c9dff4e1f295 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -29,6 +29,9 @@ void subpage_prot_free(struct mm_struct *mm) unsigned long i, j, addr; u32 **p; + if (!spt) + return; + for (i = 0; i < 4; ++i) { if (spt->low_prot[i]) { free_page((unsigned long)spt->low_prot[i]); @@ -48,13 +51,7 @@ void subpage_prot_free(struct mm_struct *mm) free_page((unsigned long)p); } spt->maxaddr = 0; -} - -void subpage_prot_init_new_context(struct mm_struct *mm) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - - memset(spt, 0, sizeof(*spt)); + kfree(spt); } static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, @@ -99,6 +96,9 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) size_t nw; unsigned long next, limit; + if (!spt) + return ; + down_write(&mm->mmap_sem); limit = addr + len; if (limit > spt->maxaddr) @@ -218,6 +218,20 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, return -EFAULT; down_write(&mm->mmap_sem); + + if (!spt) { + /* + * Allocate subpage prot table if not already done. 
+ * Do this with mmap_sem held + */ + spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); + if (!spt) { + err = -ENOMEM; + goto out; + } + mm->context.hash_context->spt = spt; + } + subpage_mark_vma_nohuge(mm, addr, len); for (limit = addr + len; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); -- cgit v1.2.3-59-g8ed1b From a35a3c6f60657812366fca86a9ce71df1b8f7aff Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:13 +0530 Subject: powerpc/mm/hash64: Add a variable to track the end of IO mapping This makes it easy to update the region mapping in the later patch Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 3 ++- arch/powerpc/include/asm/book3s/64/pgtable.h | 8 +++++--- arch/powerpc/include/asm/book3s/64/radix.h | 1 + arch/powerpc/mm/hash_utils_64.c | 1 + arch/powerpc/mm/pgtable-radix.c | 1 + arch/powerpc/mm/pgtable_64.c | 2 ++ 6 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 54b7af6cd27f..8cbc4106d449 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -69,7 +69,8 @@ #define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) #define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) -#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_END (H_KERN_VIRT_START + H_KERN_VIRT_SIZE) /* * Region IDs diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index e3d18b3f6e5d..f8ab18f77d1b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -277,9 +277,12 @@ extern unsigned long __vmalloc_end; extern unsigned long __kernel_virt_start; extern unsigned long __kernel_virt_size; extern unsigned long __kernel_io_start; +extern unsigned long __kernel_io_end; #define KERN_VIRT_START __kernel_virt_start #define KERN_VIRT_SIZE __kernel_virt_size #define KERN_IO_START __kernel_io_start +#define KERN_IO_END __kernel_io_end + extern struct page *vmemmap; extern unsigned long ioremap_bot; extern unsigned long pci_io_base; @@ -296,8 +299,7 @@ extern unsigned long pci_io_base; #include /* - * The second half of the kernel virtual space is used for IO mappings, - * it's itself carved into the PIO region (ISA and PHB IO space) and + * IO space itself carved into the PIO region (ISA and PHB IO space) and * the ioremap space * * ISA_IO_BASE = KERN_IO_START, 64K reserved area @@ -310,7 +312,7 @@ extern unsigned long pci_io_base; #define PHB_IO_BASE (ISA_IO_END) #define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) -#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) +#define IOREMAP_END (KERN_IO_END) /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 5ab134eeed20..6d760a083d62 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -111,6 +111,7 @@ #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) #define RADIX_KERN_IO_START (RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) +#define RADIX_KERN_IO_END (RADIX_KERN_VIRT_START + RADIX_KERN_VIRT_SIZE) #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) diff --git 
a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 6a2d315495a3..f6ffe8545717 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1013,6 +1013,7 @@ void __init hash__early_init_mmu(void) __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; __kernel_io_start = H_KERN_IO_START; + __kernel_io_end = H_KERN_IO_END; vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 45869fd698a0..4c1a9843d0f2 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -605,6 +605,7 @@ void __init radix__early_init_mmu(void) __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; __kernel_io_start = RADIX_KERN_IO_START; + __kernel_io_end = RADIX_KERN_IO_END; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index fb1375c07e8c..7cea39bdf05f 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -98,6 +98,8 @@ unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); unsigned long __kernel_io_start; EXPORT_SYMBOL(__kernel_io_start); +unsigned long __kernel_io_end; +EXPORT_SYMBOL(__kernel_io_end); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; -- cgit v1.2.3-59-g8ed1b From 0034d395f89d9c092bb15adbabdca5283e258b41 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:14 +0530 Subject: powerpc/mm/hash64: Map all the kernel regions in the same 0xc range This patch maps the vmalloc, IO and vmemmap regions in the 0xc address range instead of the current 0xd and 0xf range. This brings the mapping closer to radix translation mode. With the hash 64K page size, each of these regions is 512TB, whereas with the 4K config we are limited by the max page table range of 64TB, and hence these regions are 16TB in size (a sketch of the resulting region lookup is shown below).
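The sketch below is a stand-alone user-space approximation of that lookup, using the 64K hash constants from the layout listed next; it assumes a 64-bit build, and the kernel's real get_region_id(), shown later in this patch, additionally BUG()s on unexpected top nibbles.

#include <stdio.h>

/* 64K hash layout from the commit message below (the 4K hash config
 * instead uses 16TB regions at 0xc0001.../0xc0002.../0xc0003...). */
#define H_VMALLOC_START 0xc008000000000000UL
#define H_KERN_IO_START 0xc00a000000000000UL
#define H_VMEMMAP_START 0xc00c000000000000UL

static const char *region_name(unsigned long ea)
{
	if ((ea >> 60) == 0)
		return "user";
	/* everything else is expected to sit in the 0xc range now */
	if (ea >= H_VMEMMAP_START)
		return "vmemmap";
	if (ea >= H_KERN_IO_START)
		return "io";
	if (ea >= H_VMALLOC_START)
		return "vmalloc";
	return "kernel linear";
}

int main(void)
{
	unsigned long samples[] = {
		0x0000000010000000UL, /* user */
		0xc000000001000000UL, /* linear map */
		0xc008000000100000UL, /* vmalloc */
		0xc00a000000100000UL, /* ioremap */
		0xc00c000000100000UL, /* vmemmap */
	};
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%016lx -> %s\n", samples[i], region_name(samples[i]));
	return 0;
}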
The kernel mapping is now: On 4K hash kernel_region_map_size = 16TB kernel vmalloc start = 0xc000100000000000 kernel IO start = 0xc000200000000000 kernel vmemmap start = 0xc000300000000000 64K hash, 64K radix and 4k radix: kernel_region_map_size = 512TB kernel vmalloc start = 0xc008000000000000 kernel IO start = 0xc00a000000000000 kernel vmemmap start = 0xc00c000000000000 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 13 ++++ arch/powerpc/include/asm/book3s/64/hash-64k.h | 11 ++++ arch/powerpc/include/asm/book3s/64/hash.h | 95 +++++++++++++++++---------- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 31 +++++---- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/radix.h | 41 ++++++------ arch/powerpc/include/asm/page.h | 3 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 2 +- arch/powerpc/mm/copro_fault.c | 14 ++-- arch/powerpc/mm/hash_utils_64.c | 26 ++++---- arch/powerpc/mm/pgtable-radix.c | 7 +- arch/powerpc/mm/pgtable_64.c | 2 - arch/powerpc/mm/ptdump/hashpagetable.c | 2 +- arch/powerpc/mm/ptdump/ptdump.c | 3 +- arch/powerpc/mm/slb.c | 22 ++++--- arch/powerpc/platforms/cell/spu_base.c | 4 +- drivers/misc/cxl/fault.c | 2 +- drivers/misc/ocxl/link.c | 2 +- 18 files changed, 172 insertions(+), 109 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 54fab723a8c7..4c9dfd625461 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -13,6 +13,19 @@ */ #define MAX_EA_BITS_PER_CONTEXT 46 +/* + * Our page table limit us to 64TB. Hence for the kernel mapping, + * each MAP area is limited to 16 TB. + * The four map areas are: linear mapping, vmap, IO and vmemmap + */ +#define H_KERN_MAP_SIZE (ASM_CONST(1) << (MAX_EA_BITS_PER_CONTEXT - 2)) + +/* + * Define the address range of the kernel non-linear virtual area + * 16TB + */ +#define H_KERN_VIRT_START ASM_CONST(0xc000100000000000) + #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 81f4eb6e7da4..0d0191cda050 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -14,6 +14,17 @@ */ #define MAX_EA_BITS_PER_CONTEXT 49 +/* + * We use one context for each MAP area. + */ +#define H_KERN_MAP_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) + +/* + * Define the address range of the kernel non-linear virtual area + * 2PB + */ +#define H_KERN_VIRT_START ASM_CONST(0xc008000000000000) + /* * 64k aligned address free up few of the lower bits of RPN for us * We steal that here. For more deatils look at pte_pfn/pfn_pte() diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 8cbc4106d449..76741a221910 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -29,6 +29,10 @@ #define H_PGTABLE_EADDR_SIZE (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \ H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT) #define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE) +/* + * Top 2 bits are ignored in page table walk. + */ +#define EA_MASK (~(0xcUL << 60)) /* * We store the slot details in the second half of page table. 
@@ -42,53 +46,56 @@ #endif /* - * Define the address range of the kernel non-linear virtual area. In contrast - * to the linear mapping, this is managed using the kernel page tables and then - * inserted into the hash page table to actually take effect, similarly to user - * mappings. + * +------------------------------+ + * | | + * | | + * | | + * +------------------------------+ Kernel virtual map end (0xc00e000000000000) + * | | + * | | + * | 512TB/16TB of vmemmap | + * | | + * | | + * +------------------------------+ Kernel vmemmap start + * | | + * | 512TB/16TB of IO map | + * | | + * +------------------------------+ Kernel IO map start + * | | + * | 512TB/16TB of vmap | + * | | + * +------------------------------+ Kernel virt start (0xc008000000000000) + * | | + * | | + * | | + * +------------------------------+ Kernel linear (0xc.....) */ -#define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) -/* - * Allow virtual mapping of one context size. - * 512TB for 64K page size - * 64TB for 4K page size - */ -#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE H_KERN_MAP_SIZE +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) -/* - * 8TB IO mapping size - */ -#define H_KERN_IO_SIZE ASM_CONST(0x80000000000) /* 8T */ - -/* - * The vmalloc space starts at the beginning of the kernel non-linear virtual - * region, and occupies 504T (64K) or 56T (4K) - */ -#define H_VMALLOC_START H_KERN_VIRT_START -#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) -#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_SIZE H_KERN_MAP_SIZE +#define H_KERN_IO_END (H_KERN_IO_START + H_KERN_IO_SIZE) -#define H_KERN_IO_START H_VMALLOC_END -#define H_KERN_IO_END (H_KERN_VIRT_START + H_KERN_VIRT_SIZE) +#define H_VMEMMAP_START H_KERN_IO_END +#define H_VMEMMAP_SIZE H_KERN_MAP_SIZE +#define H_VMEMMAP_END (H_VMEMMAP_START + H_VMEMMAP_SIZE) /* * Region IDs */ -#define REGION_SHIFT 60UL -#define REGION_MASK (0xfUL << REGION_SHIFT) -#define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) - -#define VMALLOC_REGION_ID (REGION_ID(H_VMALLOC_START)) -#define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) -#define VMEMMAP_REGION_ID (0xfUL) /* Server only */ -#define USER_REGION_ID (0UL) +#define USER_REGION_ID 1 +#define KERNEL_REGION_ID 2 +#define VMALLOC_REGION_ID 3 +#define IO_REGION_ID 4 +#define VMEMMAP_REGION_ID 5 /* * Defines the address of the vmemap area, in its own region on * hash table CPUs. 
*/ -#define H_VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA @@ -104,6 +111,26 @@ #define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) #ifndef __ASSEMBLY__ +static inline int get_region_id(unsigned long ea) +{ + int id = (ea >> 60UL); + + if (id == 0) + return USER_REGION_ID; + + VM_BUG_ON(id != 0xc); + VM_BUG_ON(ea >= H_VMEMMAP_END); + + if (ea >= H_VMEMMAP_START) + return VMEMMAP_REGION_ID; + else if (ea >= H_KERN_IO_START) + return IO_REGION_ID; + else if (ea >= H_VMALLOC_START) + return VMALLOC_REGION_ID; + + return KERNEL_REGION_ID; +} + #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) static inline int hash__pgd_bad(pgd_t pgd) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index eeb40091b46b..8a30bf189f10 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -588,7 +588,8 @@ extern void slb_set_size(u16 size); #endif #define MAX_VMALLOC_CTX_CNT 1 -#define MAX_MEMMAP_CTX_CNT 1 +#define MAX_IO_CTX_CNT 1 +#define MAX_VMEMMAP_CTX_CNT 1 /* * 256MB segment @@ -601,13 +602,10 @@ extern void slb_set_size(u16 size); * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 * because of the modulo operation in vsid scramble. * - * We add one extra context to MIN_USER_CONTEXT so that we can map kernel - * context easily. The +1 is to map the unused 0xe region mapping. */ #define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) #define MIN_USER_CONTEXT (MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \ - MAX_MEMMAP_CTX_CNT + 2) - + MAX_IO_CTX_CNT + MAX_VMEMMAP_CTX_CNT) /* * For platforms that support on 65bit VA we limit the context bits */ @@ -776,7 +774,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, /* * Bad address. We return VSID 0 for that */ - if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) + if ((ea & EA_MASK) >= H_PGTABLE_RANGE) return 0; if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) @@ -803,28 +801,29 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, * 0x00002 - [ 0xc002000000000000 - 0xc003ffffffffffff] * 0x00003 - [ 0xc004000000000000 - 0xc005ffffffffffff] * 0x00004 - [ 0xc006000000000000 - 0xc007ffffffffffff] - - * 0x00005 - [ 0xd000000000000000 - 0xd001ffffffffffff ] - * 0x00006 - Not used - Can map 0xe000000000000000 range. - * 0x00007 - [ 0xf000000000000000 - 0xf001ffffffffffff ] * - * So we can compute the context from the region (top nibble) by - * subtracting 11, or 0xc - 1. + * vmap, IO, vmemap + * + * 0x00005 - [ 0xc008000000000000 - 0xc009ffffffffffff] + * 0x00006 - [ 0xc00a000000000000 - 0xc00bffffffffffff] + * 0x00007 - [ 0xc00c000000000000 - 0xc00dffffffffffff] + * */ static inline unsigned long get_kernel_context(unsigned long ea) { - unsigned long region_id = REGION_ID(ea); + unsigned long region_id = get_region_id(ea); unsigned long ctx; /* - * For linear mapping we do support multiple context + * Depending on Kernel config, kernel region can have one context + * or more. */ if (region_id == KERNEL_REGION_ID) { /* * We already verified ea to be not beyond the addr limit. 
*/ - ctx = 1 + ((ea & ~REGION_MASK) >> MAX_EA_BITS_PER_CONTEXT); + ctx = 1 + ((ea & EA_MASK) >> MAX_EA_BITS_PER_CONTEXT); } else - ctx = (region_id - 0xc) + MAX_KERNEL_CTX_CNT; + ctx = region_id + MAX_KERNEL_CTX_CNT - 2; return ctx; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index f8ab18f77d1b..7dede2e34b70 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -279,7 +279,6 @@ extern unsigned long __kernel_virt_size; extern unsigned long __kernel_io_start; extern unsigned long __kernel_io_end; #define KERN_VIRT_START __kernel_virt_start -#define KERN_VIRT_SIZE __kernel_virt_size #define KERN_IO_START __kernel_io_start #define KERN_IO_END __kernel_io_end diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 6d760a083d62..574eca33f893 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -72,19 +72,17 @@ * | | * | | * | | - * +------------------------------+ Kernel IO map end (0xc010000000000000) + * +------------------------------+ Kernel vmemmap end (0xc010000000000000) * | | + * | 512TB | * | | - * | 1/2 of virtual map | + * +------------------------------+ Kernel IO map end/vmemap start * | | + * | 512TB | * | | - * +------------------------------+ Kernel IO map start + * +------------------------------+ Kernel vmap end/ IO map start * | | - * | 1/4 of virtual map | - * | | - * +------------------------------+ Kernel vmemap start - * | | - * | 1/4 of virtual map | + * | 512TB | * | | * +------------------------------+ Kernel virt start (0xc008000000000000) * | | @@ -93,25 +91,24 @@ * +------------------------------+ Kernel linear (0xc.....) */ -#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) -#define RADIX_KERN_VIRT_SIZE ASM_CONST(0x0008000000000000) - +#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) /* - * The vmalloc space starts at the beginning of that region, and - * occupies a quarter of it on radix config. - * (we keep a quarter for the virtual memmap) + * 49 = MAX_EA_BITS_PER_CONTEXT (hash specific). To make sure we pick + * the same value as hash. */ +#define RADIX_KERN_MAP_SIZE (1UL << 49) + #define RADIX_VMALLOC_START RADIX_KERN_VIRT_START -#define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2) +#define RADIX_VMALLOC_SIZE RADIX_KERN_MAP_SIZE #define RADIX_VMALLOC_END (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE) -/* - * Defines the address of the vmemap area, in its own region on - * hash table CPUs. 
- */ -#define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) -#define RADIX_KERN_IO_START (RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) -#define RADIX_KERN_IO_END (RADIX_KERN_VIRT_START + RADIX_KERN_VIRT_SIZE) +#define RADIX_KERN_IO_START RADIX_VMALLOC_END +#define RADIX_KERN_IO_SIZE RADIX_KERN_MAP_SIZE +#define RADIX_KERN_IO_END (RADIX_KERN_IO_START + RADIX_KERN_IO_SIZE) + +#define RADIX_VMEMMAP_START RADIX_KERN_IO_END +#define RADIX_VMEMMAP_SIZE RADIX_KERN_MAP_SIZE +#define RADIX_VMEMMAP_END (RADIX_VMEMMAP_START + RADIX_VMEMMAP_SIZE) #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index ed870468ef6f..918228f2205b 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -139,7 +139,8 @@ static inline bool pfn_valid(unsigned long pfn) * return true for some vmalloc addresses, which is incorrect. So explicitly * check that the address is in the kernel region. */ -#define virt_addr_valid(kaddr) (REGION_ID(kaddr) == KERNEL_REGION_ID && \ +/* may be can drop get_region_id */ +#define virt_addr_valid(kaddr) (get_region_id((unsigned long)kaddr) == KERNEL_REGION_ID && \ pfn_valid(virt_to_pfn(kaddr))) #else #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 3b9662a4207e..085509148d95 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -822,7 +822,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu *addr) raddr = per_cpu_ptr(addr, cpu); l = (unsigned long)raddr; - if (REGION_ID(l) == VMALLOC_REGION_ID) { + if (get_region_id(l) == VMALLOC_REGION_ID) { l = vmalloc_to_phys(raddr); raddr = (unsigned int *)l; } diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index c8da352e8686..9b0321061bc8 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -105,7 +105,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) u64 vsid, vsidkey; int psize, ssize; - switch (REGION_ID(ea)) { + switch (get_region_id(ea)) { case USER_REGION_ID: pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea); if (mm == NULL) @@ -117,10 +117,14 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) break; case VMALLOC_REGION_ID: pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; + break; + case IO_REGION_ID: + pr_devel("%s: 0x%llx -- IO_REGION_ID\n", __func__, ea); + psize = mmu_io_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); vsidkey = SLB_VSID_KERNEL; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f6ffe8545717..9c4ae4aa133e 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1009,12 +1009,11 @@ void __init hash__early_init_mmu(void) __pgd_val_bits = HASH_PGD_VAL_BITS; __kernel_virt_start = H_KERN_VIRT_START; - __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; __kernel_io_start = H_KERN_IO_START; __kernel_io_end = H_KERN_IO_END; - vmemmap = (struct page *)H_VMEMMAP_BASE; + vmemmap = (struct page *)H_VMEMMAP_START; ioremap_bot = IOREMAP_BASE; 
#ifdef CONFIG_PCI @@ -1241,7 +1240,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, trace_hash_fault(ea, access, trap); /* Get region & vsid */ - switch (REGION_ID(ea)) { + switch (get_region_id(ea)) { case USER_REGION_ID: user_region = 1; if (! mm) { @@ -1255,10 +1254,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, break; case VMALLOC_REGION_ID: vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + break; + + case IO_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_io_psize; ssize = mmu_kernel_ssize; break; default: @@ -1424,7 +1426,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, unsigned long flags = 0; struct mm_struct *mm = current->mm; - if (REGION_ID(ea) == VMALLOC_REGION_ID) + if ((get_region_id(ea) == VMALLOC_REGION_ID) || + (get_region_id(ea) == IO_REGION_ID)) mm = &init_mm; if (dsisr & DSISR_NOHPTE) @@ -1440,8 +1443,9 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; struct mm_struct *mm = current->mm; + unsigned int region_id = get_region_id(ea); - if (REGION_ID(ea) == VMALLOC_REGION_ID) + if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) mm = &init_mm; if (dsisr & DSISR_NOHPTE) @@ -1458,7 +1462,7 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, * 2) user space access kernel space. */ access |= _PAGE_PRIVILEGED; - if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID)) + if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) access &= ~_PAGE_PRIVILEGED; if (trap == 0x400) @@ -1502,7 +1506,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, int rc, ssize, update_flags = 0; unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? 
_PAGE_EXEC : 0); - BUG_ON(REGION_ID(ea) != USER_REGION_ID); + BUG_ON(get_region_id(ea) != USER_REGION_ID); if (!should_hash_preload(mm, ea)) return; diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 4c1a9843d0f2..4d9fa9e900d5 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -136,6 +136,10 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, */ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); +#ifdef CONFIG_PPC_64K_PAGES + BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); +#endif + if (unlikely(!slab_is_available())) return early_map_kernel_page(ea, pa, flags, map_page_size, nid, region_start, region_end); @@ -601,12 +605,11 @@ void __init radix__early_init_mmu(void) __pgd_val_bits = RADIX_PGD_VAL_BITS; __kernel_virt_start = RADIX_KERN_VIRT_START; - __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; __kernel_io_start = RADIX_KERN_IO_START; __kernel_io_end = RADIX_KERN_IO_END; - vmemmap = (struct page *)RADIX_VMEMMAP_BASE; + vmemmap = (struct page *)RADIX_VMEMMAP_START; ioremap_bot = IOREMAP_BASE; #ifdef CONFIG_PCI diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 7cea39bdf05f..56068cac2a3c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -90,8 +90,6 @@ unsigned long __pgd_val_bits; EXPORT_SYMBOL(__pgd_val_bits); unsigned long __kernel_virt_start; EXPORT_SYMBOL(__kernel_virt_start); -unsigned long __kernel_virt_size; -EXPORT_SYMBOL(__kernel_virt_size); unsigned long __vmalloc_start; EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index b430e4e08af6..b9bda0105841 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -500,7 +500,7 @@ static void populate_markers(void) address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; #ifdef CONFIG_PPC_BOOK3S_64 - address_markers[9].start_address = H_VMEMMAP_BASE; + address_markers[9].start_address = H_VMEMMAP_START; #else address_markers[9].start_address = VMEMMAP_BASE; #endif diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 37138428ab55..63fc56feea15 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -303,8 +303,9 @@ static void populate_markers(void) address_markers[i++].start_address = PHB_IO_END; address_markers[i++].start_address = IOREMAP_BASE; address_markers[i++].start_address = IOREMAP_END; + /* What is the ifdef about? 
*/ #ifdef CONFIG_PPC_BOOK3S_64 - address_markers[i++].start_address = H_VMEMMAP_BASE; + address_markers[i++].start_address = H_VMEMMAP_START; #else address_markers[i++].start_address = VMEMMAP_BASE; #endif diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 78c0c0a0e355..721cb09c9044 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -694,7 +694,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) if (id == KERNEL_REGION_ID) { /* We only support upto MAX_PHYSMEM_BITS */ - if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS)) + if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; @@ -702,20 +702,25 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) #ifdef CONFIG_SPARSEMEM_VMEMMAP } else if (id == VMEMMAP_REGION_ID) { - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + if (ea >= H_VMEMMAP_END) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; #endif } else if (id == VMALLOC_REGION_ID) { - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + if (ea >= H_VMALLOC_END) return -EFAULT; - if (ea < H_VMALLOC_END) - flags = local_paca->vmalloc_sllp; - else - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + flags = local_paca->vmalloc_sllp; + + } else if (id == IO_REGION_ID) { + + if (ea >= H_KERN_IO_END) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + } else { return -EFAULT; } @@ -725,6 +730,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) ssize = MMU_SEGSIZE_256M; context = get_kernel_context(ea); + return slb_insert_entry(ea, context, flags, ssize, true); } @@ -761,7 +767,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) long do_slb_fault(struct pt_regs *regs, unsigned long ea) { - unsigned long id = REGION_ID(ea); + unsigned long id = get_region_id(ea); /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 7f12c7b78c0f..4770cce1bfe2 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -194,7 +194,7 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr) * faults need to be deferred to process context. 
*/ if ((dsisr & MFC_DSISR_PTE_NOT_FOUND) && - (REGION_ID(ea) != USER_REGION_ID)) { + (get_region_id(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); ret = hash_page(ea, @@ -224,7 +224,7 @@ static void __spu_kernel_slb(void *addr, struct copro_slb *slb) unsigned long ea = (unsigned long)addr; u64 llp; - if (REGION_ID(ea) == KERNEL_REGION_ID) + if (get_region_id(ea) == KERNEL_REGION_ID) llp = mmu_psize_defs[mmu_linear_psize].sllp; else llp = mmu_psize_defs[mmu_virtual_psize].sllp; diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index dc7b34174f85..a4d17a5a9763 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -168,7 +168,7 @@ int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar) if (dsisr & CXL_PSL_DSISR_An_S) access |= _PAGE_WRITE; - if (!mm && (REGION_ID(dar) != USER_REGION_ID)) + if (!mm && (get_region_id(dar) != USER_REGION_ID)) access |= _PAGE_PRIVILEGED; if (dsisr & DSISR_NOHPTE) diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index d50b861d7e57..04ec3d74f828 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -163,7 +163,7 @@ static void xsl_fault_handler_bh(struct work_struct *fault_work) if (fault->dsisr & SPA_XSL_S) access |= _PAGE_WRITE; - if (REGION_ID(fault->dar) != USER_REGION_ID) + if (get_region_id(fault->dar) != USER_REGION_ID) access |= _PAGE_PRIVILEGED; local_irq_save(flags); -- cgit v1.2.3-59-g8ed1b From e09093927e54323018dbb3bf6189c85a7f176bae Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:15 +0530 Subject: powerpc/mm: Validate address values against different region limits This adds an explicit check in various functions. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 18 +++++++++++++++--- arch/powerpc/mm/pgtable-hash64.c | 13 ++++++++++--- arch/powerpc/mm/pgtable-radix.c | 16 ++++++++++++++++ arch/powerpc/mm/pgtable_64.c | 5 +++++ 4 files changed, 46 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 9c4ae4aa133e..f727197de713 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -781,9 +781,16 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size) int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) { - int rc = htab_bolt_mapping(start, end, __pa(start), - pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize); + int rc; + + if (end >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize); if (rc < 0) { int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, @@ -924,6 +931,11 @@ static void __init htab_initialize(void) DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", base, size, prot); + if ((base + size) >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); } diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index c08d49046a96..d934de4e2b3a 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -112,9 +112,16 @@ int __meminit hash__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys) { - int rc = htab_bolt_mapping(start, start + page_size, phys, -
pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); + int rc; + + if ((start + page_size) >= H_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); if (rc < 0) { int rc2 = htab_remove_mapping(start, start + page_size, mmu_vmemmap_psize, diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 4d9fa9e900d5..e6d5065b0bc8 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -339,6 +339,12 @@ void __init radix_init_pgtable(void) * page tables will be allocated within the range. No * need or a node (which we don't have yet). */ + + if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + WARN_ON(create_physical_mapping(reg->base, reg->base + reg->size, -1)); @@ -895,6 +901,11 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { + if (end >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + return create_physical_mapping(start, end, nid); } @@ -922,6 +933,11 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); int ret; + if ((start + page_size) >= RADIX_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); BUG_ON(ret); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 56068cac2a3c..72f58c076e26 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -121,6 +121,11 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_ if (pgprot_val(prot) & H_PAGE_4K_PFN) return NULL; + if ((ea + size) >= (void *)IOREMAP_END) { + pr_warn("Outside the supported range\n"); + return NULL; + } + WARN_ON(pa & ~PAGE_MASK); WARN_ON(((unsigned long)ea) & ~PAGE_MASK); WARN_ON(size & ~PAGE_MASK); -- cgit v1.2.3-59-g8ed1b From 53ed7a5947de2e19c270a0bc0c29257c6d004b0f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:16 +0530 Subject: powerpc/mm: Drop the unnecessary region check All the regions are now mapped with top nibble 0xc. Hence the region id check is not needed for virt_addr_valid() Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/page.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 918228f2205b..748f5db2e2b7 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -132,19 +132,7 @@ static inline bool pfn_valid(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_PPC_BOOK3S_64 -/* - * On hash the vmalloc and other regions alias to the kernel region when passed - * through __pa(), which virt_to_pfn() uses. That means virt_addr_valid() can - * return true for some vmalloc addresses, which is incorrect. So explicitly - * check that the address is in the kernel region.
- */ -/* may be can drop get_region_id */ -#define virt_addr_valid(kaddr) (get_region_id((unsigned long)kaddr) == KERNEL_REGION_ID && \ - pfn_valid(virt_to_pfn(kaddr))) -#else #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) -#endif /* * On Book-E parts we need __va to parse the device tree and we can't -- cgit v1.2.3-59-g8ed1b From 1c946c1b7f2ba40bc9b521219ad34e5da3fc3088 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:17 +0530 Subject: powerpc/mm/hash: Simplify the region id calculation. This reduces multiple comparisons in get_region_id to a bit shift operation. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 4 +++- arch/powerpc/include/asm/book3s/64/hash-64k.h | 1 + arch/powerpc/include/asm/book3s/64/hash.h | 31 +++++++++++++-------------- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- 4 files changed, 20 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 4c9dfd625461..8fd8599c9395 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -13,12 +13,14 @@ */ #define MAX_EA_BITS_PER_CONTEXT 46 +#define REGION_SHIFT (MAX_EA_BITS_PER_CONTEXT - 2) + /* * Our page table limit us to 64TB. Hence for the kernel mapping, * each MAP area is limited to 16 TB. * The four map areas are: linear mapping, vmap, IO and vmemmap */ -#define H_KERN_MAP_SIZE (ASM_CONST(1) << (MAX_EA_BITS_PER_CONTEXT - 2)) +#define H_KERN_MAP_SIZE (ASM_CONST(1) << REGION_SHIFT) /* * Define the address range of the kernel non-linear virtual area diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 0d0191cda050..d1d9177d9ebd 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -13,6 +13,7 @@ * is handled in the hotpath. */ #define MAX_EA_BITS_PER_CONTEXT 49 +#define REGION_SHIFT MAX_EA_BITS_PER_CONTEXT /* * We use one context for each MAP area. diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 76741a221910..7faa3d7214c0 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -83,26 +83,26 @@ #define H_VMEMMAP_SIZE H_KERN_MAP_SIZE #define H_VMEMMAP_END (H_VMEMMAP_START + H_VMEMMAP_SIZE) +#define NON_LINEAR_REGION_ID(ea) ((((unsigned long)ea - H_KERN_VIRT_START) >> REGION_SHIFT) + 2) + /* * Region IDs */ -#define USER_REGION_ID 1 -#define KERNEL_REGION_ID 2 -#define VMALLOC_REGION_ID 3 -#define IO_REGION_ID 4 -#define VMEMMAP_REGION_ID 5 +#define USER_REGION_ID 0 +#define KERNEL_REGION_ID 1 +#define VMALLOC_REGION_ID NON_LINEAR_REGION_ID(H_VMALLOC_START) +#define IO_REGION_ID NON_LINEAR_REGION_ID(H_KERN_IO_START) +#define VMEMMAP_REGION_ID NON_LINEAR_REGION_ID(H_VMEMMAP_START) /* * Defines the address of the vmemap area, in its own region on * hash table CPUs. 
*/ - #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #endif /* CONFIG_PPC_MM_SLICES */ - /* PTEIDX nibble */ #define _PTEIDX_SECONDARY 0x8 #define _PTEIDX_GROUP_IX 0x7 @@ -113,22 +113,21 @@ #ifndef __ASSEMBLY__ static inline int get_region_id(unsigned long ea) { + int region_id; int id = (ea >> 60UL); if (id == 0) return USER_REGION_ID; - VM_BUG_ON(id != 0xc); - VM_BUG_ON(ea >= H_VMEMMAP_END); + if (ea < H_KERN_VIRT_START) + return KERNEL_REGION_ID; - if (ea >= H_VMEMMAP_START) - return VMEMMAP_REGION_ID; - else if (ea >= H_KERN_IO_START) - return IO_REGION_ID; - else if (ea >= H_VMALLOC_START) - return VMALLOC_REGION_ID; + VM_BUG_ON(id != 0xc); + BUILD_BUG_ON(NON_LINEAR_REGION_ID(H_VMALLOC_START) != 2); - return KERNEL_REGION_ID; + region_id = NON_LINEAR_REGION_ID(ea); + VM_BUG_ON(region_id > VMEMMAP_REGION_ID); + return region_id; } #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 8a30bf189f10..9a9adbeef070 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -823,7 +823,7 @@ static inline unsigned long get_kernel_context(unsigned long ea) */ ctx = 1 + ((ea & EA_MASK) >> MAX_EA_BITS_PER_CONTEXT); } else - ctx = region_id + MAX_KERNEL_CTX_CNT - 2; + ctx = region_id + MAX_KERNEL_CTX_CNT - 1; return ctx; } -- cgit v1.2.3-59-g8ed1b From a092a03fa942b14369b8edea7690cd96206403f9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:18 +0530 Subject: powerpc/mm: Print kernel map details to dmesg This helps in debugging. We can look at the dmesg to find out different kernel mapping details. On 4K config this shows kernel vmalloc start = 0xc000100000000000 kernel IO start = 0xc000200000000000 kernel vmemmap start = 0xc000300000000000 On 64K config: kernel vmalloc start = 0xc008000000000000 kernel IO start = 0xc00a000000000000 kernel vmemmap start = 0xc00c000000000000 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 21b1ce200b22..1729bf409562 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -831,6 +831,9 @@ static __init void print_system_info(void) pr_info("htab_address = 0x%p\n", htab_address); if (htab_hash_mask) pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); + pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START); + pr_info("kernel IO start = 0x%lx\n", KERN_IO_START); + pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap); #endif #ifdef CONFIG_PPC_BOOK3S_32 if (Hash) -- cgit v1.2.3-59-g8ed1b From 5f53d28608f600d9ee07378453bd2d49e132fff4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:19 +0530 Subject: powerpc/mm/hash: Rename KERNEL_REGION_ID to LINEAR_MAP_REGION_ID The region actually points to the linear map. Rename the #define to clarify that.
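For reference, here is a minimal, self-contained sketch (editorial, not part of the patch) of how the region ids fall out of the simplified calculation above. It hard-codes the 64K page-size values quoted in this series (REGION_SHIFT = 49, H_KERN_VIRT_START = 0xc008000000000000); the 4K config would use 44 and 0xc000100000000000 instead. It compiles with any host C compiler.

#include <stdio.h>

#define REGION_SHIFT		49
#define H_KERN_VIRT_START	0xc008000000000000UL

static int region_id(unsigned long ea)
{
	if ((ea >> 60) == 0)
		return 0;			/* USER_REGION_ID */
	if (ea < H_KERN_VIRT_START)
		return 1;			/* LINEAR_MAP_REGION_ID */
	/* NON_LINEAR_REGION_ID(): one id per 512TB slot past virt start */
	return (int)(((ea - H_KERN_VIRT_START) >> REGION_SHIFT) + 2);
}

int main(void)
{
	printf("%d\n", region_id(0xc000000000000000UL));	/* 1: linear map */
	printf("%d\n", region_id(0xc008000000000000UL));	/* 2: vmalloc */
	printf("%d\n", region_id(0xc00a000000000000UL));	/* 3: IO */
	printf("%d\n", region_id(0xc00c000000000000UL));	/* 4: vmemmap */
	return 0;
}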
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 4 ++-- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- arch/powerpc/mm/copro_fault.c | 4 ++-- arch/powerpc/mm/slb.c | 4 ++-- arch/powerpc/platforms/cell/spu_base.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 7faa3d7214c0..1d1183048cfd 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -89,7 +89,7 @@ * Region IDs */ #define USER_REGION_ID 0 -#define KERNEL_REGION_ID 1 +#define LINEAR_MAP_REGION_ID 1 #define VMALLOC_REGION_ID NON_LINEAR_REGION_ID(H_VMALLOC_START) #define IO_REGION_ID NON_LINEAR_REGION_ID(H_KERN_IO_START) #define VMEMMAP_REGION_ID NON_LINEAR_REGION_ID(H_VMEMMAP_START) @@ -120,7 +120,7 @@ static inline int get_region_id(unsigned long ea) return USER_REGION_ID; if (ea < H_KERN_VIRT_START) - return KERNEL_REGION_ID; + return LINEAR_MAP_REGION_ID; VM_BUG_ON(id != 0xc); BUILD_BUG_ON(NON_LINEAR_REGION_ID(H_VMALLOC_START) != 2); diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 9a9adbeef070..1e4705516a54 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -817,7 +817,7 @@ static inline unsigned long get_kernel_context(unsigned long ea) * Depending on Kernel config, kernel region can have one context * or more. */ - if (region_id == KERNEL_REGION_ID) { + if (region_id == LINEAR_MAP_REGION_ID) { /* * We already verified ea to be not beyond the addr limit. */ diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 9b0321061bc8..f137286740cb 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -129,8 +129,8 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) vsid = get_kernel_vsid(ea, mmu_kernel_ssize); vsidkey = SLB_VSID_KERNEL; break; - case KERNEL_REGION_ID: - pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea); + case LINEAR_MAP_REGION_ID: + pr_devel("%s: 0x%llx -- LINEAR_MAP_REGION_ID\n", __func__, ea); psize = mmu_linear_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 721cb09c9044..89e4531de64b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -691,7 +691,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) unsigned long flags; int ssize; - if (id == KERNEL_REGION_ID) { + if (id == LINEAR_MAP_REGION_ID) { /* We only support upto MAX_PHYSMEM_BITS */ if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) @@ -790,7 +790,7 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea) * first class kernel code. But for performance it's probably nicer * if they go via fast_exception_return too. */ - if (id >= KERNEL_REGION_ID) { + if (id >= LINEAR_MAP_REGION_ID) { long err; #ifdef CONFIG_DEBUG_VM /* Catch recursive kernel SLB faults. 
*/ diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 4770cce1bfe2..6646f152d57b 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -224,7 +224,7 @@ static void __spu_kernel_slb(void *addr, struct copro_slb *slb) unsigned long ea = (unsigned long)addr; u64 llp; - if (get_region_id(ea) == KERNEL_REGION_ID) + if (get_region_id(ea) == LINEAR_MAP_REGION_ID) llp = mmu_psize_defs[mmu_linear_psize].sllp; else llp = mmu_psize_defs[mmu_virtual_psize].sllp; -- cgit v1.2.3-59-g8ed1b From 26ad26718dfaa7cf49d106d212ebf2370076c253 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 30 Mar 2019 11:13:45 +0530 Subject: powerpc/mm: Fix section mismatch warning This patch fixes the below section mismatch warnings. WARNING: vmlinux.o(.text+0x2d1f44): Section mismatch in reference from the function devm_memremap_pages_release() to the function .meminit.text:arch_remove_memory() WARNING: vmlinux.o(.text+0x2d265c): Section mismatch in reference from the function devm_memremap_pages() to the function .meminit.text:arch_add_memory() Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3665602a9dfa..e12bec98366f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -131,8 +131,8 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap * } #ifdef CONFIG_MEMORY_HOTREMOVE -int __meminit arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +int __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; -- cgit v1.2.3-59-g8ed1b From 3dfc242f11d792535db774613c6fd1df565c2137 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 19 Feb 2019 12:32:41 +0800 Subject: csky: Fixup vdsp&fpu issues in kernel This fixup continues commit 35ff802af1c4 ("csky: fixup remove vdsp implement for kernel"), where I didn't finish the job. We must forbid gcc from generating any vdsp & fpu instructions and remove the vdsp asm in memmove.S.
e.g. for GCC it's -mcpu=ck860, and for AS it's -Wa,-mcpu=ck860fv Signed-off-by: Guo Ren --- arch/csky/Makefile | 2 +- arch/csky/abiv2/memmove.S | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/csky/Makefile b/arch/csky/Makefile index 3607a6e8f66c..6b87f6c22ad6 100644 --- a/arch/csky/Makefile +++ b/arch/csky/Makefile @@ -36,7 +36,7 @@ endif ifneq ($(CSKYABI),) MCPU_STR = $(CPUTYPE)$(FPUEXT)$(VDSPEXT)$(TEEEXT) -KBUILD_CFLAGS += -mcpu=$(MCPU_STR) +KBUILD_CFLAGS += -mcpu=$(CPUTYPE) -Wa,-mcpu=$(MCPU_STR) KBUILD_CFLAGS += -DCSKYCPU_DEF_NAME=\"$(MCPU_STR)\" KBUILD_CFLAGS += -msoft-float -mdiv KBUILD_CFLAGS += -fno-tree-vectorize diff --git a/arch/csky/abiv2/memmove.S b/arch/csky/abiv2/memmove.S index b0c42ecf1889..5721e73ad3d8 100644 --- a/arch/csky/abiv2/memmove.S +++ b/arch/csky/abiv2/memmove.S @@ -35,11 +35,7 @@ ENTRY(memmove) .L_len_larger_16bytes: subi r1, 16 subi r0, 16 -#if defined(__CSKY_VDSPV2__) - vldx.8 vr0, (r1), r19 - PRE_BNEZAD (r18) - vstx.8 vr0, (r0), r19 -#elif defined(__CK860__) +#if defined(__CK860__) ldw r3, (r1, 12) stw r3, (r0, 12) ldw r3, (r1, 8) -- cgit v1.2.3-59-g8ed1b From 28bb030f93334495ddc64ade0bff18721bf7023d Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 1 Mar 2019 08:50:36 +0800 Subject: csky/ftrace: Add dynamic function tracer (include graph tracer) Support dynamic ftrace, including the dynamic graph tracer. Gcc-csky with -pg will produce a call site in every function prologue, and we can use these call sites to hook the trace function. Original call site emitted by gcc with -pg: push lr jbsr _mcount nop32 nop32 If the (callee - caller) offset is within the range of the bsr instruction, we'll modify the code to: push lr bsr _mcount nop32 nop32 Else, if the (callee - caller) offset is out of the bsr instruction's range, we'll modify the code to: push lr movih r26, ... ori r26, ... jsr r26 (r26 is reserved for the jsr link reg in the csky abiv2 spec.)
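The choice between the two sequences comes down to whether the branch displacement fits in bsr's signed, roughly +/-64MB reach. A small host-side sketch of that decision (editorial; the +/-67108864 bounds match the make_jbsr() check added below, while the addresses are purely hypothetical):

#include <stdbool.h>
#include <stdio.h>

/* bsr on csky abiv2 can reach about +/-64MB (67108864 bytes) from the
 * call site; anything further needs the movih/ori/jsr sequence via r26. */
static bool bsr_reachable(unsigned long caller_pc, unsigned long callee)
{
	long offset = (long)callee - (long)caller_pc;

	return offset >= -67108864 && offset <= 67108864;
}

int main(void)
{
	/* Hypothetical addresses, chosen only to exercise both paths. */
	printf("near callee: %s\n",
	       bsr_reachable(0xc0001000, 0xc0100000) ? "bsr" : "movih/ori/jsr");
	printf("far callee:  %s\n",
	       bsr_reachable(0xc0001000, 0xd5000000) ? "bsr" : "movih/ori/jsr");
	return 0;
}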
Signed-off-by: Guo Ren --- arch/csky/Kconfig | 3 +- arch/csky/abiv2/mcount.S | 39 ++++++++++- arch/csky/include/asm/ftrace.h | 18 ++++- arch/csky/kernel/ftrace.c | 148 ++++++++++++++++++++++++++++++++++++++++- scripts/recordmcount.pl | 3 + 5 files changed, 205 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 725a115759c9..60ebaa325584 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -29,13 +29,14 @@ config CSKY select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_TRACEHOOK + select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FTRACE_MCOUNT_RECORD select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO select HAVE_KERNEL_LZMA select HAVE_PERF_EVENTS - select HAVE_C_RECORDMCOUNT select HAVE_DMA_API_DEBUG select HAVE_DMA_CONTIGUOUS select MAY_HAVE_SPARSE_IRQ diff --git a/arch/csky/abiv2/mcount.S b/arch/csky/abiv2/mcount.S index c633379956f5..326402e65f9e 100644 --- a/arch/csky/abiv2/mcount.S +++ b/arch/csky/abiv2/mcount.S @@ -61,10 +61,17 @@ addi sp, 16 .endm +.macro nop32_stub + nop32 + nop32 + nop32 +.endm + ENTRY(ftrace_stub) jmp lr END(ftrace_stub) +#ifndef CONFIG_DYNAMIC_FTRACE ENTRY(_mcount) mcount_enter @@ -76,7 +83,7 @@ ENTRY(_mcount) bf skip_ftrace mov a0, lr - subi a0, MCOUNT_INSN_SIZE + subi a0, 4 ldw a1, (sp, 24) jsr r26 @@ -101,13 +108,41 @@ skip_ftrace: mcount_exit #endif END(_mcount) +#else /* CONFIG_DYNAMIC_FTRACE */ +ENTRY(_mcount) + mov t1, lr + ldw lr, (sp, 0) + addi sp, 4 + jmp t1 +ENDPROC(_mcount) + +ENTRY(ftrace_caller) + mcount_enter + + ldw a0, (sp, 16) + subi a0, 4 + ldw a1, (sp, 24) + + nop +GLOBAL(ftrace_call) + nop32_stub + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + nop +GLOBAL(ftrace_graph_call) + nop32_stub +#endif + + mcount_exit +ENDPROC(ftrace_caller) +#endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) mov a0, sp addi a0, 24 ldw a1, (sp, 16) - subi a1, MCOUNT_INSN_SIZE + subi a1, 4 mov a2, r8 lrw r26, prepare_ftrace_return jsr r26 diff --git a/arch/csky/include/asm/ftrace.h b/arch/csky/include/asm/ftrace.h index 7547c45312a8..ba35d93ecda2 100644 --- a/arch/csky/include/asm/ftrace.h +++ b/arch/csky/include/asm/ftrace.h @@ -4,10 +4,26 @@ #ifndef __ASM_CSKY_FTRACE_H #define __ASM_CSKY_FTRACE_H -#define MCOUNT_INSN_SIZE 4 +#define MCOUNT_INSN_SIZE 14 #define HAVE_FUNCTION_GRAPH_FP_TEST #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +#define MCOUNT_ADDR ((unsigned long)_mcount) + +#ifndef __ASSEMBLY__ + +extern void _mcount(unsigned long); + +extern void ftrace_graph_call(void); + +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + return addr; +} + +struct dyn_arch_ftrace { +}; +#endif /* !__ASSEMBLY__ */ #endif /* __ASM_CSKY_FTRACE_H */ diff --git a/arch/csky/kernel/ftrace.c b/arch/csky/kernel/ftrace.c index 274c431f1810..44f4880179b7 100644 --- a/arch/csky/kernel/ftrace.c +++ b/arch/csky/kernel/ftrace.c @@ -3,6 +3,137 @@ #include #include +#include + +#ifdef CONFIG_DYNAMIC_FTRACE + +#define NOP 0x4000 +#define NOP32_HI 0xc400 +#define NOP32_LO 0x4820 +#define PUSH_LR 0x14d0 +#define MOVIH_LINK 0xea3a +#define ORI_LINK 0xef5a +#define JSR_LINK 0xe8fa +#define BSR_LINK 0xe000 + +/* + * Gcc-csky with -pg will insert stub in function prologue: + * push lr + * jbsr _mcount + * nop32 + * nop32 + * + * If the (callee - current_pc) is less then 64MB, we'll use bsr: + * push lr + * bsr _mcount + * nop32 + * nop32 + * else we'll use (movih + ori + jsr): + * push lr + * movih r26, ... 
+ * ori r26, ... + * jsr r26 + * + * (r26 is our reserved link-reg) + * + */ +static inline void make_jbsr(unsigned long callee, unsigned long pc, + uint16_t *call, bool nolr) +{ + long offset; + + call[0] = nolr ? NOP : PUSH_LR; + + offset = (long) callee - (long) pc; + + if (unlikely(offset < -67108864 || offset > 67108864)) { + call[1] = MOVIH_LINK; + call[2] = callee >> 16; + call[3] = ORI_LINK; + call[4] = callee & 0xffff; + call[5] = JSR_LINK; + call[6] = 0; + } else { + offset = offset >> 1; + + call[1] = BSR_LINK | + ((uint16_t)((unsigned long) offset >> 16) & 0x3ff); + call[2] = (uint16_t)((unsigned long) offset & 0xffff); + call[3] = call[5] = NOP32_HI; + call[4] = call[6] = NOP32_LO; + } +} + +static uint16_t nops[7] = {NOP, NOP32_HI, NOP32_LO, NOP32_HI, NOP32_LO, + NOP32_HI, NOP32_LO}; +static int ftrace_check_current_nop(unsigned long hook) +{ + uint16_t olds[7]; + unsigned long hook_pos = hook - 2; + + if (probe_kernel_read((void *)olds, (void *)hook_pos, sizeof(nops))) + return -EFAULT; + + if (memcmp((void *)nops, (void *)olds, sizeof(nops))) { + pr_err("%p: nop but get (%04x %04x %04x %04x %04x %04x %04x)\n", + (void *)hook_pos, + olds[0], olds[1], olds[2], olds[3], olds[4], olds[5], + olds[6]); + + return -EINVAL; + } + + return 0; +} + +static int ftrace_modify_code(unsigned long hook, unsigned long target, + bool enable, bool nolr) +{ + uint16_t call[7]; + + unsigned long hook_pos = hook - 2; + int ret = 0; + + make_jbsr(target, hook, call, nolr); + + ret = probe_kernel_write((void *)hook_pos, enable ? call : nops, + sizeof(nops)); + if (ret) + return -EPERM; + + flush_icache_range(hook_pos, hook_pos + MCOUNT_INSN_SIZE); + + return 0; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + int ret = ftrace_check_current_nop(rec->ip); + + if (ret) + return ret; + + return ftrace_modify_code(rec->ip, addr, true, false); +} + +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, + unsigned long addr) +{ + return ftrace_modify_code(rec->ip, addr, false, false); +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + int ret = ftrace_modify_code((unsigned long)&ftrace_call, + (unsigned long)func, true, true); + return ret; +} + +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, @@ -43,8 +174,21 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, *(unsigned long *)frame_pointer = return_hooker; } } -#endif + +#ifdef CONFIG_DYNAMIC_FTRACE +int ftrace_enable_ftrace_graph_caller(void) +{ + return ftrace_modify_code((unsigned long)&ftrace_graph_call, + (unsigned long)&ftrace_graph_caller, true, true); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + return ftrace_modify_code((unsigned long)&ftrace_graph_call, + (unsigned long)&ftrace_graph_caller, false, true); +} +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* _mcount is defined in abi's mcount.S */ -extern void _mcount(void); EXPORT_SYMBOL(_mcount); diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 68841d01162c..f71666899245 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -397,6 +397,9 @@ if ($arch eq "x86_64") { } elsif ($arch eq "nds32") { $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_NDS32_HI20_RELA\\s+_mcount\$"; $alignment = 2; +} elsif ($arch eq "csky") { + $mcount_regex = 
"^\\s*([0-9a-fA-F]+):\\s*R_CKCORE_PCREL_JSR_IMM26BY2\\s+_mcount\$"; + $alignment = 2; } else { die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; } -- cgit v1.2.3-59-g8ed1b From cfa4d93b977a1b1129e7207d11b5daecdf0c56c4 Mon Sep 17 00:00:00 2001 From: Mao Han Date: Thu, 21 Feb 2019 21:41:26 +0800 Subject: csky: Add perf callchain support This patch add support for perf callchain sampling on csky platform. As fp is used to unwind the stack, the program being sampled and the C library need to be compiled with -mbacktrace for user callchains, kernel callchains require CONFIG_STACKTRACE = y. Changelog: - Coding convention with Christoph's advice for riscv's. Signed-off-by: Mao Han Signed-off-by: Guo Ren Cc: Christoph Hellwig --- arch/csky/kernel/Makefile | 1 + arch/csky/kernel/perf_callchain.c | 119 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 arch/csky/kernel/perf_callchain.c (limited to 'arch') diff --git a/arch/csky/kernel/Makefile b/arch/csky/kernel/Makefile index 484e6d3a3647..4c462f584dd1 100644 --- a/arch/csky/kernel/Makefile +++ b/arch/csky/kernel/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_CSKY_PMU_V1) += perf_event.o +obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) diff --git a/arch/csky/kernel/perf_callchain.c b/arch/csky/kernel/perf_callchain.c new file mode 100644 index 000000000000..e68ff375c8f8 --- /dev/null +++ b/arch/csky/kernel/perf_callchain.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. + +#include +#include + +/* Kernel callchain */ +struct stackframe { + unsigned long fp; + unsigned long lr; +}; + +static int unwind_frame_kernel(struct stackframe *frame) +{ + if (kstack_end((void *)frame->fp)) + return -EPERM; + if (frame->fp & 0x3 || frame->fp < TASK_SIZE) + return -EPERM; + + *frame = *(struct stackframe *)frame->fp; + if (__kernel_text_address(frame->lr)) { + int graph = 0; + + frame->lr = ftrace_graph_ret_addr(NULL, &graph, frame->lr, + NULL); + } + return 0; +} + +static void notrace walk_stackframe(struct stackframe *fr, + struct perf_callchain_entry_ctx *entry) +{ + do { + perf_callchain_store(entry, fr->lr); + } while (unwind_frame_kernel(fr) >= 0); +} + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. 
+ */ +static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, + unsigned long fp, unsigned long reg_lr) +{ + struct stackframe buftail; + unsigned long lr = 0; + unsigned long *user_frame_tail = (unsigned long *)fp; + + /* Check accessibility of one struct frame_tail beyond */ + if (!access_ok(user_frame_tail, sizeof(buftail))) + return 0; + if (__copy_from_user_inatomic(&buftail, user_frame_tail, + sizeof(buftail))) + return 0; + + if (reg_lr != 0) + lr = reg_lr; + else + lr = buftail.lr; + + fp = buftail.fp; + perf_callchain_store(entry, lr); + + return fp; +} + +/* + * This will be called when the target is in user mode + * This function will only be called when we use + * "PERF_SAMPLE_CALLCHAIN" in + * kernel/events/core.c:perf_prepare_sample() + * + * How to trigger perf_callchain_[user/kernel] : + * $ perf record -e cpu-clock --call-graph fp ./program + * $ perf report --call-graph + * + * On C-SKY platform, the program being sampled and the C library + * need to be compiled with * -mbacktrace, otherwise the user + * stack will not contain function frame. + */ +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + unsigned long fp = 0; + + /* C-SKY does not support virtualization. */ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + return; + + fp = regs->regs[4]; + perf_callchain_store(entry, regs->pc); + + /* + * While backtrace from leaf function, lr is normally + * not saved inside frame on C-SKY, so get lr from pt_regs + * at the sample point. However, lr value can be incorrect if + * lr is used as temp register + */ + fp = user_backtrace(entry, fp, regs->lr); + + while (fp && !(fp & 0x3) && entry->nr < entry->max_stack) + fp = user_backtrace(entry, fp, 0); +} + +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + struct stackframe fr; + + /* C-SKY does not support virtualization. */ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + pr_warn("C-SKY does not support perf in guest mode!"); + return; + } + + fr.fp = regs->regs[4]; + fr.lr = regs->lr; + walk_stackframe(&fr, entry); +} -- cgit v1.2.3-59-g8ed1b From 2f7932b011e7fb9f98732f95a68f6017d4d8c542 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 20 Mar 2019 18:27:27 +0800 Subject: csky: Update syscall_trace_enter/exit implementation Previous syscall_trace implementation couldn't support AUDITSYSCALL and SYSCALL_TRACEPOINTS. Now we redesign it to support audit_syscall and syscall_tracepoints just like other archs'. Signed-off-by: Guo Ren Cc: Dmitry V. 
Levin Cc: Arnd Bergmann --- arch/csky/Kconfig | 2 ++ arch/csky/abiv1/inc/abi/entry.h | 4 +++ arch/csky/abiv2/inc/abi/entry.h | 5 ++++ arch/csky/include/asm/syscall.h | 2 ++ arch/csky/include/asm/thread_info.h | 27 +++++++++----------- arch/csky/include/asm/unistd.h | 2 ++ arch/csky/include/uapi/asm/ptrace.h | 5 ++++ arch/csky/kernel/entry.S | 25 +++++++++---------- arch/csky/kernel/ptrace.c | 50 +++++++++++++++++-------------------- 9 files changed, 67 insertions(+), 55 deletions(-) (limited to 'arch') diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 60ebaa325584..c4974cf6a222 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -29,6 +29,7 @@ config CSKY select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_AUDITSYSCALL select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER @@ -39,6 +40,7 @@ config CSKY select HAVE_PERF_EVENTS select HAVE_DMA_API_DEBUG select HAVE_DMA_CONTIGUOUS + select HAVE_SYSCALL_TRACEPOINTS select MAY_HAVE_SPARSE_IRQ select MODULES_USE_ELF_RELA if MODULES select OF diff --git a/arch/csky/abiv1/inc/abi/entry.h b/arch/csky/abiv1/inc/abi/entry.h index 3f3faab3d747..7dacce4c0f15 100644 --- a/arch/csky/abiv1/inc/abi/entry.h +++ b/arch/csky/abiv1/inc/abi/entry.h @@ -157,4 +157,8 @@ cpwcr \rx, cpcr31 .endm +.macro ANDI_R3 rx, imm + lsri \rx, 3 + andi \rx, (\imm >> 3) +.endm #endif /* __ASM_CSKY_ENTRY_H */ diff --git a/arch/csky/abiv2/inc/abi/entry.h b/arch/csky/abiv2/inc/abi/entry.h index edc5cc04c4de..ea376ed716c4 100644 --- a/arch/csky/abiv2/inc/abi/entry.h +++ b/arch/csky/abiv2/inc/abi/entry.h @@ -175,4 +175,9 @@ lrw \rx, (PHYS_OFFSET + 0x20000000) | 0xe mtcr \rx, cr<31, 15> .endm + +.macro ANDI_R3 rx, imm + lsri \rx, 3 + andi \rx, (\imm >> 3) +.endm #endif /* __ASM_CSKY_ENTRY_H */ diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h index bda0a446c63e..850b694a463e 100644 --- a/arch/csky/include/asm/syscall.h +++ b/arch/csky/include/asm/syscall.h @@ -8,6 +8,8 @@ #include #include +extern void *sys_call_table[]; + static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/csky/include/asm/thread_info.h b/arch/csky/include/asm/thread_info.h index 0e9d035d712b..0b546a55a8bf 100644 --- a/arch/csky/include/asm/thread_info.h +++ b/arch/csky/include/asm/thread_info.h @@ -51,29 +51,26 @@ static inline struct thread_info *current_thread_info(void) #endif /* !__ASSEMBLY__ */ -/* entry.S relies on these definitions! 
- * bits 0-5 are tested at every exception exit - */ #define TIF_SIGPENDING 0 /* signal pending */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_SYSCALL_TRACE 5 /* syscall trace active */ -#define TIF_DELAYED_TRACE 14 /* single step a syscall */ +#define TIF_SYSCALL_TRACE 3 /* syscall trace active */ +#define TIF_SYSCALL_TRACEPOINT 4 /* syscall tracepoint instrumentation */ +#define TIF_SYSCALL_AUDIT 5 /* syscall auditing */ #define TIF_POLLING_NRFLAG 16 /* poll_idle() is TIF_NEED_RESCHED */ #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ -#define TIF_FREEZE 19 /* thread is freezing for suspend */ #define TIF_RESTORE_SIGMASK 20 /* restore signal mask in do_signal() */ #define TIF_SECCOMP 21 /* secure computing */ -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) -#define _TIF_DELAYED_TRACE (1 << TIF_DELAYED_TRACE) -#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) +#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) +#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_MEMDIE (1 << TIF_MEMDIE) -#define _TIF_FREEZE (1 << TIF_FREEZE) -#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) #endif /* _ASM_CSKY_THREAD_INFO_H */ diff --git a/arch/csky/include/asm/unistd.h b/arch/csky/include/asm/unistd.h index 284487477a61..da7a18295615 100644 --- a/arch/csky/include/asm/unistd.h +++ b/arch/csky/include/asm/unistd.h @@ -2,3 +2,5 @@ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
#include + +#define NR_syscalls (__NR_syscalls) diff --git a/arch/csky/include/uapi/asm/ptrace.h b/arch/csky/include/uapi/asm/ptrace.h index a4eaa8ddf0b1..9bf5b1a415d0 100644 --- a/arch/csky/include/uapi/asm/ptrace.h +++ b/arch/csky/include/uapi/asm/ptrace.h @@ -62,6 +62,11 @@ struct user_fp { #define instruction_pointer(regs) ((regs)->pc) #define profile_pc(regs) instruction_pointer(regs) +static inline unsigned long regs_return_value(struct pt_regs *regs) +{ + return regs->a0; +} + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ #endif /* _CSKY_PTRACE_H */ diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index 5137ed9062bd..ecc6e7d2e95d 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -136,8 +136,9 @@ ENTRY(csky_systemcall) bmaski r10, THREAD_SHIFT andn r9, r10 ldw r8, (r9, TINFO_FLAGS) - btsti r8, TIF_SYSCALL_TRACE - bt 1f + ANDI_R3 r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) + cmpnei r8, 0 + bt csky_syscall_trace #if defined(__CSKYABIV2__) subi sp, 8 stw r5, (sp, 0x4) @@ -150,10 +151,9 @@ ENTRY(csky_systemcall) stw a0, (sp, LSAVE_A0) /* Save return value */ jmpi ret_from_exception -1: - movi a0, 0 /* enter system call */ - mov a1, sp /* sp = pt_regs pointer */ - jbsr syscall_trace +csky_syscall_trace: + mov a0, sp /* sp = pt_regs pointer */ + jbsr syscall_trace_enter /* Prepare args before do system call */ ldw a0, (sp, LSAVE_A0) ldw a1, (sp, LSAVE_A1) @@ -173,9 +173,8 @@ ENTRY(csky_systemcall) #endif stw a0, (sp, LSAVE_A0) /* Save return value */ - movi a0, 1 /* leave system call */ - mov a1, sp /* right now, sp --> pt_regs */ - jbsr syscall_trace + mov a0, sp /* right now, sp --> pt_regs */ + jbsr syscall_trace_exit br ret_from_exception ENTRY(ret_from_kernel_thread) @@ -191,11 +190,11 @@ ENTRY(ret_from_fork) andn r9, r10 ldw r8, (r9, TINFO_FLAGS) movi r11_sig, 1 - btsti r8, TIF_SYSCALL_TRACE + ANDI_R3 r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) + cmpnei r8, 0 bf 3f - movi a0, 1 - mov a1, sp /* sp = pt_regs pointer */ - jbsr syscall_trace + mov a0, sp /* sp = pt_regs pointer */ + jbsr syscall_trace_exit 3: jbsr ret_from_exception diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index f2f12fff36f7..91bc74bb569f 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include #include @@ -22,6 +24,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* sets the trace bits. */ #define TRACE_MODE_SI (1 << 14) #define TRACE_MODE_RUN 0 @@ -207,35 +212,26 @@ long arch_ptrace(struct task_struct *child, long request, return ret; } -/* - * If process's system calls is traces, do some corresponding handles in this - * function before entering system call function and after exiting system call - * function. - */ -asmlinkage void syscall_trace(int why, struct pt_regs *regs) +asmlinkage void syscall_trace_enter(struct pt_regs *regs) { - long saved_why; - /* - * Save saved_why, why is used to denote syscall entry/exit; - * why = 0:entry, why = 1: exit - */ - saved_why = regs->regs[SYSTRACE_SAVENUM]; - regs->regs[SYSTRACE_SAVENUM] = why; - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. 
strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_entry(regs); + + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, syscall_get_nr(current, regs)); + + audit_syscall_entry(regs_syscallid(regs), regs->a0, regs->a1, regs->a2, regs->a3); +} + +asmlinkage void syscall_trace_exit(struct pt_regs *regs) +{ + audit_syscall_exit(regs); + + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, 0); - regs->regs[SYSTRACE_SAVENUM] = saved_why; + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); } extern void show_stack(struct task_struct *task, unsigned long *stack); -- cgit v1.2.3-59-g8ed1b From 1b2707fb1189b890e538ed09ca3f3512173cb836 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Mar 2019 20:19:14 +0900 Subject: csky: remove redundant generic-y Since commit 7cbbbb8bc297 ("kbuild: warn redundant generic-y"), redundant generic-y is reported. I missed deleting this one. scripts/Makefile.asm-generic:25: redundant generic-y found in arch/csky/include/asm/Kbuild: ftrace.h In this case, a csky-specific implementation exists in arch/csky/include/asm/ftrace.h Signed-off-by: Masahiro Yamada Signed-off-by: Guo Ren --- arch/csky/include/asm/Kbuild | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 2a0abe8f2a35..e01a5768f89c 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -12,7 +12,6 @@ generic-y += dma-mapping.h generic-y += emergency-restart.h generic-y += exec.h generic-y += fb.h -generic-y += ftrace.h generic-y += futex.h generic-y += gpio.h generic-y += hardirq.h -- cgit v1.2.3-59-g8ed1b From ce63cd5bd4482ded3c5907a48928627de623b185 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Sat, 23 Mar 2019 17:25:17 +0530 Subject: csky: mm/fault.c: Remove duplicate header Remove a duplicate header which is included twice. Signed-off-by: Jagadeesh Pagadala Signed-off-by: Guo Ren --- arch/csky/mm/fault.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index d6f4b66b93e2..e1725f8a06f9 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3-59-g8ed1b From f335b10f3b6ca2d11adef95092fff65152c31b48 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 26 Mar 2019 15:56:33 +0800 Subject: csky: Add non-uapi asm/ptrace.h namespace Move the #ifdef __KERNEL__ code in the uapi namespace to the non-uapi include/asm/ptrace.h namespace and remove #ifdef __KERNEL__ in include/asm/ptrace.h. Separating ptrace.h into uapi and non-uapi parts is more common and clearer. Signed-off-by: Guo Ren Cc: Dmitry V. Levin --- arch/csky/include/asm/ptrace.h | 29 +++++++++++++++++++++++++++++ arch/csky/include/uapi/asm/ptrace.h | 20 -------------------- 2 files changed, 29 insertions(+), 20 deletions(-) create mode 100644 arch/csky/include/asm/ptrace.h (limited to 'arch') diff --git a/arch/csky/include/asm/ptrace.h b/arch/csky/include/asm/ptrace.h new file mode 100644 index 000000000000..1e00578166f0 --- /dev/null +++ b/arch/csky/include/asm/ptrace.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
+ +#ifndef __ASM_CSKY_PTRACE_H +#define __ASM_CSKY_PTRACE_H + +#include + +#ifndef __ASSEMBLY__ + +#define PS_S 0x80000000 /* Supervisor Mode */ + +#define arch_has_single_step() (1) +#define current_pt_regs() \ +({ (struct pt_regs *)((char *)current_thread_info() + THREAD_SIZE) - 1; }) + +#define user_stack_pointer(regs) ((regs)->usp) + +#define user_mode(regs) (!((regs)->sr & PS_S)) +#define instruction_pointer(regs) ((regs)->pc) +#define profile_pc(regs) instruction_pointer(regs) + +static inline unsigned long regs_return_value(struct pt_regs *regs) +{ + return regs->a0; +} + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_CSKY_PTRACE_H */ diff --git a/arch/csky/include/uapi/asm/ptrace.h b/arch/csky/include/uapi/asm/ptrace.h index 9bf5b1a415d0..4e248d5b86ef 100644 --- a/arch/csky/include/uapi/asm/ptrace.h +++ b/arch/csky/include/uapi/asm/ptrace.h @@ -48,25 +48,5 @@ struct user_fp { unsigned long reserved; }; -#ifdef __KERNEL__ - -#define PS_S 0x80000000 /* Supervisor Mode */ - -#define arch_has_single_step() (1) -#define current_pt_regs() \ -({ (struct pt_regs *)((char *)current_thread_info() + THREAD_SIZE) - 1; }) - -#define user_stack_pointer(regs) ((regs)->usp) - -#define user_mode(regs) (!((regs)->sr & PS_S)) -#define instruction_pointer(regs) ((regs)->pc) -#define profile_pc(regs) instruction_pointer(regs) - -static inline unsigned long regs_return_value(struct pt_regs *regs) -{ - return regs->a0; -} - -#endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ #endif /* _CSKY_PTRACE_H */ -- cgit v1.2.3-59-g8ed1b From f4625ee0e40a5c724bb3f3eb7fd89e491bfd7646 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sat, 30 Mar 2019 23:44:34 +0800 Subject: csky: Use in_syscall & forget_syscall instead of r11_sig We can use bits 16-23 of regs->sr (the exception vector field) to detect a syscall (VEC_TRAP0), so r11_sig is not necessary in the current implementation. In this patch, we implement in_syscall and forget_syscall, which are inspired by arm & nds32; but csky's pt_regs has no syscall_num element, so we just set the vector bits field of regs->sr to zero instead. For ret_from_fork, the current task was forked from a parent that was in syscall progress, and its regs->sr has already been set with VEC_TRAP0.
See: arch/csky/kernel/process.c: copy_thread() Signed-off-by: Guo Ren --- arch/csky/abiv1/inc/abi/regdef.h | 2 -- arch/csky/abiv2/inc/abi/regdef.h | 2 -- arch/csky/include/asm/ptrace.h | 12 ++++++++++++ arch/csky/kernel/entry.S | 11 +---------- arch/csky/kernel/signal.c | 15 +++++++++------ 5 files changed, 22 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/csky/abiv1/inc/abi/regdef.h b/arch/csky/abiv1/inc/abi/regdef.h index 876689291b71..9e7e692dd271 100644 --- a/arch/csky/abiv1/inc/abi/regdef.h +++ b/arch/csky/abiv1/inc/abi/regdef.h @@ -5,8 +5,6 @@ #define __ASM_CSKY_REGDEF_H #define syscallid r1 -#define r11_sig r11 - #define regs_syscallid(regs) regs->regs[9] /* diff --git a/arch/csky/abiv2/inc/abi/regdef.h b/arch/csky/abiv2/inc/abi/regdef.h index c72abb781bdc..652f5ce4c3dd 100644 --- a/arch/csky/abiv2/inc/abi/regdef.h +++ b/arch/csky/abiv2/inc/abi/regdef.h @@ -5,8 +5,6 @@ #define __ASM_CSKY_REGDEF_H #define syscallid r7 -#define r11_sig r11 - #define regs_syscallid(regs) regs->regs[3] /* diff --git a/arch/csky/include/asm/ptrace.h b/arch/csky/include/asm/ptrace.h index 1e00578166f0..d0aba7b32417 100644 --- a/arch/csky/include/asm/ptrace.h +++ b/arch/csky/include/asm/ptrace.h @@ -5,6 +5,8 @@ #define __ASM_CSKY_PTRACE_H #include +#include +#include #ifndef __ASSEMBLY__ @@ -20,6 +22,16 @@ #define instruction_pointer(regs) ((regs)->pc) #define profile_pc(regs) instruction_pointer(regs) +static inline bool in_syscall(struct pt_regs const *regs) +{ + return ((regs->sr >> 16) & 0xff) == VEC_TRAP0; +} + +static inline void forget_syscall(struct pt_regs *regs) +{ + regs->sr &= ~(0xff << 16); +} + static inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->a0; diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index ecc6e7d2e95d..b3b3b0bbfcc7 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -99,7 +99,6 @@ ENTRY(csky_\name) mov a0, sp movi a1, \is_write jbsr do_page_fault - movi r11_sig, 0 /* r11 = 0, Not a syscall. */ jmpi ret_from_exception .endm @@ -189,7 +188,6 @@ ENTRY(ret_from_fork) bmaski r10, THREAD_SHIFT andn r9, r10 ldw r8, (r9, TINFO_FLAGS) - movi r11_sig, 1 ANDI_R3 r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) cmpnei r8, 0 bf 3f @@ -224,12 +222,8 @@ exit_work: /* If thread_info->flag is empty, RESTORE_ALL */ cmpnei r8, 0 bf 1b - mov a1, sp mov a0, r8 - mov a2, r11_sig /* syscall? */ - btsti r8, TIF_SIGPENDING /* delivering a signal? */ - /* prevent further restarts(set r11 = 0) */ - clrt r11_sig + mov a1, sp jbsr do_notify_resume /* do signals */ br resume_userspace @@ -239,13 +233,11 @@ work_resched: jmpi schedule ENTRY(sys_rt_sigreturn) - movi r11_sig, 0 jmpi do_rt_sigreturn ENTRY(csky_trap) SAVE_ALL EPC_KEEP psrset ee - movi r11_sig, 0 /* r11 = 0, Not a syscall. */ mov a0, sp /* Push Stack pointer arg */ jbsr trap_c /* Call C-level trap handler */ jmpi ret_from_exception @@ -279,7 +271,6 @@ ENTRY(csky_get_tls) ENTRY(csky_irq) SAVE_ALL EPC_KEEP psrset ee - movi r11_sig, 0 /* r11 = 0, Not a syscall. */ #ifdef CONFIG_PREEMPT mov r9, sp /* Get current stack pointer */ diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index 207a891479d2..5a18940f0b09 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -224,7 +224,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * that the kernel can handle, and then we build all the user-level signal * handling stack-frames in one go after that. 
*/ -static void do_signal(struct pt_regs *regs, int syscall) +static void do_signal(struct pt_regs *regs) { unsigned int retval = 0, continue_addr = 0, restart_addr = 0; struct ksignal ksig; @@ -241,7 +241,9 @@ static void do_signal(struct pt_regs *regs, int syscall) /* * If we were from a system call, check for system call restarting... */ - if (syscall) { + if (in_syscall(regs)) { + forget_syscall(regs); + continue_addr = regs->pc; #if defined(__CSKYABIV2__) restart_addr = continue_addr - 4; @@ -249,7 +251,6 @@ static void do_signal(struct pt_regs *regs, int syscall) restart_addr = continue_addr - 2; #endif retval = regs->a0; - /* * Prepare for system call restart. We do this here so that a * debugger will see the already changed. @@ -304,7 +305,9 @@ static void do_signal(struct pt_regs *regs, int syscall) } no_signal: - if (syscall) { + if (in_syscall(regs)) { + forget_syscall(regs); + /* * Handle restarting a different system call. As above, * if a debugger has chosen to restart at a different PC, @@ -333,10 +336,10 @@ no_signal: } asmlinkage void -do_notify_resume(unsigned int thread_flags, struct pt_regs *regs, int syscall) +do_notify_resume(unsigned int thread_flags, struct pt_regs *regs) { if (thread_flags & _TIF_SIGPENDING) - do_signal(regs, syscall); + do_signal(regs); if (thread_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); -- cgit v1.2.3-59-g8ed1b From bf241682936293291dcf40fd93cdd0f5e6222902 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 1 Apr 2019 19:06:09 +0800 Subject: csky: Reconstruct signal processing Linux kernel has provided some apis for arch signal's implementation. For example: restore_saved_sigmask() set_current_blocked() restore_altstack() But in last version of csky signal.c didn't use them and some codes are confusing, so reconstruct signal.c with reference to riscv's code. 
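As a rough sketch, with those generic helpers an arch do_signal() can collapse to something like this (illustrative only, under the assumption of the usual get_signal()/handle_signal() split; the concrete csky version is in the diff below):

	static void do_signal(struct pt_regs *regs)
	{
		struct ksignal ksig;

		if (get_signal(&ksig)) {
			/* deliver: set up the user-space signal frame */
			handle_signal(&ksig, regs);
			return;
		}

		/* no handler to run: put the saved sigmask back */
		restore_saved_sigmask();
	}
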
Now csky signal.c implementation are very close to riscv and we can get the following benefits: - Clear code structure - The signal code of riscv and csky can be reviewed together - Promoting the unification of arch's signal implementation Also modified the related code in entry.S Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/inc/abi/entry.h | 7 - arch/csky/abiv1/inc/abi/regdef.h | 2 + arch/csky/abiv2/inc/abi/entry.h | 7 - arch/csky/abiv2/inc/abi/regdef.h | 2 + arch/csky/kernel/atomic.S | 26 +-- arch/csky/kernel/entry.S | 37 ++--- arch/csky/kernel/signal.c | 343 +++++++++++++++------------------------ 7 files changed, 150 insertions(+), 274 deletions(-) (limited to 'arch') diff --git a/arch/csky/abiv1/inc/abi/entry.h b/arch/csky/abiv1/inc/abi/entry.h index 7dacce4c0f15..4a485b142be1 100644 --- a/arch/csky/abiv1/inc/abi/entry.h +++ b/arch/csky/abiv1/inc/abi/entry.h @@ -16,9 +16,6 @@ #define LSAVE_A4 40 #define LSAVE_A5 44 -#define EPC_INCREASE 2 -#define EPC_KEEP 0 - .macro USPTOKSP mtcr sp, ss1 mfcr sp, ss0 @@ -29,10 +26,6 @@ mfcr sp, ss1 .endm -.macro INCTRAP rx - addi \rx, EPC_INCREASE -.endm - .macro SAVE_ALL epc_inc mtcr r13, ss2 mfcr r13, epsr diff --git a/arch/csky/abiv1/inc/abi/regdef.h b/arch/csky/abiv1/inc/abi/regdef.h index 9e7e692dd271..729b1c3edcfd 100644 --- a/arch/csky/abiv1/inc/abi/regdef.h +++ b/arch/csky/abiv1/inc/abi/regdef.h @@ -21,4 +21,6 @@ #define SYSTRACE_SAVENUM 2 +#define TRAP0_SIZE 2 + #endif /* __ASM_CSKY_REGDEF_H */ diff --git a/arch/csky/abiv2/inc/abi/entry.h b/arch/csky/abiv2/inc/abi/entry.h index ea376ed716c4..6a0df655182c 100644 --- a/arch/csky/abiv2/inc/abi/entry.h +++ b/arch/csky/abiv2/inc/abi/entry.h @@ -14,18 +14,11 @@ #define LSAVE_A2 32 #define LSAVE_A3 36 -#define EPC_INCREASE 4 -#define EPC_KEEP 0 - #define KSPTOUSP #define USPTOKSP #define usp cr<14, 1> -.macro INCTRAP rx - addi \rx, EPC_INCREASE -.endm - .macro SAVE_ALL epc_inc subi sp, 152 stw tls, (sp, 0) diff --git a/arch/csky/abiv2/inc/abi/regdef.h b/arch/csky/abiv2/inc/abi/regdef.h index 652f5ce4c3dd..77cb1788b04c 100644 --- a/arch/csky/abiv2/inc/abi/regdef.h +++ b/arch/csky/abiv2/inc/abi/regdef.h @@ -21,4 +21,6 @@ #define SYSTRACE_SAVENUM 5 +#define TRAP0_SIZE 4 + #endif /* __ASM_CSKY_REGDEF_H */ diff --git a/arch/csky/kernel/atomic.S b/arch/csky/kernel/atomic.S index d2357c8f85bd..5b84f11485ae 100644 --- a/arch/csky/kernel/atomic.S +++ b/arch/csky/kernel/atomic.S @@ -12,11 +12,10 @@ * If *ptr != oldval && return 1, * else *ptr = newval return 0. 
*/ -#ifdef CONFIG_CPU_HAS_LDSTEX ENTRY(csky_cmpxchg) USPTOKSP mfcr a3, epc - INCTRAP a3 + addi a3, TRAP0_SIZE subi sp, 8 stw a3, (sp, 0) @@ -24,6 +23,7 @@ ENTRY(csky_cmpxchg) stw a3, (sp, 4) psrset ee +#ifdef CONFIG_CPU_HAS_LDSTEX 1: ldex a3, (a2) cmpne a0, a3 @@ -33,27 +33,7 @@ ENTRY(csky_cmpxchg) bez a3, 1b 2: sync.is - mvc a0 - ldw a3, (sp, 0) - mtcr a3, epc - ldw a3, (sp, 4) - mtcr a3, epsr - addi sp, 8 - KSPTOUSP - rte -END(csky_cmpxchg) #else -ENTRY(csky_cmpxchg) - USPTOKSP - mfcr a3, epc - INCTRAP a3 - - subi sp, 8 - stw a3, (sp, 0) - mfcr a3, epsr - stw a3, (sp, 4) - - psrset ee 1: ldw a3, (a2) cmpne a0, a3 @@ -61,6 +41,7 @@ ENTRY(csky_cmpxchg) 2: stw a1, (a2) 3: +#endif mvc a0 ldw a3, (sp, 0) mtcr a3, epc @@ -71,6 +52,7 @@ ENTRY(csky_cmpxchg) rte END(csky_cmpxchg) +#ifndef CONFIG_CPU_HAS_LDSTEX /* * Called from tlbmodified exception */ diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index b3b3b0bbfcc7..c0f80736dac6 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -91,7 +91,7 @@ ENTRY(csky_\name) mfcr a3, ss2 mfcr r6, ss3 mfcr a2, ss4 - SAVE_ALL EPC_KEEP + SAVE_ALL 0 .endm .macro tlbop_end is_write RD_MEH a2 @@ -117,7 +117,7 @@ jbsr csky_cmpxchg_fixup tlbop_end 1 ENTRY(csky_systemcall) - SAVE_ALL EPC_INCREASE + SAVE_ALL TRAP0_SIZE psrset ee, ie @@ -190,11 +190,9 @@ ENTRY(ret_from_fork) ldw r8, (r9, TINFO_FLAGS) ANDI_R3 r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) cmpnei r8, 0 - bf 3f + bf ret_from_exception mov a0, sp /* sp = pt_regs pointer */ jbsr syscall_trace_exit -3: - jbsr ret_from_exception ret_from_exception: ld syscallid, (sp, LSAVE_PSR) @@ -209,34 +207,29 @@ ret_from_exception: bmaski r10, THREAD_SHIFT andn r9, r10 -resume_userspace: ldw r8, (r9, TINFO_FLAGS) andi r8, (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED) cmpnei r8, 0 bt exit_work -1: RESTORE_ALL +1: + RESTORE_ALL exit_work: + lrw syscallid, ret_from_exception + mov lr, syscallid + btsti r8, TIF_NEED_RESCHED bt work_resched - /* If thread_info->flag is empty, RESTORE_ALL */ - cmpnei r8, 0 - bf 1b - mov a0, r8 - mov a1, sp - jbsr do_notify_resume /* do signals */ - br resume_userspace + + mov a0, sp + mov a1, r8 + jmpi do_notify_resume work_resched: - lrw syscallid, ret_from_exception - mov r15, syscallid /* Return address in link */ jmpi schedule -ENTRY(sys_rt_sigreturn) - jmpi do_rt_sigreturn - ENTRY(csky_trap) - SAVE_ALL EPC_KEEP + SAVE_ALL 0 psrset ee mov a0, sp /* Push Stack pointer arg */ jbsr trap_c /* Call C-level trap handler */ @@ -252,7 +245,7 @@ ENTRY(csky_get_tls) /* increase epc for continue */ mfcr a0, epc - INCTRAP a0 + addi a0, TRAP0_SIZE mtcr a0, epc /* get current task thread_info with kernel 8K stack */ @@ -269,7 +262,7 @@ ENTRY(csky_get_tls) rte ENTRY(csky_irq) - SAVE_ALL EPC_KEEP + SAVE_ALL 0 psrset ee #ifdef CONFIG_PREEMPT diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index 5a18940f0b09..04a43cfd4e09 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -1,26 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
-#include -#include -#include #include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include #include #include #include @@ -29,110 +13,117 @@ #ifdef CONFIG_CPU_HAS_FPU #include - -static int restore_fpu_state(struct sigcontext *sc) +static int restore_fpu_state(struct sigcontext __user *sc) { int err = 0; struct user_fp user_fp; - err = copy_from_user(&user_fp, &sc->sc_user_fp, sizeof(user_fp)); + err = __copy_from_user(&user_fp, &sc->sc_user_fp, sizeof(user_fp)); restore_from_user_fp(&user_fp); return err; } -static int save_fpu_state(struct sigcontext *sc) +static int save_fpu_state(struct sigcontext __user *sc) { struct user_fp user_fp; save_to_user_fp(&user_fp); - return copy_to_user(&sc->sc_user_fp, &user_fp, sizeof(user_fp)); + return __copy_to_user(&sc->sc_user_fp, &user_fp, sizeof(user_fp)); } #else -static inline int restore_fpu_state(struct sigcontext *sc) { return 0; } -static inline int save_fpu_state(struct sigcontext *sc) { return 0; } +#define restore_fpu_state(sigcontext) (0) +#define save_fpu_state(sigcontext) (0) #endif struct rt_sigframe { - int sig; - struct siginfo *pinfo; - void *puc; struct siginfo info; struct ucontext uc; }; -static int -restore_sigframe(struct pt_regs *regs, - struct sigcontext *sc, int *pr2) +static long restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *sc) { int err = 0; - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->task->restart_block.fn = do_no_restart_syscall; - - err |= copy_from_user(regs, &sc->sc_pt_regs, sizeof(struct pt_regs)); + /* sc_pt_regs is structured the same as the start of pt_regs */ + err |= __copy_from_user(regs, &sc->sc_pt_regs, sizeof(struct pt_regs)); + /* Restore the floating-point state. 
*/ err |= restore_fpu_state(sc); - *pr2 = regs->a0; return err; } -asmlinkage int -do_rt_sigreturn(void) +SYSCALL_DEFINE0(rt_sigreturn) { - sigset_t set; - int a0; struct pt_regs *regs = current_pt_regs(); - struct rt_sigframe *frame = (struct rt_sigframe *)(regs->usp); + struct rt_sigframe __user *frame; + struct task_struct *task; + sigset_t set; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + frame = (struct rt_sigframe __user *)regs->usp; if (!access_ok(frame, sizeof(*frame))) goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, (sigmask(SIGKILL) | sigmask(SIGSTOP))); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + goto badframe; - if (restore_sigframe(regs, &frame->uc.uc_mcontext, &a0)) + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - return a0; + return regs->a0; badframe: - force_sig(SIGSEGV, current); + task = current; + force_sig(SIGSEGV, task); return 0; } -static int setup_sigframe(struct sigcontext *sc, struct pt_regs *regs) +static int setup_sigcontext(struct rt_sigframe __user *frame, + struct pt_regs *regs) { + struct sigcontext __user *sc = &frame->uc.uc_mcontext; int err = 0; - err |= copy_to_user(&sc->sc_pt_regs, regs, sizeof(struct pt_regs)); + err |= __copy_to_user(&sc->sc_pt_regs, regs, sizeof(struct pt_regs)); err |= save_fpu_state(sc); return err; } -static inline void * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) +static inline void __user *get_sigframe(struct ksignal *ksig, + struct pt_regs *regs, size_t framesize) { - unsigned long usp; + unsigned long sp; + /* Default to using normal stack */ + sp = regs->usp; - /* Default to using normal stack. */ - usp = regs->usp; + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - framesize))) + return (void __user __force *)(-1UL); - /* This is the X/Open sanctioned signal stack switching. */ - if ((ka->sa.sa_flags & SA_ONSTACK) && !sas_ss_flags(usp)) { - if (!on_sig_stack(usp)) - usp = current->sas_ss_sp + current->sas_ss_size; - } - return (void *)((usp - frame_size) & -8UL); + /* This is the X/Open sanctioned signal stack switching. */ + sp = sigsp(sp, ksig) - framesize; + + /* Align the stack frame. */ + sp &= -8UL; + + return (void __user *)sp; } static int @@ -140,208 +131,128 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe *frame; int err = 0; - struct csky_vdso *vdso = current->mm->context.vdso; - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame)); - if (!frame) - return 1; + frame = get_sigframe(ksig, regs, sizeof(*frame)); + if (!access_ok(frame, sizeof(*frame))) + return -EFAULT; - err |= __put_user(ksig->sig, &frame->sig); - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, &ksig->info); - /* Create the ucontext. */ + /* Create the ucontext. 
*/ err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user((void *)current->sas_ss_sp, - &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->usp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigframe(&frame->uc.uc_mcontext, regs); - err |= copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - + err |= __put_user(NULL, &frame->uc.uc_link); + err |= __save_altstack(&frame->uc.uc_stack, regs->usp); + err |= setup_sigcontext(frame, regs); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; - /* Set up registers for signal handler */ - regs->usp = (unsigned long)frame; - regs->pc = (unsigned long)ksig->ka.sa.sa_handler; - regs->lr = (unsigned long)vdso->rt_signal_retcode; + /* Set up to return from userspace. */ + regs->lr = (unsigned long)(vdso->rt_signal_retcode); -adjust_stack: - regs->a0 = ksig->sig; /* first arg is signo */ - regs->a1 = (unsigned long)(&(frame->info)); - regs->a2 = (unsigned long)(&(frame->uc)); - return err; + /* + * Set up registers for signal handler. + * Registers that we don't modify keep the value they had from + * user-space at the time we took the signal. + * We always pass siginfo and mcontext, regardless of SA_SIGINFO, + * since some things rely on this (e.g. glibc's debug/segfault.c). + */ + regs->pc = (unsigned long)ksig->ka.sa.sa_handler; + regs->usp = (unsigned long)frame; + regs->a0 = ksig->sig; /* a0: signal number */ + regs->a1 = (unsigned long)(&(frame->info)); /* a1: siginfo pointer */ + regs->a2 = (unsigned long)(&(frame->uc)); /* a2: ucontext pointer */ -give_sigsegv: - if (ksig->sig == SIGSEGV) - ksig->ka.sa.sa_handler = SIG_DFL; - force_sig(SIGSEGV, current); - goto adjust_stack; + return 0; } -/* - * OK, we're invoking a handler - */ -static int -handle_signal(struct ksignal *ksig, struct pt_regs *regs) +static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { - int ret; sigset_t *oldset = sigmask_to_save(); + int ret; - /* - * set up the stack frame, regardless of SA_SIGINFO, - * and pass info anyway. - */ - ret = setup_rt_frame(ksig, oldset, regs); + /* Are we from a system call? */ + if (in_syscall(regs)) { + /* Avoid additional syscall restarting via ret_from_exception */ + forget_syscall(regs); + + /* If so, check system call restarting.. */ + switch (regs->a0) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + regs->a0 = -EINTR; + break; - if (ret != 0) { - force_sigsegv(ksig->sig, current); - return ret; + case -ERESTARTSYS: + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { + regs->a0 = -EINTR; + break; + } + /* fallthrough */ + case -ERESTARTNOINTR: + regs->a0 = regs->orig_a0; + regs->pc -= TRAP0_SIZE; + break; + } } - /* Block the signal if we were successful. */ - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); - if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked, ksig->sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + /* Set up the stack frame */ + ret = setup_rt_frame(ksig, oldset, regs); - return 0; + signal_setup_done(ret, ksig, 0); } -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. 
- * - * Note that we go through the signals twice: once to check the signals - * that the kernel can handle, and then we build all the user-level signal - * handling stack-frames in one go after that. - */ static void do_signal(struct pt_regs *regs) { - unsigned int retval = 0, continue_addr = 0, restart_addr = 0; struct ksignal ksig; - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. - */ - if (!user_mode(regs)) + if (get_signal(&ksig)) { + /* Actually deliver the signal */ + handle_signal(&ksig, regs); return; + } - /* - * If we were from a system call, check for system call restarting... - */ + /* Did we come from a system call? */ if (in_syscall(regs)) { + /* Avoid additional syscall restarting via ret_from_exception */ forget_syscall(regs); - continue_addr = regs->pc; -#if defined(__CSKYABIV2__) - restart_addr = continue_addr - 4; -#else - restart_addr = continue_addr - 2; -#endif - retval = regs->a0; - /* - * Prepare for system call restart. We do this here so that a - * debugger will see the already changed. - */ - switch (retval) { + /* Restart the system call - no handlers present */ + switch (regs->a0) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->a0 = regs->orig_a0; - regs->pc = restart_addr; + regs->pc -= TRAP0_SIZE; break; case -ERESTART_RESTARTBLOCK: - regs->a0 = -EINTR; + regs->a0 = regs->orig_a0; + regs_syscallid(regs) = __NR_restart_syscall; + regs->pc -= TRAP0_SIZE; break; } } - if (try_to_freeze()) - goto no_signal; - /* - * Get the signal to deliver. When running under ptrace, at this - * point the debugger may change all our registers ... + * If there is no signal to deliver, we just put the saved + * sigmask back. */ - if (get_signal(&ksig)) { - /* - * Depending on the signal settings we may need to revert the - * decision to restart the system call. But skip this if a - * debugger has chosen to restart at a different PC. - */ - if (regs->pc == restart_addr) { - if (retval == -ERESTARTNOHAND || - (retval == -ERESTARTSYS && - !(ksig.ka.sa.sa_flags & SA_RESTART))) { - regs->a0 = -EINTR; - regs->pc = continue_addr; - } - } - - /* Whee! Actually deliver the signal. */ - if (handle_signal(&ksig, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } - return; - } - -no_signal: - if (in_syscall(regs)) { - forget_syscall(regs); - - /* - * Handle restarting a different system call. As above, - * if a debugger has chosen to restart at a different PC, - * ignore the restart. - */ - if (retval == -ERESTART_RESTARTBLOCK - && regs->pc == continue_addr) { -#if defined(__CSKYABIV2__) - regs->regs[3] = __NR_restart_syscall; - regs->pc -= 4; -#else - regs->regs[9] = __NR_restart_syscall; - regs->pc -= 2; -#endif - } - - /* - * If there's no signal to deliver, we just put the saved - * sigmask back. 
- */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); - } - } + restore_saved_sigmask(); } -asmlinkage void -do_notify_resume(unsigned int thread_flags, struct pt_regs *regs) +/* + * notification of userspace execution resumption + * - triggered by the _TIF_WORK_MASK flags + */ +asmlinkage void do_notify_resume(struct pt_regs *regs, + unsigned long thread_info_flags) { - if (thread_flags & _TIF_SIGPENDING) + /* Handle pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_flags & _TIF_NOTIFY_RESUME) { + if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } -- cgit v1.2.3-59-g8ed1b From f62e31623d718a7c20d9da98de48361624d7360a Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 8 Apr 2019 11:12:25 +0800 Subject: csky: Support dynamic start physical address Before this patch csky-linux need CONFIG_RAM_BASE to determine start physical address. Now we use phys_offset variable to replace the macro of PHYS_OFFSET and we setup phys_offset with real physical address which is determined during startup in head.S. With this patch we needn't re-compile kernel for different start physical address. ie: 0x0 / 0xc0000000 start physical address could use the same vmlinux, be care different start address must be 512MB aligned. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/inc/abi/ckmmu.h | 20 +++++++++++++++++++ arch/csky/abiv1/inc/abi/entry.h | 19 ++++++++++++++++-- arch/csky/abiv2/inc/abi/ckmmu.h | 20 +++++++++++++++++++ arch/csky/abiv2/inc/abi/entry.h | 24 +++++++++++++++++++++-- arch/csky/include/asm/mmu_context.h | 4 ++-- arch/csky/include/asm/page.h | 39 +++++++++++++++---------------------- arch/csky/kernel/entry.S | 6 ++++-- arch/csky/kernel/setup.c | 5 +++++ 8 files changed, 106 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/csky/abiv1/inc/abi/ckmmu.h b/arch/csky/abiv1/inc/abi/ckmmu.h index 3a002017bebe..85339bd224c4 100644 --- a/arch/csky/abiv1/inc/abi/ckmmu.h +++ b/arch/csky/abiv1/inc/abi/ckmmu.h @@ -40,6 +40,26 @@ static inline void write_mmu_entryhi(int value) cpwcr("cpcr4", value); } +static inline unsigned long read_mmu_msa0(void) +{ + return cprcr("cpcr30"); +} + +static inline void write_mmu_msa0(unsigned long value) +{ + cpwcr("cpcr30", value); +} + +static inline unsigned long read_mmu_msa1(void) +{ + return cprcr("cpcr31"); +} + +static inline void write_mmu_msa1(unsigned long value) +{ + cpwcr("cpcr31", value); +} + /* * TLB operations. 
*/ diff --git a/arch/csky/abiv1/inc/abi/entry.h b/arch/csky/abiv1/inc/abi/entry.h index 4a485b142be1..18e2023bf165 100644 --- a/arch/csky/abiv1/inc/abi/entry.h +++ b/arch/csky/abiv1/inc/abi/entry.h @@ -144,9 +144,24 @@ .endm .macro SETUP_MMU rx - lrw \rx, PHYS_OFFSET | 0xe + /* Select MMU as co-processor */ + cpseti cp15 + + /* + * cpcr30 format: + * 31 - 29 | 28 - 4 | 3 | 2 | 1 | 0 + * BA Reserved C D V + */ + cprcr \rx, cpcr30 + lsri \rx, 28 + lsli \rx, 28 + addi \rx, 0xe cpwcr \rx, cpcr30 - lrw \rx, (PHYS_OFFSET + 0x20000000) | 0xe + + lsri \rx, 28 + addi \rx, 2 + lsli \rx, 28 + addi \rx, 0xe cpwcr \rx, cpcr31 .endm diff --git a/arch/csky/abiv2/inc/abi/ckmmu.h b/arch/csky/abiv2/inc/abi/ckmmu.h index 97230ad9427c..31d75e1a724a 100644 --- a/arch/csky/abiv2/inc/abi/ckmmu.h +++ b/arch/csky/abiv2/inc/abi/ckmmu.h @@ -42,6 +42,26 @@ static inline void write_mmu_entryhi(int value) mtcr("cr<4, 15>", value); } +static inline unsigned long read_mmu_msa0(void) +{ + return mfcr("cr<30, 15>"); +} + +static inline void write_mmu_msa0(unsigned long value) +{ + mtcr("cr<30, 15>", value); +} + +static inline unsigned long read_mmu_msa1(void) +{ + return mfcr("cr<31, 15>"); +} + +static inline void write_mmu_msa1(unsigned long value) +{ + mtcr("cr<31, 15>", value); +} + /* * TLB operations. */ diff --git a/arch/csky/abiv2/inc/abi/entry.h b/arch/csky/abiv2/inc/abi/entry.h index 6a0df655182c..c0a76c43cded 100644 --- a/arch/csky/abiv2/inc/abi/entry.h +++ b/arch/csky/abiv2/inc/abi/entry.h @@ -163,9 +163,29 @@ .endm .macro SETUP_MMU rx - lrw \rx, PHYS_OFFSET | 0xe + /* Check MMU on | off */ + mfcr \rx, cr18 + btsti \rx, 0 + bt 1f + grs \rx, 1f + br 2f +1: + /* + * cr<30, 15> format: + * 31 - 29 | 28 - 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 + * BA Reserved SH WA B SO SEC C D V + */ + mfcr \rx, cr<30, 15> +2: + lsri \rx, 28 + lsli \rx, 28 + addi \rx, 0x1ce mtcr \rx, cr<30, 15> - lrw \rx, (PHYS_OFFSET + 0x20000000) | 0xe + + lsri \rx, 28 + addi \rx, 2 + lsli \rx, 28 + addi \rx, 0x1ce mtcr \rx, cr<31, 15> .endm diff --git a/arch/csky/include/asm/mmu_context.h b/arch/csky/include/asm/mmu_context.h index b2905c0485a7..c41f86b22460 100644 --- a/arch/csky/include/asm/mmu_context.h +++ b/arch/csky/include/asm/mmu_context.h @@ -17,7 +17,7 @@ static inline void tlbmiss_handler_setup_pgd(unsigned long pgd, bool kernel) { pgd -= PAGE_OFFSET; - pgd += PHYS_OFFSET; + pgd += phys_offset; pgd |= 1; setup_pgd(pgd, kernel); } @@ -29,7 +29,7 @@ static inline void tlbmiss_handler_setup_pgd(unsigned long pgd, bool kernel) static inline unsigned long tlb_get_pgd(void) { - return ((get_pgd() - PHYS_OFFSET) & ~1) + PAGE_OFFSET; + return ((get_pgd() - phys_offset) & ~1) + PAGE_OFFSET; } #define cpu_context(cpu, mm) ((mm)->context.asid[cpu]) diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h index 73cf2bd66a13..4ce3d5c28ffc 100644 --- a/arch/csky/include/asm/page.h +++ b/arch/csky/include/asm/page.h @@ -8,7 +8,7 @@ #include /* - * PAGE_SHIFT determines the page size + * PAGE_SHIFT determines the page size: 4KB */ #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) @@ -17,12 +17,18 @@ #define THREAD_MASK (~(THREAD_SIZE - 1)) #define THREAD_SHIFT (PAGE_SHIFT + 1) + /* - * NOTE: virtual isn't really correct, actually it should be the offset into the - * memory node, but we have no highmem, so that works for now. - * TODO: implement (fast) pfn<->pgdat_idx conversion functions, this makes lots - * of the shifts unnecessary. 
+ * For C-SKY "User-space:Kernel-space" is "2GB:2GB" fixed by hardware and there + * are two segment registers (MSA0 + MSA1) to mapping 512MB + 512MB physical + * address region. We use them mapping kernel 1GB direct-map address area and + * for more than 1GB of memory we use highmem. + */ +#define PAGE_OFFSET 0x80000000 +#define SSEG_SIZE 0x20000000 +#define LOWMEM_LIMIT (SSEG_SIZE * 2) + +#define PHYS_OFFSET_OFFSET (CONFIG_RAM_BASE & (SSEG_SIZE - 1)) #ifndef __ASSEMBLY__ @@ -50,9 +56,6 @@ struct page; struct vm_area_struct; -/* - * These are used to make use of C type-checking.. - */ typedef struct { unsigned long pte_low; } pte_t; #define pte_val(x) ((x).pte_low) @@ -69,18 +72,13 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) }) #define __pgprot(x) ((pgprot_t) { (x) }) -#endif /* !__ASSEMBLY__ */ - -#define PHYS_OFFSET (CONFIG_RAM_BASE & ~(LOWMEM_LIMIT - 1)) -#define PHYS_OFFSET_OFFSET (CONFIG_RAM_BASE & (LOWMEM_LIMIT - 1)) -#define ARCH_PFN_OFFSET PFN_DOWN(CONFIG_RAM_BASE) +extern unsigned long phys_offset; -#define PAGE_OFFSET 0x80000000 -#define LOWMEM_LIMIT 0x40000000 +#define ARCH_PFN_OFFSET PFN_DOWN(phys_offset + PHYS_OFFSET_OFFSET) -#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET + PHYS_OFFSET) +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET + phys_offset) #define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET - \ - PHYS_OFFSET)) + phys_offset)) #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #define MAP_NR(x) PFN_DOWN((unsigned long)(x) - PAGE_OFFSET - \ @@ -90,15 +88,10 @@ typedef struct page *pgtable_t; #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -/* - * main RAM and kernel working space are coincident at 0x80000000, but to make - * life more interesting, there's also an uncached virtual shadow at 0xb0000000 - * - these mappings are fixed in the MMU - */ - #define pfn_to_kaddr(x) __va(PFN_PHYS(x)) #include #include +#endif /* !__ASSEMBLY__ */ #endif /* __ASM_CSKY_PAGE_H */ diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index c0f80736dac6..e5bbd8c184f3 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -40,7 +40,8 @@ ENTRY(csky_\name) WR_MCIR a2 #endif bclri r6, 0 - lrw a2, PHYS_OFFSET + lrw a2, phys_offset + ld.w a2, (a2, 0) subu r6, a2 bseti r6, 31 @@ -50,7 +51,8 @@ ENTRY(csky_\name) addu r6, a2 ldw r6, (r6) - lrw a2, PHYS_OFFSET + lrw a2, phys_offset + ld.w a2, (a2, 0) subu r6, a2 bseti r6, 31 diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index dff8b89444ec..c377194e4b8f 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -142,11 +142,16 @@ void __init setup_arch(char **cmdline_p) #endif } +unsigned long phys_offset; +EXPORT_SYMBOL(phys_offset); + asmlinkage __visible void __init csky_start(unsigned int unused, void *param) { /* Clean up bss section */ memset(__bss_start, 0, __bss_stop - __bss_start); + phys_offset = read_mmu_msa0() & ~(SSEG_SIZE - 1); + pre_trap_init(); pre_mmu_init(); -- cgit v1.2.3-59-g8ed1b From 981bbf274b64496fd4376b44120d47dae4ca94e8 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 10 Apr 2019 10:55:07 +0800 Subject: csky: Fixup wrong update_mmu_cache implementation In our stress tests, we found crashes caused by: if (!(vma->vm_flags & VM_EXEC)) return; in update_mmu_cache(). The current update_mmu_cache() implementation seems wrong, so we retreat to the conservative implementation.
Also, the old usage of kmap_atomic() in update_mmu_cache() was risky: the page's virtual mapping may change if the task is scheduled out, so we must use preempt_disable & pagefault_disable, which kmap_atomic() already calls. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv2/cacheflush.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c index d22c95ffc74d..5bb887b275e1 100644 --- a/arch/csky/abiv2/cacheflush.c +++ b/arch/csky/abiv2/cacheflush.c @@ -34,10 +34,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, { unsigned long addr, pfn; struct page *page; - void *va; - - if (!(vma->vm_flags & VM_EXEC)) - return; pfn = pte_pfn(*pte); if (unlikely(!pfn_valid(pfn))) @@ -47,14 +43,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, if (page == ZERO_PAGE(0)) return; - va = page_address(page); - addr = (unsigned long) va; - - if (va == NULL && PageHighMem(page)) - addr = (unsigned long) kmap_atomic(page); + addr = (unsigned long) kmap_atomic(page); cache_wbinv_range(addr, addr + PAGE_SIZE); - if (va == NULL && PageHighMem(page)) - kunmap_atomic((void *) addr); + kunmap_atomic((void *) addr); } -- cgit v1.2.3-59-g8ed1b From b4bf274198bd415e66af0b54a2e181c59fd43ba4 Mon Sep 17 00:00:00 2001 From: Mao Han Date: Wed, 10 Apr 2019 10:27:10 +0800 Subject: csky: Add perf_arch_fetch_caller_regs support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In trace-event (tracepoint) context the registers cannot be retrieved with task_pt_regs(). Without arch caller-regs support the pt_regs context will be all zero; perf cannot parse the callchain or resolve the symbols correctly, and will sometimes even deadlock while handling a page fault, e.g.: perf kmem --page record ls Changelog - Add the test case cmd in a comment - Use regs_fp(regs), which is defined in abi/regdef.h Signed-off-by: Mao Han Signed-off-by: Guo Ren --- arch/csky/abiv1/inc/abi/regdef.h | 1 + arch/csky/abiv2/inc/abi/regdef.h | 1 + arch/csky/include/asm/perf_event.h | 8 ++++++++ 3 files changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/csky/abiv1/inc/abi/regdef.h b/arch/csky/abiv1/inc/abi/regdef.h index 729b1c3edcfd..104707fbdcc1 100644 --- a/arch/csky/abiv1/inc/abi/regdef.h +++ b/arch/csky/abiv1/inc/abi/regdef.h @@ -6,6 +6,7 @@ #define syscallid r1 #define regs_syscallid(regs) regs->regs[9] +#define regs_fp(regs) regs->regs[2] /* * PSR format: diff --git a/arch/csky/abiv2/inc/abi/regdef.h b/arch/csky/abiv2/inc/abi/regdef.h index 77cb1788b04c..d7328bbc1ce7 100644 --- a/arch/csky/abiv2/inc/abi/regdef.h +++ b/arch/csky/abiv2/inc/abi/regdef.h @@ -6,6 +6,7 @@ #define syscallid r7 #define regs_syscallid(regs) regs->regs[3] +#define regs_fp(regs) regs->regs[4] /* * PSR format: diff --git a/arch/csky/include/asm/perf_event.h b/arch/csky/include/asm/perf_event.h index ea8193122294..572093e11001 100644 --- a/arch/csky/include/asm/perf_event.h +++ b/arch/csky/include/asm/perf_event.h @@ -4,4 +4,12 @@ #ifndef __ASM_CSKY_PERF_EVENT_H #define __ASM_CSKY_PERF_EVENT_H +#include + +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->pc = (__ip); \ + regs_fp(regs) = (unsigned long) __builtin_frame_address(0); \ + asm volatile("mov %0, sp\n":"=r"((regs)->usp)); \ +} + #endif /* __ASM_PERF_EVENT_ELF_H */ -- cgit v1.2.3-59-g8ed1b From 205353fa06cc5dbfe1949de013ba905bb151c702 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 12 Apr 2019 19:08:34 +0800 Subject: csky:
Support vmlinux bootup with MMU off Modify the SETUP_MMU macro to fit both MMU-on and MMU-off environments, so that vmlinux can boot from an MMU-off environment in some cases. Unify the style of _start and _start_smp_secondary in head.S to make head.S more concise and easier to understand. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/inc/abi/entry.h | 29 ++++++++------- arch/csky/abiv2/inc/abi/entry.h | 79 ++++++++++++++++++++++++++++++++--------- arch/csky/kernel/head.S | 60 ++++--------------------------- arch/csky/kernel/setup.c | 7 ++-- 4 files changed, 90 insertions(+), 85 deletions(-) (limited to 'arch') diff --git a/arch/csky/abiv1/inc/abi/entry.h b/arch/csky/abiv1/inc/abi/entry.h index 18e2023bf165..7ab78bd0f3b1 100644 --- a/arch/csky/abiv1/inc/abi/entry.h +++ b/arch/csky/abiv1/inc/abi/entry.h @@ -143,7 +143,12 @@ cpwcr \rx, cpcr8 .endm -.macro SETUP_MMU rx +.macro SETUP_MMU + /* Init psr and enable ee */ + lrw r6, DEFAULT_PSR_VALUE + mtcr r6, psr + psrset ee + /* Select MMU as co-processor */ cpseti cp15 @@ -152,17 +157,17 @@ * 31 - 29 | 28 - 4 | 3 | 2 | 1 | 0 * BA Reserved C D V */ - cprcr \rx, cpcr30 - lsri \rx, 28 - lsli \rx, 28 - addi \rx, 0xe - cpwcr \rx, cpcr30 - - lsri \rx, 28 - addi \rx, 2 - lsli \rx, 28 - addi \rx, 0xe - cpwcr \rx, cpcr31 + cprcr r6, cpcr30 + lsri r6, 28 + lsli r6, 28 + addi r6, 0xe + cpwcr r6, cpcr30 + + lsri r6, 28 + addi r6, 2 + lsli r6, 28 + addi r6, 0xe + cpwcr r6, cpcr31 .endm .macro ANDI_R3 rx, imm diff --git a/arch/csky/abiv2/inc/abi/entry.h b/arch/csky/abiv2/inc/abi/entry.h index c0a76c43cded..9897a16b45e5 100644 --- a/arch/csky/abiv2/inc/abi/entry.h +++ b/arch/csky/abiv2/inc/abi/entry.h @@ -162,31 +162,76 @@ mtcr \rx, cr<8, 15> .endm -.macro SETUP_MMU rx - /* Check MMU on | off */ - mfcr \rx, cr18 - btsti \rx, 0 +.macro SETUP_MMU + /* Init psr and enable ee */ + lrw r6, DEFAULT_PSR_VALUE + mtcr r6, psr + psrset ee + + /* Invalid I/Dcache BTB BHT */ + movi r6, 7 + lsli r6, 16 + addi r6, (1<<4) | 3 + mtcr r6, cr17 + + /* Invalid all TLB */ + bgeni r6, 26 + mtcr r6, cr<8, 15> /* Set MCIR */ + + /* Check MMU on/off */ + mfcr r6, cr18 + btsti r6, 0 bt 1f - grs \rx, 1f + + /* MMU off: setup mapping tlb entry */ + movi r6, 0 + mtcr r6, cr<6, 15> /* Set MPR with 4K page size */ + + grs r6, 1f /* Get current pa by PC */ + bmaski r7, (PAGE_SHIFT + 1) /* r7 = 0x1fff */ + andn r6, r7 + mtcr r6, cr<4, 15> /* Set MEH */ + + mov r8, r6 + movi r7, 0x00000006 + or r8, r7 + mtcr r8, cr<2, 15> /* Set MEL0 */ + movi r7, 0x00001006 + or r8, r7 + mtcr r8, cr<3, 15> /* Set MEL1 */ + + bgeni r8, 28 + mtcr r8, cr<8, 15> /* Set MCIR to write TLB */ + br 2f 1: /* - * cr<30, 15> format: + * MMU on: use origin MSA value from bootloader + * + * cr<30/31, 15> MSA register format: * 31 - 29 | 28 - 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 * BA Reserved SH WA B SO SEC C D V */ - mfcr \rx, cr<30, 15> + mfcr r6, cr<30, 15> /* Get MSA0 */ 2: - lsri \rx, 28 - lsli \rx, 28 - addi \rx, 0x1ce - mtcr \rx, cr<30, 15> - - lsri \rx, 28 - addi \rx, 2 - lsli \rx, 28 - addi \rx, 0x1ce - mtcr \rx, cr<31, 15> + lsri r6, 28 + lsli r6, 28 + addi r6, 0x1ce + mtcr r6, cr<30, 15> /* Set MSA0 */ + + lsri r6, 28 + addi r6, 2 + lsli r6, 28 + addi r6, 0x1ce + mtcr r6, cr<31, 15> /* Set MSA1 */ + + /* enable MMU */ + mfcr r6, cr18 + bseti r6, 0 + mtcr r6, cr18 + + jmpi 3f /* jump to va */ +3: .endm .macro ANDI_R3 rx, imm diff --git a/arch/csky/kernel/head.S b/arch/csky/kernel/head.S index 9c4ec473b76b..61989f9241c0 100644 --- a/arch/csky/kernel/head.S +++ b/arch/csky/kernel/head.S @@ -7,16
+7,11 @@ __HEAD ENTRY(_start) - /* set super user mode */ - lrw a3, DEFAULT_PSR_VALUE - mtcr a3, psr - psrset ee - - SETUP_MMU a3 + SETUP_MMU /* set stack point */ - lrw a3, init_thread_union + THREAD_SIZE - mov sp, a3 + lrw r6, init_thread_union + THREAD_SIZE + mov sp, r6 jmpi csky_start END(_start) @@ -24,53 +19,12 @@ END(_start) #ifdef CONFIG_SMP .align 10 ENTRY(_start_smp_secondary) - /* Invalid I/Dcache BTB BHT */ - movi a3, 7 - lsli a3, 16 - addi a3, (1<<4) | 3 - mtcr a3, cr17 - - tlbi.alls - - /* setup PAGEMASK */ - movi a3, 0 - mtcr a3, cr<6, 15> - - /* setup MEL0/MEL1 */ - grs a0, _start_smp_pc _start_smp_pc: - bmaski a1, 13 - andn a0, a1 - movi a1, 0x00000006 - movi a2, 0x00001006 - or a1, a0 - or a2, a0 - mtcr a1, cr<2, 15> - mtcr a2, cr<3, 15> - - /* setup MEH */ - mtcr a0, cr<4, 15> - - /* write TLB */ - bgeni a3, 28 - mtcr a3, cr<8, 15> - - SETUP_MMU a3 - - /* enable MMU */ - movi a3, 1 - mtcr a3, cr18 - - jmpi _goto_mmu_on _goto_mmu_on: - lrw a3, DEFAULT_PSR_VALUE - mtcr a3, psr - psrset ee + SETUP_MMU /* set stack point */ - lrw a3, secondary_stack - ld.w a3, (a3, 0) - mov sp, a3 + lrw r6, secondary_stack + ld.w r6, (r6, 0) + mov sp, r6 jmpi csky_start_secondary END(_start_smp_secondary) diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index c377194e4b8f..36fc04b9417d 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -145,7 +145,8 @@ void __init setup_arch(char **cmdline_p) unsigned long phys_offset; EXPORT_SYMBOL(phys_offset); -asmlinkage __visible void __init csky_start(unsigned int unused, void *param) +asmlinkage __visible void __init csky_start(unsigned int unused, + void *dtb_start) { /* Clean up bss section */ memset(__bss_start, 0, __bss_stop - __bss_start); @@ -155,10 +156,10 @@ asmlinkage __visible void __init csky_start(unsigned int unused, void *param) pre_trap_init(); pre_mmu_init(); - if (param == NULL) + if (dtb_start == NULL) early_init_dt_scan(__dtb_start); else - early_init_dt_scan(param); + early_init_dt_scan(dtb_start); start_kernel(); -- cgit v1.2.3-59-g8ed1b From 683fafebf93bcde9948246849348b888e185cb22 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 19 Apr 2019 17:10:52 +0800 Subject: csky: Use va_pa_offset instead of phys_offset The name phys_offset is too common for a global export and may conflict with local names. So change phys_offset to va_pa_offset, which is also the name riscv uses. Also use __pa() and __va() instead of using phys_offset directly.
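As a quick sanity check of the linear mapping (illustrative numbers only, assuming PAGE_OFFSET = 0x80000000 and a boot-time va_pa_offset of 0xc0000000; these values are not from this patch):

	__pa(0x80001000) == 0x80001000 - 0x80000000 + 0xc0000000 == 0xc0001000
	__va(0xc0001000) == (void *)0x80001000

so __pa() and __va() remain exact inverses of each other for any boot-time physical base.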
Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/inc/abi/ckmmu.h | 4 ++-- arch/csky/abiv2/inc/abi/ckmmu.h | 14 ++++++-------- arch/csky/include/asm/mmu_context.h | 17 ++--------------- arch/csky/include/asm/page.h | 10 +++++----- arch/csky/kernel/entry.S | 4 ++-- arch/csky/kernel/setup.c | 6 +++--- arch/csky/mm/fault.c | 2 +- 7 files changed, 21 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/csky/abiv1/inc/abi/ckmmu.h b/arch/csky/abiv1/inc/abi/ckmmu.h index 85339bd224c4..81f37715c0d2 100644 --- a/arch/csky/abiv1/inc/abi/ckmmu.h +++ b/arch/csky/abiv1/inc/abi/ckmmu.h @@ -85,11 +85,11 @@ static inline void tlb_invalid_indexed(void) static inline void setup_pgd(unsigned long pgd, bool kernel) { - cpwcr("cpcr29", pgd); + cpwcr("cpcr29", pgd | BIT(0)); } static inline unsigned long get_pgd(void) { - return cprcr("cpcr29"); + return cprcr("cpcr29") & ~BIT(0); } #endif /* __ASM_CSKY_CKMMUV1_H */ diff --git a/arch/csky/abiv2/inc/abi/ckmmu.h b/arch/csky/abiv2/inc/abi/ckmmu.h index 31d75e1a724a..e4480e6bc3b3 100644 --- a/arch/csky/abiv2/inc/abi/ckmmu.h +++ b/arch/csky/abiv2/inc/abi/ckmmu.h @@ -90,18 +90,16 @@ static inline void tlb_invalid_indexed(void) mtcr("cr<8, 15>", 0x02000000); } -/* setup hardrefil pgd */ -static inline unsigned long get_pgd(void) -{ - return mfcr("cr<29, 15>"); -} - static inline void setup_pgd(unsigned long pgd, bool kernel) { if (kernel) - mtcr("cr<28, 15>", pgd); + mtcr("cr<28, 15>", pgd | BIT(0)); else - mtcr("cr<29, 15>", pgd); + mtcr("cr<29, 15>", pgd | BIT(0)); } +static inline unsigned long get_pgd(void) +{ + return mfcr("cr<29, 15>") & ~BIT(0); +} #endif /* __ASM_CSKY_CKMMUV2_H */ diff --git a/arch/csky/include/asm/mmu_context.h b/arch/csky/include/asm/mmu_context.h index c41f86b22460..734db3a122e1 100644 --- a/arch/csky/include/asm/mmu_context.h +++ b/arch/csky/include/asm/mmu_context.h @@ -14,23 +14,10 @@ #include #include -static inline void tlbmiss_handler_setup_pgd(unsigned long pgd, bool kernel) -{ - pgd -= PAGE_OFFSET; - pgd += phys_offset; - pgd |= 1; - setup_pgd(pgd, kernel); -} - #define TLBMISS_HANDLER_SETUP_PGD(pgd) \ - tlbmiss_handler_setup_pgd((unsigned long)pgd, 0) + setup_pgd(__pa(pgd), false) #define TLBMISS_HANDLER_SETUP_PGD_KERNEL(pgd) \ - tlbmiss_handler_setup_pgd((unsigned long)pgd, 1) - -static inline unsigned long tlb_get_pgd(void) -{ - return ((get_pgd() - phys_offset) & ~1) + PAGE_OFFSET; -} + setup_pgd(__pa(pgd), true) #define cpu_context(cpu, mm) ((mm)->context.asid[cpu]) #define cpu_asid(cpu, mm) (cpu_context((cpu), (mm)) & ASID_MASK) diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h index 4ce3d5c28ffc..9738eacefdc7 100644 --- a/arch/csky/include/asm/page.h +++ b/arch/csky/include/asm/page.h @@ -72,13 +72,13 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) }) #define __pgprot(x) ((pgprot_t) { (x) }) -extern unsigned long phys_offset; +extern unsigned long va_pa_offset; -#define ARCH_PFN_OFFSET PFN_DOWN(phys_offset + PHYS_OFFSET_OFFSET) +#define ARCH_PFN_OFFSET PFN_DOWN(va_pa_offset + PHYS_OFFSET_OFFSET) + +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET + va_pa_offset) +#define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET - va_pa_offset)) -#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET + phys_offset) -#define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET - \ - phys_offset)) #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #define MAP_NR(x) PFN_DOWN((unsigned long)(x) - PAGE_OFFSET - \ diff --git a/arch/csky/kernel/entry.S 
b/arch/csky/kernel/entry.S index e5bbd8c184f3..a7e84ccccbd8 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -40,7 +40,7 @@ ENTRY(csky_\name) WR_MCIR a2 #endif bclri r6, 0 - lrw a2, phys_offset + lrw a2, va_pa_offset ld.w a2, (a2, 0) subu r6, a2 bseti r6, 31 @@ -51,7 +51,7 @@ ENTRY(csky_\name) addu r6, a2 ldw r6, (r6) - lrw a2, phys_offset + lrw a2, va_pa_offset ld.w a2, (a2, 0) subu r6, a2 bseti r6, 31 diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 36fc04b9417d..23ee604aafdb 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -142,8 +142,8 @@ void __init setup_arch(char **cmdline_p) #endif } -unsigned long phys_offset; -EXPORT_SYMBOL(phys_offset); +unsigned long va_pa_offset; +EXPORT_SYMBOL(va_pa_offset); asmlinkage __visible void __init csky_start(unsigned int unused, void *dtb_start) @@ -151,7 +151,7 @@ asmlinkage __visible void __init csky_start(unsigned int unused, /* Clean up bss section */ memset(__bss_start, 0, __bss_stop - __bss_start); - phys_offset = read_mmu_msa0() & ~(SSEG_SIZE - 1); + va_pa_offset = read_mmu_msa0() & ~(SSEG_SIZE - 1); pre_trap_init(); pre_mmu_init(); diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index e1725f8a06f9..5beb25ca1c79 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -81,7 +81,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, unsigned long pgd_base; - pgd_base = tlb_get_pgd(); + pgd_base = __va(get_pgd()); pgd = (pgd_t *)pgd_base + offset; pgd_k = init_mm.pgd + offset; -- cgit v1.2.3-59-g8ed1b From 0eaf50deec8d550164b3cf6a5d68ec1072916f0e Mon Sep 17 00:00:00 2001 From: Mao Han Date: Thu, 18 Apr 2019 14:20:40 +0800 Subject: csky: add page fault perf event support This patch adds support for the page fault, major fault and minor fault counts. Without this patch, page faults are not sampled by perf events: Performance counter stats for '/usr/lib/perf-test/callchain_test': 0 page-faults # 0.000 K/sec Signed-off-by: Mao Han Signed-off-by: Guo Ren --- arch/csky/mm/fault.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index 5beb25ca1c79..aeb9a5f11e00 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -106,6 +107,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, return; } #endif + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt or have no user * context, we must not take the fault.. @@ -153,10 +156,15 @@ good_area: goto bad_area; BUG(); } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, + address); + } else { tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, + address); + } up_read(&mm->mmap_sem); return; -- cgit v1.2.3-59-g8ed1b From daac95e70f482e7add3305ee5e38f00dca505268 Mon Sep 17 00:00:00 2001 From: Mao Han Date: Mon, 15 Apr 2019 17:17:29 +0800 Subject: csky: Add support for perf registers sampling This patch implements the perf registers sampling and validation API for the csky arch. The valid registers and their register IDs are defined in perf_regs.h. With the register and user stack dump support, the perf tool can backtrace in userspace using an unwind library.
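A typical invocation to exercise this (assuming a perf tool built with libunwind or libdw support; the workload name is arbitrary):

	# perf record --call-graph dwarf ./test_program
	# perf report

should now resolve user-space call chains from the sampled registers and user stack dumps.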
Signed-off-by: Mao Han Signed-off-by: Guo Ren --- arch/csky/Kconfig | 2 ++ arch/csky/include/uapi/asm/perf_regs.h | 51 ++++++++++++++++++++++++++++++++++ arch/csky/kernel/Makefile | 1 + arch/csky/kernel/perf_regs.c | 40 ++++++++++++++++++++++++++ 4 files changed, 94 insertions(+) create mode 100644 arch/csky/include/uapi/asm/perf_regs.h create mode 100644 arch/csky/kernel/perf_regs.c (limited to 'arch') diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index c4974cf6a222..8e45c7ac8e24 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -38,6 +38,8 @@ config CSKY select HAVE_KERNEL_LZO select HAVE_KERNEL_LZMA select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_DMA_API_DEBUG select HAVE_DMA_CONTIGUOUS select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/csky/include/uapi/asm/perf_regs.h b/arch/csky/include/uapi/asm/perf_regs.h new file mode 100644 index 000000000000..ee323d818592 --- /dev/null +++ b/arch/csky/include/uapi/asm/perf_regs.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. + +#ifndef _ASM_CSKY_PERF_REGS_H +#define _ASM_CSKY_PERF_REGS_H + +/* Index of struct pt_regs */ +enum perf_event_csky_regs { + PERF_REG_CSKY_TLS, + PERF_REG_CSKY_LR, + PERF_REG_CSKY_PC, + PERF_REG_CSKY_SR, + PERF_REG_CSKY_SP, + PERF_REG_CSKY_ORIG_A0, + PERF_REG_CSKY_A0, + PERF_REG_CSKY_A1, + PERF_REG_CSKY_A2, + PERF_REG_CSKY_A3, + PERF_REG_CSKY_REGS0, + PERF_REG_CSKY_REGS1, + PERF_REG_CSKY_REGS2, + PERF_REG_CSKY_REGS3, + PERF_REG_CSKY_REGS4, + PERF_REG_CSKY_REGS5, + PERF_REG_CSKY_REGS6, + PERF_REG_CSKY_REGS7, + PERF_REG_CSKY_REGS8, + PERF_REG_CSKY_REGS9, +#if defined(__CSKYABIV2__) + PERF_REG_CSKY_EXREGS0, + PERF_REG_CSKY_EXREGS1, + PERF_REG_CSKY_EXREGS2, + PERF_REG_CSKY_EXREGS3, + PERF_REG_CSKY_EXREGS4, + PERF_REG_CSKY_EXREGS5, + PERF_REG_CSKY_EXREGS6, + PERF_REG_CSKY_EXREGS7, + PERF_REG_CSKY_EXREGS8, + PERF_REG_CSKY_EXREGS9, + PERF_REG_CSKY_EXREGS10, + PERF_REG_CSKY_EXREGS11, + PERF_REG_CSKY_EXREGS12, + PERF_REG_CSKY_EXREGS13, + PERF_REG_CSKY_EXREGS14, + PERF_REG_CSKY_HI, + PERF_REG_CSKY_LO, + PERF_REG_CSKY_DCSR, +#endif + PERF_REG_CSKY_MAX, +}; +#endif /* _ASM_CSKY_PERF_REGS_H */ diff --git a/arch/csky/kernel/Makefile b/arch/csky/kernel/Makefile index 4c462f584dd1..1624b04bffb5 100644 --- a/arch/csky/kernel/Makefile +++ b/arch/csky/kernel/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_CSKY_PMU_V1) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o +obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) diff --git a/arch/csky/kernel/perf_regs.c b/arch/csky/kernel/perf_regs.c new file mode 100644 index 000000000000..eb32838b8210 --- /dev/null +++ b/arch/csky/kernel/perf_regs.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. 
+ +#include +#include +#include +#include +#include +#include + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE((u32)idx >= PERF_REG_CSKY_MAX)) + return 0; + + return (u64)*((u32 *)regs + idx); +} + +#define REG_RESERVED (~((1ULL << PERF_REG_CSKY_MAX) - 1)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} -- cgit v1.2.3-59-g8ed1b From 1a23710c71bbfe2df10584afb9971b99c45e2576 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 22 Apr 2019 14:21:09 +0800 Subject: csky: Fixup compile warning The __va() function returns a "void *", but pgd_base is an unsigned long. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index aeb9a5f11e00..18041f46ded1 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -82,7 +82,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, unsigned long pgd_base; - pgd_base = __va(get_pgd()); + pgd_base = (unsigned long)__va(get_pgd()); pgd = (pgd_t *)pgd_base + offset; pgd_k = init_mm.pgd + offset; -- cgit v1.2.3-59-g8ed1b From a691f3334d58b833e41d56de1b9820e687edcd78 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 22 Apr 2019 14:46:44 +0800 Subject: csky/syscall_trace: Fixup return processing flow The return value of tracehook_report_syscall_entry() carries the __must_check attribute. We should add return value handling in ptrace.c and set the syscall number to -1 when the call fails, just like riscv does. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/include/asm/syscall.h | 7 +++++++ arch/csky/kernel/ptrace.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h index 850b694a463e..8278658e74f9 100644 --- a/arch/csky/include/asm/syscall.h +++ b/arch/csky/include/asm/syscall.h @@ -16,6 +16,13 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs_syscallid(regs); } +static inline void +syscall_set_nr(struct task_struct *task, struct pt_regs *regs, + int sysno) +{ + regs_syscallid(regs) = sysno; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index 91bc74bb569f..313623a19ecb 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -215,7 +215,8 @@ long arch_ptrace(struct task_struct *child, long request, asmlinkage void syscall_trace_enter(struct pt_regs *regs) { if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_entry(regs); + if (tracehook_report_syscall_entry(regs)) + syscall_set_nr(current, regs, -1); if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall_get_nr(current, regs)); -- cgit v1.2.3-59-g8ed1b From ce0c890e2a2f53b4e242c8b2d29ba966c5ee5c5f Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 18 Apr 2019 13:12:09 +0200 Subject: powerpc/512x: mark clocks as big endian These clocks' registers are accessed as big endian, so mark them as such.
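As a reference for what the new flags actually do on the framework side, the common clock framework switches its MMIO accessors between little- and big-endian based on them. The sketch below follows the shape of the divider read path; treat it as an approximation under that assumption rather than a verbatim copy of the common-clk code:

#include <linux/clk-provider.h>
#include <linux/io.h>

/* Approximate sketch: read a divider register while honouring the
 * CLK_DIVIDER_BIG_ENDIAN flag set by patches like this one. */
static inline u32 clk_div_readl(struct clk_divider *divider)
{
	if (divider->flags & CLK_DIVIDER_BIG_ENDIAN)
		return ioread32be(divider->reg);

	return readl(divider->reg);
}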
Signed-off-by: Jonas Gorski Signed-off-by: Stephen Boyd --- arch/powerpc/platforms/512x/clock-commonclk.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/512x/clock-commonclk.c b/arch/powerpc/platforms/512x/clock-commonclk.c index b3097fe6441b..af265ae40a61 100644 --- a/arch/powerpc/platforms/512x/clock-commonclk.c +++ b/arch/powerpc/platforms/512x/clock-commonclk.c @@ -239,6 +239,7 @@ static inline struct clk *mpc512x_clk_divider( const char *name, const char *parent_name, u8 clkflags, u32 __iomem *reg, u8 pos, u8 len, int divflags) { + divflags |= CLK_DIVIDER_BIG_ENDIAN; return clk_register_divider(NULL, name, parent_name, clkflags, reg, pos, len, divflags, &clklock); } @@ -250,7 +251,7 @@ static inline struct clk *mpc512x_clk_divtable( { u8 divflags; - divflags = 0; + divflags = CLK_DIVIDER_BIG_ENDIAN; return clk_register_divider_table(NULL, name, parent_name, 0, reg, pos, len, divflags, divtab, &clklock); @@ -261,10 +262,12 @@ static inline struct clk *mpc512x_clk_gated( u32 __iomem *reg, u8 pos) { int clkflags; + u8 gateflags; clkflags = CLK_SET_RATE_PARENT; + gateflags = CLK_GATE_BIG_ENDIAN; return clk_register_gate(NULL, name, parent_name, clkflags, - reg, pos, 0, &clklock); + reg, pos, gateflags, &clklock); } static inline struct clk *mpc512x_clk_muxed(const char *name, @@ -275,7 +278,7 @@ static inline struct clk *mpc512x_clk_muxed(const char *name, u8 muxflags; clkflags = CLK_SET_RATE_PARENT; - muxflags = 0; + muxflags = CLK_MUX_BIG_ENDIAN; return clk_register_mux(NULL, name, parent_names, parent_count, clkflags, reg, pos, len, muxflags, &clklock); -- cgit v1.2.3-59-g8ed1b From 1e0221374e30a765a02e00f01f68869eac57c8d3 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 23 Apr 2019 13:55:19 -0700 Subject: mips: vdso: drop unnecessary cc-ldoption Towards the goal of removing cc-ldoption, it seems that --hash-style= was added to binutils 2.17.50.0.2 in 2006. The minimal required version of binutils for the kernel according to Documentation/process/changes.rst is 2.20. --build-id was added in 2.18 according to binutils-gdb/ld/NEWS. Link: https://gcc.gnu.org/ml/gcc/2007-01/msg01141.html Cc: clang-built-linux@googlegroups.com Suggested-by: Masahiro Yamada Signed-off-by: Nick Desaulniers Signed-off-by: Paul Burton Cc: ralf@linux-mips.org Cc: jhogan@kernel.org Cc: Matt Redfearn Cc: Joel Stanley Cc: Hassan Naveed Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/vdso/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 0ede4deb8181..7221df24cb23 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -46,9 +46,7 @@ endif VDSO_LDFLAGS := \ -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1 \ $(addprefix -Wl$(comma),$(filter -E%,$(KBUILD_CFLAGS))) \ - -nostdlib -shared \ - $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ - $(call cc-ldoption, -Wl$(comma)--build-id) + -nostdlib -shared -Wl,--hash-style=sysv -Wl,--build-id GCOV_PROFILE := n UBSAN_SANITIZE := n -- cgit v1.2.3-59-g8ed1b From a703db3d5b4b43cb7e46b65a661471015cbcba57 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 24 Apr 2019 01:47:37 +0300 Subject: mips: Make sure kernel .bss exists in boot mem pool Current MIPS platform code makes sure the kernel text, data and init sections are added to the boot memory map pool right after the arch-specific memory setup method has been executed. 
But for some reason the MIPS platform code skipped the kernel .bss section, which definitely should be in the boot mem pool as well. Let's fix this just by adding the space between __bss_start and __bss_stop. Reviewed-by: Matt Redfearn Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 8d1dc6c71173..0ee033c44116 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -809,6 +809,9 @@ static void __init arch_mem_init(char **cmdline_p) arch_mem_addpart(PFN_UP(__pa_symbol(&__init_begin)) << PAGE_SHIFT, PFN_DOWN(__pa_symbol(&__init_end)) << PAGE_SHIFT, BOOT_MEM_INIT_RAM); + arch_mem_addpart(PFN_DOWN(__pa_symbol(&__bss_start)) << PAGE_SHIFT, + PFN_UP(__pa_symbol(&__bss_stop)) << PAGE_SHIFT, + BOOT_MEM_RAM); pr_info("Determined physical RAM map:\n"); print_memory_map(); -- cgit v1.2.3-59-g8ed1b From 6ea3ba6fac31380af86339c37be095a672dd01bb Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 24 Apr 2019 01:47:38 +0300 Subject: mips: Discard rudiments from bootmem_init Some pointless code has been left in the bootmem_init() method since the bootmem allocator removal. The first part resides in the PFN ranges calculation loop: the conditional expressions and the continue operator are useless there, since nothing is done after them. The second part is in the RAM ranges installation loop: we can simplify the cascade of conditions a bit without redefining the logic, so as to reduce the code length. In particular, the end boundary value can be checked against max_low_pfn after its possible reduction. Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Matt Redfearn Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 0ee033c44116..53d93a727d1a 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -394,10 +394,7 @@ static void __init bootmem_init(void) min_low_pfn = ~0UL; max_low_pfn = 0; - /* - * Find the highest page frame number we have available - * and the lowest used RAM address - */ + /* Find the highest and lowest page frame numbers we have available.
*/ for (i = 0; i < boot_mem_map.nr_map; i++) { unsigned long start, end; @@ -427,13 +424,6 @@ static void __init bootmem_init(void) max_low_pfn = end; if (start < min_low_pfn) min_low_pfn = start; - if (end <= reserved_end) - continue; -#ifdef CONFIG_BLK_DEV_INITRD - /* Skip zones before initrd and initrd itself */ - if (initrd_end && end <= (unsigned long)PFN_UP(__pa(initrd_end))) - continue; -#endif } if (min_low_pfn >= max_low_pfn) @@ -474,6 +464,7 @@ static void __init bootmem_init(void) max_low_pfn = PFN_DOWN(HIGHMEM_START); } + /* Install all valid RAM ranges to the memblock memory region */ for (i = 0; i < boot_mem_map.nr_map; i++) { unsigned long start, end; @@ -481,21 +472,15 @@ static void __init bootmem_init(void) end = PFN_DOWN(boot_mem_map.map[i].addr + boot_mem_map.map[i].size); - if (start <= min_low_pfn) + if (start < min_low_pfn) start = min_low_pfn; - if (start >= end) - continue; - #ifndef CONFIG_HIGHMEM + /* Ignore highmem regions if highmem is unsupported */ if (end > max_low_pfn) end = max_low_pfn; - - /* - * ... finally, is the area going away? - */ +#endif if (end <= start) continue; -#endif memblock_add_node(PFN_PHYS(start), PFN_PHYS(end - start), 0); } -- cgit v1.2.3-59-g8ed1b From cf0c4876684d287b6fe72699598517ca9b827216 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 24 Apr 2019 01:47:39 +0300 Subject: mips: Combine memblock init and memory reservation loops Before bootmem was completely removed from the kernel, the last loop in bootmem_init() had been used to reserve the correspondingly marked regions, initialize sparsemem sections and free the low memory pages, which would then be used for early memory allocations. After the bootmem removal patchset was merged, the loop was left to do only the first two things, but it didn't do them quite well. First of all, it leaves the BOOT_MEM_INIT_RAM memory types unreserved, which is definitely a bug (although it isn't noticeable, since that type is used by the kernel region only, which is fully marked as reserved). Secondly, the reservation is supposed to be done for any memory, including highmem. (I couldn't figure out why highmem was ignored in the first place, since platforms and dts files may declare any memory region for reservation.) Thirdly, the reserved_end variable had been used here to avoid accidentally freeing memory occupied by the kernel. Since we already reserved the corresponding region earlier in this method, there is no need to use the variable here anymore. Fourthly, sparsemem should be aware of all the memory types in the system, including ROM_DATA, even if a region is going to be reserved for the whole system uptime. Finally, once all these issues are fixed, the memory reservation loop can be freely merged into the memory installation loop, as is done in this patch.
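The memblock idiom that the patch converges on can be condensed as follows; this is a hypothetical summary fragment, not a literal hunk from the patch:

#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/pfn.h>

/* Hypothetical summary: register a boot memory range with memblock,
 * reserve it when it is not plain RAM, and tell sparsemem about it. */
static void __init register_boot_range(phys_addr_t base, phys_addr_t size,
				       bool plain_ram)
{
	memblock_add_node(base, size, 0);	/* present memory */
	if (!plain_ram)
		memblock_reserve(base, size);	/* kept from the allocator */
	memory_present(0, PFN_DOWN(base), PFN_UP(base + size));
}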
Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Matt Redfearn Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 48 +++++++----------------------------------------- 1 file changed, 7 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 53d93a727d1a..185e0e42e009 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -483,55 +483,21 @@ static void __init bootmem_init(void) continue; memblock_add_node(PFN_PHYS(start), PFN_PHYS(end - start), 0); - } - - /* - * Register fully available low RAM pages with the bootmem allocator. - */ - for (i = 0; i < boot_mem_map.nr_map; i++) { - unsigned long start, end, size; - - start = PFN_UP(boot_mem_map.map[i].addr); - end = PFN_DOWN(boot_mem_map.map[i].addr - + boot_mem_map.map[i].size); - /* - * Reserve usable memory. - */ + /* Reserve any memory except the ordinary RAM ranges. */ switch (boot_mem_map.map[i].type) { case BOOT_MEM_RAM: break; - case BOOT_MEM_INIT_RAM: - memory_present(0, start, end); - continue; - default: - /* Not usable memory */ - if (start > min_low_pfn && end < max_low_pfn) - memblock_reserve(boot_mem_map.map[i].addr, - boot_mem_map.map[i].size); - - continue; + default: /* Reserve the rest of the memory types at boot time */ + memblock_reserve(PFN_PHYS(start), PFN_PHYS(end - start)); + break; } /* - * We are rounding up the start address of usable memory - * and at the end of the usable range downwards. - */ - if (start >= max_low_pfn) - continue; - if (start < reserved_end) - start = reserved_end; - if (end > max_low_pfn) - end = max_low_pfn; - - /* - * ... finally, is the area going away? + * In any case the added to the memblock memory regions + * (highmem/lowmem, available/reserved, etc) are considered + * as present, so inform sparsemem about them. */ - if (end <= start) - continue; - size = end - start; - - /* Register lowmem ranges */ memory_present(0, start, end); } -- cgit v1.2.3-59-g8ed1b From 877b5691f27a1aec0d9b53095a323e45c30069e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 14 Apr 2019 17:37:09 -0700 Subject: crypto: shash - remove shash_desc::flags The flags field in 'struct shash_desc' never actually does anything. The only ostensibly supported flag is CRYPTO_TFM_REQ_MAY_SLEEP. However, no shash algorithm ever sleeps, making this flag a no-op. With this being the case, inevitably some users who can't sleep wrongly pass MAY_SLEEP. These would all need to be fixed if any shash algorithm actually started sleeping. For example, the shash_ahash_*() functions, which wrap a shash algorithm with the ahash API, pass through MAY_SLEEP from the ahash API to the shash API. However, the shash functions are called under kmap_atomic(), so actually they're assumed to never sleep. Even if it turns out that some users do need preemption points while hashing large buffers, we could easily provide a helper function crypto_shash_update_large() which divides the data into smaller chunks and calls crypto_shash_update() and cond_resched() for each chunk. It's not necessary to have a flag in 'struct shash_desc', nor is it necessary to make individual shash algorithms aware of this at all. 
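For illustration, such a helper could look roughly like the sketch below. The name comes from the paragraph above, while the chunk size is an arbitrary assumption; no helper like this is actually added by this patch:

static int crypto_shash_update_large(struct shash_desc *desc,
				     const u8 *data, unsigned int len)
{
	while (len) {
		/* 4 KiB per call is an illustrative chunk size */
		unsigned int chunk = min(len, 4096U);
		int err = crypto_shash_update(desc, data, chunk);

		if (err)
			return err;
		data += chunk;
		len -= chunk;
		cond_resched();	/* the preemption point MAY_SLEEP implied */
	}
	return 0;
}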
Therefore, remove shash_desc::flags, and document that the crypto_shash_*() functions can be called from any context. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- Documentation/crypto/api-samples.rst | 1 - arch/arm/crypto/ghash-ce-glue.c | 3 --- arch/x86/crypto/ghash-clmulni-intel_glue.c | 2 -- arch/x86/power/hibernate.c | 1 - crypto/adiantum.c | 1 - crypto/asymmetric_keys/pkcs7_verify.c | 1 - crypto/asymmetric_keys/verify_pefile.c | 1 - crypto/asymmetric_keys/x509_public_key.c | 1 - crypto/cryptd.c | 3 --- crypto/drbg.c | 1 - crypto/hmac.c | 11 ----------- crypto/shash.c | 4 ---- crypto/testmgr.c | 2 -- drivers/block/drbd/drbd_receiver.c | 1 - drivers/block/drbd/drbd_worker.c | 2 -- drivers/crypto/axis/artpec6_crypto.c | 2 -- drivers/crypto/bcm/cipher.c | 1 - drivers/crypto/bcm/util.c | 1 - drivers/crypto/ccp/ccp-crypto-sha.c | 2 -- drivers/crypto/chelsio/chcr_algo.c | 2 -- drivers/crypto/mediatek/mtk-sha.c | 3 --- drivers/crypto/n2_core.c | 2 -- drivers/crypto/omap-sham.c | 2 -- drivers/crypto/padlock-sha.c | 5 ----- drivers/crypto/qat/qat_common/qat_algs.c | 1 - drivers/crypto/s5p-sss.c | 1 - drivers/crypto/vmx/ghash.c | 1 - drivers/infiniband/sw/rxe/rxe.h | 1 - drivers/md/dm-crypt.c | 3 --- drivers/md/dm-integrity.c | 2 -- drivers/net/ppp/ppp_mppe.c | 1 - drivers/net/wireless/intersil/orinoco/mic.c | 1 - drivers/nfc/s3fwrn5/firmware.c | 1 - drivers/staging/ks7010/ks_hostif.c | 1 - drivers/staging/rtl8192e/rtllib_crypt_tkip.c | 1 - drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c | 1 - drivers/target/iscsi/iscsi_target_auth.c | 1 - drivers/thunderbolt/domain.c | 1 - fs/cifs/misc.c | 1 - fs/crypto/keyinfo.c | 1 - fs/ecryptfs/crypto.c | 1 - fs/ecryptfs/keystore.c | 1 - fs/ext4/ext4.h | 1 - fs/f2fs/f2fs.h | 1 - fs/nfsd/nfs4recover.c | 1 - fs/ubifs/auth.c | 6 ------ fs/ubifs/replay.c | 2 -- include/crypto/hash.h | 10 ++++++++-- include/linux/jbd2.h | 1 - kernel/kexec_file.c | 1 - lib/crc-t10dif.c | 1 - lib/digsig.c | 1 - lib/libcrc32c.c | 1 - net/bluetooth/amp.c | 1 - net/bluetooth/smp.c | 1 - net/sctp/auth.c | 1 - net/sctp/sm_make_chunk.c | 2 -- net/sunrpc/auth_gss/gss_krb5_crypto.c | 2 -- net/sunrpc/auth_gss/gss_krb5_mech.c | 1 - net/wireless/lib80211_crypt_tkip.c | 1 - security/apparmor/crypto.c | 2 -- security/integrity/evm/evm_crypto.c | 1 - security/integrity/ima/ima_crypto.c | 4 ---- security/keys/dh.c | 1 - security/keys/encrypted-keys/encrypted.c | 1 - security/keys/trusted.c | 1 - 66 files changed, 8 insertions(+), 113 deletions(-) (limited to 'arch') diff --git a/Documentation/crypto/api-samples.rst b/Documentation/crypto/api-samples.rst index 0f6ca8b7261e..f14afaaf2f32 100644 --- a/Documentation/crypto/api-samples.rst +++ b/Documentation/crypto/api-samples.rst @@ -133,7 +133,6 @@ Code Example For Use of Operational State Memory With SHASH if (!sdesc) return ERR_PTR(-ENOMEM); sdesc->shash.tfm = alg; - sdesc->shash.flags = 0x0; return sdesc; } diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index 60123e9ea9d8..39d1ccec1aab 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -186,7 +186,6 @@ static int ghash_async_init(struct ahash_request *req) struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); desc->tfm = child; - desc->flags = req->base.flags; return crypto_shash_init(desc); } @@ -243,7 +242,6 @@ static int ghash_async_digest(struct ahash_request *req) struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); desc->tfm = child; - desc->flags = req->base.flags; return 
shash_ahash_digest(req, desc); } } @@ -256,7 +254,6 @@ static int ghash_async_import(struct ahash_request *req, const void *in) struct shash_desc *desc = cryptd_shash_desc(cryptd_req); desc->tfm = cryptd_ahash_child(ctx->cryptd_tfm); - desc->flags = req->base.flags; return crypto_shash_import(desc, in); } diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 4099a0ae17dd..e3f3e6fd9d65 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -172,7 +172,6 @@ static int ghash_async_init(struct ahash_request *req) struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); desc->tfm = child; - desc->flags = req->base.flags; return crypto_shash_init(desc); } @@ -252,7 +251,6 @@ static int ghash_async_digest(struct ahash_request *req) struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); desc->tfm = child; - desc->flags = req->base.flags; return shash_ahash_digest(req, desc); } } diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index bcddf09b5aa3..4845b8c7be7f 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -90,7 +90,6 @@ static int get_e820_md5(struct e820_table *table, void *buf) } desc->tfm = tfm; - desc->flags = 0; size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries; diff --git a/crypto/adiantum.c b/crypto/adiantum.c index e6de50f669aa..395a3ddd3707 100644 --- a/crypto/adiantum.c +++ b/crypto/adiantum.c @@ -265,7 +265,6 @@ static int adiantum_hash_message(struct skcipher_request *req, int err; hash_desc->tfm = tctx->hash; - hash_desc->flags = 0; err = crypto_shash_init(hash_desc); if (err) diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index 97c77f66b20d..f7b0980bf02d 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -56,7 +56,6 @@ static int pkcs7_digest(struct pkcs7_message *pkcs7, goto error_no_desc; desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; /* Digest the message [RFC2315 9.3] */ ret = crypto_shash_digest(desc, pkcs7->data, pkcs7->data_len, diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index d178650fd524..f8e4a932bcfb 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -354,7 +354,6 @@ static int pefile_digest_pe(const void *pebuf, unsigned int pelen, goto error_no_desc; desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ret = crypto_shash_init(desc); if (ret < 0) goto error; diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 9338b4558cdc..bd96683d8cde 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -77,7 +77,6 @@ int x509_get_sig_params(struct x509_certificate *cert) goto error; desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ret = crypto_shash_digest(desc, cert->tbs, cert->tbs_size, sig->digest); if (ret < 0) diff --git a/crypto/cryptd.c b/crypto/cryptd.c index 42533cf80acc..b3bb99390ae7 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -545,7 +545,6 @@ static void cryptd_hash_init(struct crypto_async_request *req_async, int err) goto out; desc->tfm = child; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(desc); @@ -637,7 +636,6 @@ static void cryptd_hash_digest(struct crypto_async_request *req_async, int err) goto out; desc->tfm = child; - 
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = shash_ahash_digest(req, desc); @@ -666,7 +664,6 @@ static int cryptd_hash_import(struct ahash_request *req, const void *in) struct shash_desc *desc = cryptd_shash_desc(req); desc->tfm = ctx->child; - desc->flags = req->base.flags; return crypto_shash_import(desc, in); } diff --git a/crypto/drbg.c b/crypto/drbg.c index 710b3046a4df..2a5b16bb000c 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1587,7 +1587,6 @@ static int drbg_init_hash_kernel(struct drbg_state *drbg) } sdesc->shash.tfm = tfm; - sdesc->shash.flags = 0; drbg->priv_data = sdesc; return crypto_shash_alignmask(tfm); diff --git a/crypto/hmac.c b/crypto/hmac.c index 4ceb3f1f0eb8..a68c1266121f 100644 --- a/crypto/hmac.c +++ b/crypto/hmac.c @@ -57,8 +57,6 @@ static int hmac_setkey(struct crypto_shash *parent, unsigned int i; shash->tfm = hash; - shash->flags = crypto_shash_get_flags(parent) - & CRYPTO_TFM_REQ_MAY_SLEEP; if (keylen > bs) { int err; @@ -91,8 +89,6 @@ static int hmac_export(struct shash_desc *pdesc, void *out) { struct shash_desc *desc = shash_desc_ctx(pdesc); - desc->flags = pdesc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; - return crypto_shash_export(desc, out); } @@ -102,7 +98,6 @@ static int hmac_import(struct shash_desc *pdesc, const void *in) struct hmac_ctx *ctx = hmac_ctx(pdesc->tfm); desc->tfm = ctx->hash; - desc->flags = pdesc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; return crypto_shash_import(desc, in); } @@ -117,8 +112,6 @@ static int hmac_update(struct shash_desc *pdesc, { struct shash_desc *desc = shash_desc_ctx(pdesc); - desc->flags = pdesc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; - return crypto_shash_update(desc, data, nbytes); } @@ -130,8 +123,6 @@ static int hmac_final(struct shash_desc *pdesc, u8 *out) char *opad = crypto_shash_ctx_aligned(parent) + ss; struct shash_desc *desc = shash_desc_ctx(pdesc); - desc->flags = pdesc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; - return crypto_shash_final(desc, out) ?: crypto_shash_import(desc, opad) ?: crypto_shash_finup(desc, out, ds, out); @@ -147,8 +138,6 @@ static int hmac_finup(struct shash_desc *pdesc, const u8 *data, char *opad = crypto_shash_ctx_aligned(parent) + ss; struct shash_desc *desc = shash_desc_ctx(pdesc); - desc->flags = pdesc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; - return crypto_shash_finup(desc, data, nbytes, out) ?: crypto_shash_import(desc, opad) ?: crypto_shash_finup(desc, out, ds, out); diff --git a/crypto/shash.c b/crypto/shash.c index 599468478f7b..e55c1f558bc3 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -238,7 +238,6 @@ static int shash_async_init(struct ahash_request *req) struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; - desc->flags = req->base.flags; return crypto_shash_init(desc); } @@ -293,7 +292,6 @@ static int shash_async_finup(struct ahash_request *req) struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; - desc->flags = req->base.flags; return shash_ahash_finup(req, desc); } @@ -328,7 +326,6 @@ static int shash_async_digest(struct ahash_request *req) struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; - desc->flags = req->base.flags; return shash_ahash_digest(req, desc); } @@ -344,7 +341,6 @@ static int shash_async_import(struct ahash_request *req, const void *in) struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; - desc->flags = req->base.flags; return crypto_shash_import(desc, in); } diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 87abfd1ce232..2bd89a65e9e7 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1328,7 
+1328,6 @@ static void generate_random_hash_testvec(struct crypto_shash *tfm, /* Digest */ desc->tfm = tfm; - desc->flags = 0; vec->digest_error = crypto_shash_digest(desc, vec->plaintext, vec->psize, (u8 *)vec->digest); done: @@ -3027,7 +3026,6 @@ static int alg_test_crc32c(const struct alg_test_desc *desc, u32 *ctx = (u32 *)shash_desc_ctx(shash); shash->tfm = tfm; - shash->flags = 0; *ctx = 420553207; err = crypto_shash_final(shash, (u8 *)&val); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c7ad88d91a09..843a9b9b3d74 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -5443,7 +5443,6 @@ static int drbd_do_auth(struct drbd_connection *connection) rcu_read_unlock(); desc->tfm = connection->cram_hmac_tfm; - desc->flags = 0; rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len); if (rv) { diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 268ef0c5d4ab..6781bcf3ec26 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -304,7 +304,6 @@ void drbd_csum_ee(struct crypto_shash *tfm, struct drbd_peer_request *peer_req, void *src; desc->tfm = tfm; - desc->flags = 0; crypto_shash_init(desc); @@ -332,7 +331,6 @@ void drbd_csum_bio(struct crypto_shash *tfm, struct bio *bio, void *digest) struct bvec_iter iter; desc->tfm = tfm; - desc->flags = 0; crypto_shash_init(desc); diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index 57e5dca3253f..d2fb72811442 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -2247,8 +2247,6 @@ artpec6_crypto_hash_set_key(struct crypto_ahash *tfm, SHASH_DESC_ON_STACK(hdesc, tfm_ctx->child_hash); hdesc->tfm = tfm_ctx->child_hash; - hdesc->flags = crypto_ahash_get_flags(tfm) & - CRYPTO_TFM_REQ_MAY_SLEEP; tfm_ctx->hmac_key_length = blocksize; ret = crypto_shash_digest(hdesc, key, keylen, diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 8862200d4a0b..25f8d3913ceb 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -2140,7 +2140,6 @@ static int ahash_init(struct ahash_request *req) goto err_hash; } ctx->shash->tfm = hash; - ctx->shash->flags = 0; /* Set the key using data we already have from setkey */ if (ctx->authkeylen > 0) { diff --git a/drivers/crypto/bcm/util.c b/drivers/crypto/bcm/util.c index d8cda5fb75ad..91ec56399d84 100644 --- a/drivers/crypto/bcm/util.c +++ b/drivers/crypto/bcm/util.c @@ -242,7 +242,6 @@ int do_shash(unsigned char *name, unsigned char *result, goto do_shash_err; } sdesc->shash.tfm = hash; - sdesc->shash.flags = 0x0; if (key_len > 0) { rc = crypto_shash_setkey(hash, key, key_len); diff --git a/drivers/crypto/ccp/ccp-crypto-sha.c b/drivers/crypto/ccp/ccp-crypto-sha.c index 10a61cd54fce..3e10573f589e 100644 --- a/drivers/crypto/ccp/ccp-crypto-sha.c +++ b/drivers/crypto/ccp/ccp-crypto-sha.c @@ -293,8 +293,6 @@ static int ccp_sha_setkey(struct crypto_ahash *tfm, const u8 *key, if (key_len > block_size) { /* Must hash the input key */ sdesc->tfm = shash; - sdesc->flags = crypto_ahash_get_flags(tfm) & - CRYPTO_TFM_REQ_MAY_SLEEP; ret = crypto_shash_digest(sdesc, key, key_len, ctx->u.sha.key); diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 8d8cf80b9294..8a76fce22943 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -2130,7 +2130,6 @@ static int chcr_ahash_setkey(struct 
crypto_ahash *tfm, const u8 *key, * ipad in hmacctx->ipad and opad in hmacctx->opad location */ shash->tfm = hmacctx->base_hash; - shash->flags = crypto_shash_get_flags(hmacctx->base_hash); if (keylen > bs) { err = crypto_shash_digest(shash, key, keylen, hmacctx->ipad); @@ -3517,7 +3516,6 @@ static int chcr_authenc_setkey(struct crypto_aead *authenc, const u8 *key, SHASH_DESC_ON_STACK(shash, base_hash); shash->tfm = base_hash; - shash->flags = crypto_shash_get_flags(base_hash); bs = crypto_shash_blocksize(base_hash); align = KEYCTX_ALIGN_PAD(max_authsize); o_ptr = actx->h_iopad + param.result_size + align; diff --git a/drivers/crypto/mediatek/mtk-sha.c b/drivers/crypto/mediatek/mtk-sha.c index 5f4f845adbb8..a0806ba40c68 100644 --- a/drivers/crypto/mediatek/mtk-sha.c +++ b/drivers/crypto/mediatek/mtk-sha.c @@ -365,7 +365,6 @@ static int mtk_sha_finish_hmac(struct ahash_request *req) SHASH_DESC_ON_STACK(shash, bctx->shash); shash->tfm = bctx->shash; - shash->flags = 0; /* not CRYPTO_TFM_REQ_MAY_SLEEP */ return crypto_shash_init(shash) ?: crypto_shash_update(shash, bctx->opad, ctx->bs) ?: @@ -810,8 +809,6 @@ static int mtk_sha_setkey(struct crypto_ahash *tfm, const u8 *key, SHASH_DESC_ON_STACK(shash, bctx->shash); shash->tfm = bctx->shash; - shash->flags = crypto_shash_get_flags(bctx->shash) & - CRYPTO_TFM_REQ_MAY_SLEEP; if (keylen > bs) { err = crypto_shash_digest(shash, key, keylen, bctx->ipad); diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index df675aea58f6..0d5d3d8eb680 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -469,8 +469,6 @@ static int n2_hmac_async_setkey(struct crypto_ahash *tfm, const u8 *key, return err; shash->tfm = child_shash; - shash->flags = crypto_ahash_get_flags(tfm) & - CRYPTO_TFM_REQ_MAY_SLEEP; bs = crypto_shash_blocksize(child_shash); ds = crypto_shash_digestsize(child_shash); diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c index 0641185bd82f..51b20abac464 100644 --- a/drivers/crypto/omap-sham.c +++ b/drivers/crypto/omap-sham.c @@ -1055,7 +1055,6 @@ static int omap_sham_finish_hmac(struct ahash_request *req) SHASH_DESC_ON_STACK(shash, bctx->shash); shash->tfm = bctx->shash; - shash->flags = 0; /* not CRYPTO_TFM_REQ_MAY_SLEEP */ return crypto_shash_init(shash) ?: crypto_shash_update(shash, bctx->opad, bs) ?: @@ -1226,7 +1225,6 @@ static int omap_sham_shash_digest(struct crypto_shash *tfm, u32 flags, SHASH_DESC_ON_STACK(shash, tfm); shash->tfm = tfm; - shash->flags = flags & CRYPTO_TFM_REQ_MAY_SLEEP; return crypto_shash_digest(shash, data, len, out); } diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c index 21e5cae0a1e0..e641481a3cd9 100644 --- a/drivers/crypto/padlock-sha.c +++ b/drivers/crypto/padlock-sha.c @@ -39,7 +39,6 @@ static int padlock_sha_init(struct shash_desc *desc) struct padlock_sha_ctx *ctx = crypto_shash_ctx(desc->tfm); dctx->fallback.tfm = ctx->fallback; - dctx->fallback.flags = desc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; return crypto_shash_init(&dctx->fallback); } @@ -48,7 +47,6 @@ static int padlock_sha_update(struct shash_desc *desc, { struct padlock_sha_desc *dctx = shash_desc_ctx(desc); - dctx->fallback.flags = desc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; return crypto_shash_update(&dctx->fallback, data, length); } @@ -65,7 +63,6 @@ static int padlock_sha_import(struct shash_desc *desc, const void *in) struct padlock_sha_ctx *ctx = crypto_shash_ctx(desc->tfm); dctx->fallback.tfm = ctx->fallback; - dctx->fallback.flags = desc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; return 
crypto_shash_import(&dctx->fallback, in); } @@ -91,7 +88,6 @@ static int padlock_sha1_finup(struct shash_desc *desc, const u8 *in, unsigned int leftover; int err; - dctx->fallback.flags = desc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_export(&dctx->fallback, &state); if (err) goto out; @@ -153,7 +149,6 @@ static int padlock_sha256_finup(struct shash_desc *desc, const u8 *in, unsigned int leftover; int err; - dctx->fallback.flags = desc->flags & CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_export(&dctx->fallback, &state); if (err) goto out; diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index 975c75198f56..c8d401646902 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -164,7 +164,6 @@ static int qat_alg_do_precomputes(struct icp_qat_hw_auth_algo_blk *hash, memset(ctx->ipad, 0, block_size); memset(ctx->opad, 0, block_size); shash->tfm = ctx->hash_tfm; - shash->flags = 0x0; if (auth_keylen > block_size) { int ret = crypto_shash_digest(shash, auth_key, diff --git a/drivers/crypto/s5p-sss.c b/drivers/crypto/s5p-sss.c index 1afdcb81d8ed..9ef25230c199 100644 --- a/drivers/crypto/s5p-sss.c +++ b/drivers/crypto/s5p-sss.c @@ -1534,7 +1534,6 @@ static int s5p_hash_shash_digest(struct crypto_shash *tfm, u32 flags, SHASH_DESC_ON_STACK(shash, tfm); shash->tfm = tfm; - shash->flags = flags & ~CRYPTO_TFM_REQ_MAY_SLEEP; return crypto_shash_digest(shash, data, len, out); } diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c index 611ff591410e..b5a6883bb09e 100644 --- a/drivers/crypto/vmx/ghash.c +++ b/drivers/crypto/vmx/ghash.c @@ -101,7 +101,6 @@ static int p8_ghash_init(struct shash_desc *desc) dctx->bytes = 0; memset(dctx->shash, 0, GHASH_DIGEST_SIZE); dctx->fallback_desc.tfm = ctx->fallback; - dctx->fallback_desc.flags = desc->flags; return crypto_shash_init(&dctx->fallback_desc); } diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 2e2dff478833..ecf6e659c0da 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -80,7 +80,6 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe, SHASH_DESC_ON_STACK(shash, rxe->tfm); shash->tfm = rxe->tfm; - shash->flags = 0; *(u32 *)shash_desc_ctx(shash) = crc; err = crypto_shash_update(shash, next, len); if (unlikely(err)) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index dd6565798778..9faed1c92b52 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -332,7 +332,6 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) int err; desc->tfm = essiv->hash_tfm; - desc->flags = 0; err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt); shash_desc_zero(desc); @@ -606,7 +605,6 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, int i, r; desc->tfm = lmk->hash_tfm; - desc->flags = 0; r = crypto_shash_init(desc); if (r) @@ -768,7 +766,6 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, /* calculate crc32 for every 32bit part and xor it */ desc->tfm = tcw->crc32_tfm; - desc->flags = 0; for (i = 0; i < 4; i++) { r = crypto_shash_init(desc); if (r) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index d57d997a52c8..1366d886907c 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -532,7 +532,6 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result unsigned j, size; desc->tfm = ic->journal_mac; - desc->flags = 0; r = crypto_shash_init(desc); if 
(unlikely(r)) { @@ -1278,7 +1277,6 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector unsigned digest_size; req->tfm = ic->internal_hash; - req->flags = 0; r = crypto_shash_init(req); if (unlikely(r < 0)) { diff --git a/drivers/net/ppp/ppp_mppe.c b/drivers/net/ppp/ppp_mppe.c index 7ccdc62c6052..ff61dd8748de 100644 --- a/drivers/net/ppp/ppp_mppe.c +++ b/drivers/net/ppp/ppp_mppe.c @@ -222,7 +222,6 @@ static void *mppe_alloc(unsigned char *options, int optlen) goto out_free; } state->sha1->tfm = shash; - state->sha1->flags = 0; digestsize = crypto_shash_digestsize(shash); if (digestsize < MPPE_MAX_KEY_LEN) diff --git a/drivers/net/wireless/intersil/orinoco/mic.c b/drivers/net/wireless/intersil/orinoco/mic.c index 67b0c05afbdb..a324bc4b7938 100644 --- a/drivers/net/wireless/intersil/orinoco/mic.c +++ b/drivers/net/wireless/intersil/orinoco/mic.c @@ -65,7 +65,6 @@ int orinoco_mic(struct crypto_shash *tfm_michael, u8 *key, hdr[ETH_ALEN * 2 + 3] = 0; desc->tfm = tfm_michael; - desc->flags = 0; err = crypto_shash_setkey(tfm_michael, key, MIC_KEYLEN); if (err) diff --git a/drivers/nfc/s3fwrn5/firmware.c b/drivers/nfc/s3fwrn5/firmware.c index b7828fb252f2..b681073ae8ba 100644 --- a/drivers/nfc/s3fwrn5/firmware.c +++ b/drivers/nfc/s3fwrn5/firmware.c @@ -449,7 +449,6 @@ int s3fwrn5_fw_download(struct s3fwrn5_fw_info *fw_info) SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ret = crypto_shash_digest(desc, fw->image, image_size, hash_data); diff --git a/drivers/staging/ks7010/ks_hostif.c b/drivers/staging/ks7010/ks_hostif.c index 06ebea0be118..122d4c0af363 100644 --- a/drivers/staging/ks7010/ks_hostif.c +++ b/drivers/staging/ks7010/ks_hostif.c @@ -219,7 +219,6 @@ michael_mic(u8 *key, u8 *data, unsigned int len, u8 priority, u8 *result) } desc->tfm = tfm; - desc->flags = 0; ret = crypto_shash_init(desc); if (ret < 0) diff --git a/drivers/staging/rtl8192e/rtllib_crypt_tkip.c b/drivers/staging/rtl8192e/rtllib_crypt_tkip.c index 55da8c9dfe50..a084e1501f9d 100644 --- a/drivers/staging/rtl8192e/rtllib_crypt_tkip.c +++ b/drivers/staging/rtl8192e/rtllib_crypt_tkip.c @@ -507,7 +507,6 @@ static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, int err; desc->tfm = tfm_michael; - desc->flags = 0; if (crypto_shash_setkey(tfm_michael, key, 8)) return -1; diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c index 829fa4bd253c..d67bb57994c4 100644 --- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c +++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c @@ -503,7 +503,6 @@ static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, int err; desc->tfm = tfm_michael; - desc->flags = 0; if (crypto_shash_setkey(tfm_michael, key, 8)) return -1; diff --git a/drivers/target/iscsi/iscsi_target_auth.c b/drivers/target/iscsi/iscsi_target_auth.c index 4e680d753941..ca7d7e8aecc0 100644 --- a/drivers/target/iscsi/iscsi_target_auth.c +++ b/drivers/target/iscsi/iscsi_target_auth.c @@ -252,7 +252,6 @@ static int chap_server_compute_md5( } desc->tfm = tfm; - desc->flags = 0; ret = crypto_shash_init(desc); if (ret < 0) { diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c index 7416bdbd8576..b7980c856898 100644 --- a/drivers/thunderbolt/domain.c +++ b/drivers/thunderbolt/domain.c @@ -678,7 +678,6 @@ int tb_domain_challenge_switch_key(struct tb *tb, struct tb_switch *sw) } shash->tfm = tfm; - shash->flags = 
CRYPTO_TFM_REQ_MAY_SLEEP; memset(hmac, 0, sizeof(hmac)); ret = crypto_shash_digest(shash, challenge, sizeof(hmac), hmac); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bee203055b30..1a6040998227 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -917,7 +917,6 @@ cifs_alloc_hash(const char *name, } (*sdesc)->shash.tfm = *shash; - (*sdesc)->shash.flags = 0x0; return 0; } diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 322ce9686bdb..2cb4956f8511 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -402,7 +402,6 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) { SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = 0; return crypto_shash_digest(desc, key, keysize, salt); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f664da55234e..491cf5baa8c2 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -68,7 +68,6 @@ static int ecryptfs_hash_digest(struct crypto_shash *tfm, int err; desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(desc, src, len, dst); shash_desc_zero(desc); return err; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index e74fe84d0886..90fbac5d485b 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -769,7 +769,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, } s->hash_desc->tfm = s->hash_tfm; - s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; rc = crypto_shash_digest(s->hash_desc, (u8 *)s->auth_tok->token.password.session_key_encryption_key, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 82ffdacdc7fa..0833b5fc0668 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2024,7 +2024,6 @@ static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); desc.shash.tfm = sbi->s_chksum_driver; - desc.shash.flags = 0; *(u32 *)desc.ctx = crc; BUG_ON(crypto_shash_update(&desc.shash, address, length)); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 87f75ebd2fd6..21b0ab6bd15a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1422,7 +1422,6 @@ static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); desc.shash.tfm = sbi->s_chksum_driver; - desc.shash.flags = 0; *(u32 *)desc.ctx = crc; err = crypto_shash_update(&desc.shash, address, length); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5188f9f70c78..8c8563441208 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -126,7 +126,6 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; status = crypto_shash_digest(desc, clname->data, clname->len, cksum.data); diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 5bf5fd08879e..b758004085c4 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -33,7 +33,6 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, int err; shash->tfm = c->hash_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash); if (err < 0) @@ -56,7 +55,6 @@ static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, int err; shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(shash, hash, c->hash_len, hmac); if (err < 0) @@ -88,7 +86,6 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, return -ENOMEM; 
hash_desc->tfm = c->hash_tfm; - hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ubifs_shash_copy_state(c, inhash, hash_desc); err = crypto_shash_final(hash_desc, hash); @@ -123,7 +120,6 @@ static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c, return ERR_PTR(-ENOMEM); desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(desc); if (err) { @@ -364,7 +360,6 @@ static int ubifs_node_calc_hmac(const struct ubifs_info *c, const void *node, ubifs_assert(c, ofs_hmac + hmac_len < len); shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(shash); if (err) @@ -483,7 +478,6 @@ int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac) return 0; shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(shash); if (err) diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 0a0e65c07c6d..5c8a81a019a4 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -576,7 +576,6 @@ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_h SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); hash_desc->tfm = c->hash_tfm; - hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ubifs_shash_copy_state(c, log_hash, hash_desc); return crypto_shash_final(hash_desc, hash); @@ -587,7 +586,6 @@ static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac) SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); hmac_desc->tfm = c->hmac_tfm; - hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac); } diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 3b31c1b349ae..d21bea2c4382 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -146,8 +146,6 @@ struct ahash_alg { struct shash_desc { struct crypto_shash *tfm; - u32 flags; - void *__ctx[] CRYPTO_MINALIGN_ATTR; }; @@ -819,6 +817,7 @@ static inline void *shash_desc_ctx(struct shash_desc *desc) * cipher handle must point to a keyed message digest cipher in order for this * function to succeed. * + * Context: Any context. * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, @@ -835,6 +834,7 @@ int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate three functions. * + * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ @@ -850,6 +850,7 @@ int crypto_shash_digest(struct shash_desc *desc, const u8 *data, * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_shash_descsize). * + * Context: Any context. * Return: 0 if the export creation was successful; < 0 if an error occurred */ static inline int crypto_shash_export(struct shash_desc *desc, void *out) @@ -866,6 +867,7 @@ static inline int crypto_shash_export(struct shash_desc *desc, void *out) * the input buffer. That buffer should have been generated with the * crypto_ahash_export function. * + * Context: Any context. * Return: 0 if the import was successful; < 0 if an error occurred */ static inline int crypto_shash_import(struct shash_desc *desc, const void *in) @@ -886,6 +888,7 @@ static inline int crypto_shash_import(struct shash_desc *desc, const void *in) * operational state handle. Any potentially existing state created by * previous operations is discarded. * + * Context: Any context. 
* Return: 0 if the message digest initialization was successful; < 0 if an * error occurred */ @@ -907,6 +910,7 @@ static inline int crypto_shash_init(struct shash_desc *desc) * * Updates the message digest state of the operational state handle. * + * Context: Any context. * Return: 0 if the message digest update was successful; < 0 if an error * occurred */ @@ -923,6 +927,7 @@ int crypto_shash_update(struct shash_desc *desc, const u8 *data, * into the output buffer. The caller must ensure that the output buffer is * large enough by using crypto_shash_digestsize. * + * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ @@ -939,6 +944,7 @@ int crypto_shash_final(struct shash_desc *desc, u8 *out); * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate functions. * + * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0f919d5fe84f..c2ffff5f9ae2 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1606,7 +1606,6 @@ static inline u32 jbd2_chksum(journal_t *journal, u32 crc, JBD_MAX_CHECKSUM_SIZE); desc.shash.tfm = journal->j_chksum_driver; - desc.shash.flags = 0; *(u32 *)desc.ctx = crc; err = crypto_shash_update(&desc.shash, address, length); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1d0e00a3971..f7fb8f6a688f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image) goto out_free_desc; desc->tfm = tfm; - desc->flags = 0; ret = crypto_shash_init(desc); if (ret < 0) diff --git a/lib/crc-t10dif.c b/lib/crc-t10dif.c index 4d0d47c1ffbd..e89ebfdbb0fc 100644 --- a/lib/crc-t10dif.c +++ b/lib/crc-t10dif.c @@ -69,7 +69,6 @@ __u16 crc_t10dif_update(__u16 crc, const unsigned char *buffer, size_t len) rcu_read_lock(); desc.shash.tfm = rcu_dereference(crct10dif_tfm); - desc.shash.flags = 0; *(__u16 *)desc.ctx = crc; err = crypto_shash_update(&desc.shash, buffer, len); diff --git a/lib/digsig.c b/lib/digsig.c index 6ba6fcd92dd1..3b0a579bdcdf 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -240,7 +240,6 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen, goto err; desc->tfm = shash; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; crypto_shash_init(desc); crypto_shash_update(desc, data, datalen); diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index f0a2934605bf..4e9829c4d64c 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -47,7 +47,6 @@ u32 crc32c(u32 crc, const void *address, unsigned int length) int err; shash->tfm = tfm; - shash->flags = 0; *ctx = crc; err = crypto_shash_update(shash, address, length); diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index 78bec8df8525..aaa39409eeb7 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -161,7 +161,6 @@ static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output) } shash->tfm = tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ret = crypto_shash_digest(shash, plaintext, psize, output); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 621146d04c03..e68c715f8d37 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -183,7 +183,6 @@ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m, } desc->tfm = tfm; - desc->flags = 0; /* Swap key and message from LSB to MSB */ swap_buf(k, tmp, 16); diff --git a/net/sctp/auth.c 
b/net/sctp/auth.c index 39d72e58b8e5..31569f4809f6 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -760,7 +760,6 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = 0; crypto_shash_digest(desc, (u8 *)auth, end - (unsigned char *)auth, digest); shash_desc_zero(desc); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d05c57664e36..72e74503f9fc 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1684,7 +1684,6 @@ static struct sctp_cookie_param *sctp_pack_cookie( /* Sign the message. */ desc->tfm = sctp_sk(ep->base.sk)->hmac; - desc->flags = 0; err = crypto_shash_setkey(desc->tfm, ep->secret_key, sizeof(ep->secret_key)) ?: @@ -1755,7 +1754,6 @@ struct sctp_association *sctp_unpack_cookie( int err; desc->tfm = sctp_sk(ep->base.sk)->hmac; - desc->flags = 0; err = crypto_shash_setkey(desc->tfm, ep->secret_key, sizeof(ep->secret_key)) ?: diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 4f43383971ba..6f2d30d7b766 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -977,7 +977,6 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, } desc->tfm = hmac; - desc->flags = 0; /* Compute intermediate Kseq from session key */ err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); @@ -1045,7 +1044,6 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, } desc->tfm = hmac; - desc->flags = 0; /* Compute intermediate Kcrypt from session key */ for (i = 0; i < kctx->gk5e->keylength; i++) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 56cc85c5bc06..6e5d6d240215 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -438,7 +438,6 @@ context_derive_keys_rc4(struct krb5_ctx *ctx) } desc->tfm = hmac; - desc->flags = 0; err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum); kzfree(desc); diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index 35f06563207d..11eaa5956f00 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -501,7 +501,6 @@ static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, } desc->tfm = tfm_michael; - desc->flags = 0; if (crypto_shash_setkey(tfm_michael, key, 8)) return -1; diff --git a/security/apparmor/crypto.c b/security/apparmor/crypto.c index af03d98c7552..baba63bc66b1 100644 --- a/security/apparmor/crypto.c +++ b/security/apparmor/crypto.c @@ -43,7 +43,6 @@ char *aa_calc_hash(void *data, size_t len) goto fail; desc->tfm = apparmor_tfm; - desc->flags = 0; error = crypto_shash_init(desc); if (error) @@ -81,7 +80,6 @@ int aa_calc_profile_hash(struct aa_profile *profile, u32 version, void *start, goto fail; desc->tfm = apparmor_tfm; - desc->flags = 0; error = crypto_shash_init(desc); if (error) diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index c37d08118af5..e11564eb645b 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -124,7 +124,6 @@ out: return ERR_PTR(-ENOMEM); desc->tfm = *tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; rc = crypto_shash_init(desc); if (rc) { diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 16a4f45863b1..a32878e10ebc 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -333,7 +333,6 @@ static 
int ima_calc_file_hash_tfm(struct file *file, SHASH_DESC_ON_STACK(shash, tfm); shash->tfm = tfm; - shash->flags = 0; hash->length = crypto_shash_digestsize(tfm); @@ -469,7 +468,6 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, int rc, i; shash->tfm = tfm; - shash->flags = 0; hash->length = crypto_shash_digestsize(tfm); @@ -591,7 +589,6 @@ static int calc_buffer_shash_tfm(const void *buf, loff_t size, int rc; shash->tfm = tfm; - shash->flags = 0; hash->length = crypto_shash_digestsize(tfm); @@ -664,7 +661,6 @@ static int __init ima_calc_boot_aggregate_tfm(char *digest, SHASH_DESC_ON_STACK(shash, tfm); shash->tfm = tfm; - shash->flags = 0; rc = crypto_shash_init(shash); if (rc != 0) diff --git a/security/keys/dh.c b/security/keys/dh.c index 711e89d8c415..23f95dec771b 100644 --- a/security/keys/dh.c +++ b/security/keys/dh.c @@ -112,7 +112,6 @@ static int kdf_alloc(struct kdf_sdesc **sdesc_ret, char *hashname) if (!sdesc) goto out_free_tfm; sdesc->shash.tfm = tfm; - sdesc->shash.flags = 0x0; *sdesc_ret = sdesc; diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 347108f660a1..1b1456b21a93 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -333,7 +333,6 @@ static int calc_hash(struct crypto_shash *tfm, u8 *digest, int err; desc->tfm = tfm; - desc->flags = 0; err = crypto_shash_digest(desc, buf, buflen, digest); shash_desc_zero(desc); diff --git a/security/keys/trusted.c b/security/keys/trusted.c index bcc9c6ead7fd..45ffd9e53937 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -55,7 +55,6 @@ static struct sdesc *init_sdesc(struct crypto_shash *alg) if (!sdesc) return ERR_PTR(-ENOMEM); sdesc->shash.tfm = alg; - sdesc->shash.flags = 0x0; return sdesc; } -- cgit v1.2.3-59-g8ed1b From a348f05361c968cf4a54b6a4f3aeb6f9a271956a Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Thu, 4 Apr 2019 11:11:03 +0300 Subject: ARM: omap2+: hwmod: drop CLK_IS_BASIC flag usage The CLK_IS_BASIC flag is about to be deprecated and, as such, can't be used any more. Instead, the API call that checks whether a clock is of type hw_omap should be used, so convert the code accordingly. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren Signed-off-by: Stephen Boyd --- arch/arm/mach-omap2/omap_hwmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 3a04c73ac03c..baadddf9aad4 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -648,10 +648,10 @@ static struct clockdomain *_get_clkdm(struct omap_hwmod *oh) if (oh->clkdm) { return oh->clkdm; } else if (oh->_clk) { - if (__clk_get_flags(oh->_clk) & CLK_IS_BASIC) + if (!omap2_clk_is_hw_omap(__clk_get_hw(oh->_clk))) return NULL; clk = to_clk_hw_omap(__clk_get_hw(oh->_clk)); - return clk->clkdm; + return clk->clkdm; } return NULL; } -- cgit v1.2.3-59-g8ed1b From 869decd1ff197c3083cb8b58f7dcac201038c381 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Thu, 4 Apr 2019 11:11:05 +0300 Subject: clk: ti: dra7: disable the RNG and TIMER12 clkctrl clocks on HS devices RNG and TIMER12 are reserved for secure side usage only on HS devices, so disable their clkctrl clocks on HS SoCs also. Signed-off-by: Tero Kristo Tested-by: Andrew F.
Davis Signed-off-by: Stephen Boyd --- arch/arm/mach-omap2/clock.c | 3 +++ drivers/clk/ti/clk-7xx-compat.c | 4 ++-- drivers/clk/ti/clk-7xx.c | 4 ++-- drivers/clk/ti/clkctrl.c | 3 +++ drivers/clk/ti/clock.h | 9 +++++---- include/linux/clk/ti.h | 1 + 6 files changed, 16 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c index 42881f21cede..3e0f09cc0028 100644 --- a/arch/arm/mach-omap2/clock.c +++ b/arch/arm/mach-omap2/clock.c @@ -119,6 +119,9 @@ void __init ti_clk_init_features(void) if (cpu_is_omap343x()) features.flags |= TI_CLK_DPLL_HAS_FREQSEL; + if (omap_type() == OMAP2_DEVICE_TYPE_GP) + features.flags |= TI_CLK_DEVICE_TYPE_GP; + /* Idlest value for interface clocks. * 24xx uses 0 to indicate not ready, and 1 to indicate ready. * 34xx reverses this, just to keep us on our toes diff --git a/drivers/clk/ti/clk-7xx-compat.c b/drivers/clk/ti/clk-7xx-compat.c index 0d53bd0998ba..b3cd2296f84b 100644 --- a/drivers/clk/ti/clk-7xx-compat.c +++ b/drivers/clk/ti/clk-7xx-compat.c @@ -662,7 +662,7 @@ static const struct omap_clkctrl_reg_data dra7_l4per_clkctrl_regs[] __initconst { DRA7_AES1_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div", "l4sec_clkdm" }, { DRA7_AES2_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div", "l4sec_clkdm" }, { DRA7_DES_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div", "l4sec_clkdm" }, - { DRA7_RNG_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div", "l4sec_clkdm" }, + { DRA7_RNG_CLKCTRL, NULL, CLKF_HW_SUP | CLKF_SOC_NONSEC, "l3_iclk_div", "l4sec_clkdm" }, { DRA7_SHAM_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div", "l4sec_clkdm" }, { DRA7_UART7_CLKCTRL, dra7_uart7_bit_data, CLKF_SW_SUP, "l4per_cm:clk:01d0:24", "l4per2_clkdm" }, { DRA7_UART8_CLKCTRL, dra7_uart8_bit_data, CLKF_SW_SUP, "l4per_cm:clk:01e0:24", "l4per2_clkdm" }, @@ -704,7 +704,7 @@ static const struct omap_clkctrl_reg_data dra7_wkupaon_clkctrl_regs[] __initcons { DRA7_WD_TIMER2_CLKCTRL, NULL, CLKF_SW_SUP, "sys_32k_ck" }, { DRA7_GPIO1_CLKCTRL, dra7_gpio1_bit_data, CLKF_HW_SUP, "wkupaon_iclk_mux" }, { DRA7_TIMER1_CLKCTRL, dra7_timer1_bit_data, CLKF_SW_SUP, "wkupaon_cm:clk:0020:24" }, - { DRA7_TIMER12_CLKCTRL, NULL, 0, "secure_32k_clk_src_ck" }, + { DRA7_TIMER12_CLKCTRL, NULL, CLKF_SOC_NONSEC, "secure_32k_clk_src_ck" }, { DRA7_COUNTER_32K_CLKCTRL, NULL, 0, "wkupaon_iclk_mux" }, { DRA7_UART10_CLKCTRL, dra7_uart10_bit_data, CLKF_SW_SUP, "wkupaon_cm:clk:0060:24" }, { DRA7_DCAN1_CLKCTRL, dra7_dcan1_bit_data, CLKF_SW_SUP, "wkupaon_cm:clk:0068:24" }, diff --git a/drivers/clk/ti/clk-7xx.c b/drivers/clk/ti/clk-7xx.c index 098c342d9c36..79186b918d87 100644 --- a/drivers/clk/ti/clk-7xx.c +++ b/drivers/clk/ti/clk-7xx.c @@ -590,7 +590,7 @@ static const struct omap_clkctrl_reg_data dra7_l4sec_clkctrl_regs[] __initconst { DRA7_L4SEC_AES1_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div" }, { DRA7_L4SEC_AES2_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div" }, { DRA7_L4SEC_DES_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div" }, - { DRA7_L4SEC_RNG_CLKCTRL, NULL, CLKF_HW_SUP, "" }, + { DRA7_L4SEC_RNG_CLKCTRL, NULL, CLKF_HW_SUP | CLKF_SOC_NONSEC, "" }, { DRA7_L4SEC_SHAM_CLKCTRL, NULL, CLKF_HW_SUP, "l3_iclk_div" }, { 0 }, }; @@ -757,7 +757,7 @@ static const struct omap_clkctrl_reg_data dra7_wkupaon_clkctrl_regs[] __initcons { DRA7_WKUPAON_WD_TIMER2_CLKCTRL, NULL, CLKF_SW_SUP, "sys_32k_ck" }, { DRA7_WKUPAON_GPIO1_CLKCTRL, dra7_gpio1_bit_data, CLKF_HW_SUP, "wkupaon_iclk_mux" }, { DRA7_WKUPAON_TIMER1_CLKCTRL, dra7_timer1_bit_data, CLKF_SW_SUP, "wkupaon-clkctrl:0020:24" }, - { DRA7_WKUPAON_TIMER12_CLKCTRL, NULL, 0, 
"secure_32k_clk_src_ck" }, + { DRA7_WKUPAON_TIMER12_CLKCTRL, NULL, CLKF_SOC_NONSEC, "secure_32k_clk_src_ck" }, { DRA7_WKUPAON_COUNTER_32K_CLKCTRL, NULL, 0, "wkupaon_iclk_mux" }, { DRA7_WKUPAON_UART10_CLKCTRL, dra7_uart10_bit_data, CLKF_SW_SUP, "wkupaon-clkctrl:0060:24" }, { DRA7_WKUPAON_DCAN1_CLKCTRL, dra7_dcan1_bit_data, CLKF_SW_SUP, "wkupaon-clkctrl:0068:24" }, diff --git a/drivers/clk/ti/clkctrl.c b/drivers/clk/ti/clkctrl.c index 4cdeb8d4830c..96d65a1cf7be 100644 --- a/drivers/clk/ti/clkctrl.c +++ b/drivers/clk/ti/clkctrl.c @@ -509,6 +509,9 @@ static void __init _ti_omap4_clkctrl_setup(struct device_node *node) data = dm816_clkctrl_data; #endif + if (ti_clk_get_features()->flags & TI_CLK_DEVICE_TYPE_GP) + soc_mask |= CLKF_SOC_NONSEC; + while (data->addr) { if (addr == data->addr) break; diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h index 773e2c4ac390..e4b8392ff63c 100644 --- a/drivers/clk/ti/clock.h +++ b/drivers/clk/ti/clock.h @@ -83,11 +83,12 @@ enum { #define CLKF_HW_SUP BIT(6) #define CLKF_NO_IDLEST BIT(7) -#define CLKF_SOC_MASK GENMASK(10, 8) +#define CLKF_SOC_MASK GENMASK(11, 8) -#define CLKF_SOC_DRA72 BIT(8) -#define CLKF_SOC_DRA74 BIT(9) -#define CLKF_SOC_DRA76 BIT(10) +#define CLKF_SOC_NONSEC BIT(8) +#define CLKF_SOC_DRA72 BIT(9) +#define CLKF_SOC_DRA74 BIT(10) +#define CLKF_SOC_DRA76 BIT(11) #define CLK(dev, con, ck) \ { \ diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 2821f7cb1ca9..1e8ef96555ce 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -294,6 +294,7 @@ struct ti_clk_features { #define TI_CLK_DISABLE_CLKDM_CONTROL BIT(2) #define TI_CLK_ERRATA_I810 BIT(3) #define TI_CLK_CLKCTRL_COMPAT BIT(4) +#define TI_CLK_DEVICE_TYPE_GP BIT(5) void ti_clk_setup_features(struct ti_clk_features *features); const struct ti_clk_features *ti_clk_get_features(void); -- cgit v1.2.3-59-g8ed1b From 575d927c426b83f5f8d5809f46de177cceffe40c Mon Sep 17 00:00:00 2001 From: Patrick Havelange Date: Tue, 2 Apr 2019 15:30:53 +0900 Subject: LS1021A: dtsi: add ftm quad decoder entries Add the 4 Quadrature counters for this board. 
Reviewed-by: Esben Haabendal Signed-off-by: Patrick Havelange Signed-off-by: William Breathitt Gray Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/ls1021a.dtsi | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index b4f2723ecd86..b10ff5877b4c 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -446,6 +446,34 @@ status = "disabled"; }; + counter0: counter@29d0000 { + compatible = "fsl,ftm-quaddec"; + reg = <0x0 0x29d0000 0x0 0x10000>; + big-endian; + status = "disabled"; + }; + + counter1: counter@29e0000 { + compatible = "fsl,ftm-quaddec"; + reg = <0x0 0x29e0000 0x0 0x10000>; + big-endian; + status = "disabled"; + }; + + counter2: counter@29f0000 { + compatible = "fsl,ftm-quaddec"; + reg = <0x0 0x29f0000 0x0 0x10000>; + big-endian; + status = "disabled"; + }; + + counter3: counter@2a00000 { + compatible = "fsl,ftm-quaddec"; + reg = <0x0 0x2a00000 0x0 0x10000>; + big-endian; + status = "disabled"; + }; + gpio0: gpio@2300000 { compatible = "fsl,ls1021a-gpio", "fsl,qoriq-gpio"; reg = <0x0 0x2300000 0x0 0x10000>; -- cgit v1.2.3-59-g8ed1b From 90b6c5c73c6904ac200161fc38974d867f0535b0 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 25 Apr 2019 10:57:37 -0700 Subject: clk: Remove CLK_IS_BASIC clk flag This flag was historically used to indicate that a clk is a "basic" type of clk like a mux, divider, gate, etc. This never turned out to be very useful though because it was hard to cleanly split "basic" clks from other clks in a system. This one flag was a way for type introspection and it just didn't scale. If anything, it was used by the TI clk driver to indicate that a clk_hw wasn't contained in the SoC specific clk structure. We can get rid of this define now that TI is finding those clks a different way. 
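[ Illustration, not part of the patch: from a provider's point of view the conversion is mechanical -- registration stops or'ing in the type flag and passes only behavioural flags through. A minimal hedged sketch, clock name hypothetical: ]

    /* After the change: no CLK_IS_BASIC, only real behavioural flags. */
    struct clk_init_data init = {
            .name        = "example_clk",
            .ops         = &clk_fixed_rate_ops,
            .flags       = CLK_IGNORE_UNUSED,  /* was: ... | CLK_IS_BASIC */
            .num_parents = 0,
    };

Anything that previously keyed off CLK_IS_BASIC for type introspection needs an explicit check instead, as the TI conversion above does with omap2_clk_is_hw_omap().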
Cc: Tero Kristo Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Cc: Thierry Reding Cc: Kevin Hilman Cc: Cc: Acked-by: Thierry Reding Signed-off-by: Stephen Boyd --- arch/mips/alchemy/common/clock.c | 2 +- drivers/clk/clk-composite.c | 2 +- drivers/clk/clk-divider.c | 2 +- drivers/clk/clk-fixed-factor.c | 2 +- drivers/clk/clk-fixed-rate.c | 2 +- drivers/clk/clk-fractional-divider.c | 2 +- drivers/clk/clk-gate.c | 2 +- drivers/clk/clk-gpio.c | 2 +- drivers/clk/clk-mux.c | 2 +- drivers/clk/clk-pwm.c | 2 +- drivers/clk/clk.c | 1 - drivers/clk/mmp/clk-gate.c | 2 +- drivers/pwm/pwm-meson.c | 2 +- include/linux/clk-provider.h | 2 +- 14 files changed, 13 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/mips/alchemy/common/clock.c b/arch/mips/alchemy/common/clock.c index d129475fd40d..a95a894aceaf 100644 --- a/arch/mips/alchemy/common/clock.c +++ b/arch/mips/alchemy/common/clock.c @@ -160,7 +160,7 @@ static struct clk __init *alchemy_clk_setup_cpu(const char *parent_name, id.name = ALCHEMY_CPU_CLK; id.parent_names = &parent_name; id.num_parents = 1; - id.flags = CLK_IS_BASIC; + id.flags = 0; id.ops = &alchemy_clkops_cpu; h->init = &id; diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 46604214bba0..b06038b8f658 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -218,7 +218,7 @@ struct clk_hw *clk_hw_register_composite(struct device *dev, const char *name, return ERR_PTR(-ENOMEM); init.name = name; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = parent_names; init.num_parents = num_parents; hw = &composite->hw; diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index e5a17265cfaf..568e10a33ea4 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -475,7 +475,7 @@ static struct clk_hw *_register_divider(struct device *dev, const char *name, init.ops = &clk_divider_ro_ops; else init.ops = &clk_divider_ops; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = (parent_name ? &parent_name: NULL); init.num_parents = (parent_name ? 1 : 0); diff --git a/drivers/clk/clk-fixed-factor.c b/drivers/clk/clk-fixed-factor.c index 241b3f8c61a9..8aac2d1b6fea 100644 --- a/drivers/clk/clk-fixed-factor.c +++ b/drivers/clk/clk-fixed-factor.c @@ -84,7 +84,7 @@ struct clk_hw *clk_hw_register_fixed_factor(struct device *dev, init.name = name; init.ops = &clk_fixed_factor_ops; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = &parent_name; init.num_parents = 1; diff --git a/drivers/clk/clk-fixed-rate.c b/drivers/clk/clk-fixed-rate.c index 00ef4f5e53fe..a7e4aef7a376 100644 --- a/drivers/clk/clk-fixed-rate.c +++ b/drivers/clk/clk-fixed-rate.c @@ -68,7 +68,7 @@ struct clk_hw *clk_hw_register_fixed_rate_with_accuracy(struct device *dev, init.name = name; init.ops = &clk_fixed_rate_ops; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = (parent_name ? &parent_name: NULL); init.num_parents = (parent_name ? 1 : 0); diff --git a/drivers/clk/clk-fractional-divider.c b/drivers/clk/clk-fractional-divider.c index fdfe2e423d15..aa45dd257fe3 100644 --- a/drivers/clk/clk-fractional-divider.c +++ b/drivers/clk/clk-fractional-divider.c @@ -151,7 +151,7 @@ struct clk_hw *clk_hw_register_fractional_divider(struct device *dev, init.name = name; init.ops = &clk_fractional_divider_ops; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = parent_name ? 
&parent_name : NULL; init.num_parents = parent_name ? 1 : 0; diff --git a/drivers/clk/clk-gate.c b/drivers/clk/clk-gate.c index f05823cd9b21..f58a58d5d80a 100644 --- a/drivers/clk/clk-gate.c +++ b/drivers/clk/clk-gate.c @@ -142,7 +142,7 @@ struct clk_hw *clk_hw_register_gate(struct device *dev, const char *name, init.name = name; init.ops = &clk_gate_ops; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = parent_name ? &parent_name : NULL; init.num_parents = parent_name ? 1 : 0; diff --git a/drivers/clk/clk-gpio.c b/drivers/clk/clk-gpio.c index c2f07f0d077c..9d930edd6516 100644 --- a/drivers/clk/clk-gpio.c +++ b/drivers/clk/clk-gpio.c @@ -137,7 +137,7 @@ static struct clk_hw *clk_register_gpio(struct device *dev, const char *name, init.name = name; init.ops = clk_gpio_ops; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = parent_names; init.num_parents = num_parents; diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c index 2ad2df2e8909..7d60d690b7f2 100644 --- a/drivers/clk/clk-mux.c +++ b/drivers/clk/clk-mux.c @@ -159,7 +159,7 @@ struct clk_hw *clk_hw_register_mux_table(struct device *dev, const char *name, init.ops = &clk_mux_ro_ops; else init.ops = &clk_mux_ops; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = parent_names; init.num_parents = num_parents; diff --git a/drivers/clk/clk-pwm.c b/drivers/clk/clk-pwm.c index 8cb9d117fdbf..02b472a1f9b0 100644 --- a/drivers/clk/clk-pwm.c +++ b/drivers/clk/clk-pwm.c @@ -101,7 +101,7 @@ static int clk_pwm_probe(struct platform_device *pdev) init.name = clk_name; init.ops = &clk_pwm_ops; - init.flags = CLK_IS_BASIC; + init.flags = 0; init.num_parents = 0; clk_pwm->pwm = pwm; diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 96053a96fe2f..7279573eefd5 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2850,7 +2850,6 @@ static const struct { ENTRY(CLK_SET_PARENT_GATE), ENTRY(CLK_SET_RATE_PARENT), ENTRY(CLK_IGNORE_UNUSED), - ENTRY(CLK_IS_BASIC), ENTRY(CLK_GET_RATE_NOCACHE), ENTRY(CLK_SET_RATE_NO_REPARENT), ENTRY(CLK_GET_ACCURACY_NOCACHE), diff --git a/drivers/clk/mmp/clk-gate.c b/drivers/clk/mmp/clk-gate.c index 7355595c42e2..1755916ddef2 100644 --- a/drivers/clk/mmp/clk-gate.c +++ b/drivers/clk/mmp/clk-gate.c @@ -108,7 +108,7 @@ struct clk *mmp_clk_register_gate(struct device *dev, const char *name, init.name = name; init.ops = &mmp_clk_gate_ops; - init.flags = flags | CLK_IS_BASIC; + init.flags = flags; init.parent_names = (parent_name ? &parent_name : NULL); init.num_parents = (parent_name ? 
1 : 0); diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c index c1ed641b3e26..4ae5d774443e 100644 --- a/drivers/pwm/pwm-meson.c +++ b/drivers/pwm/pwm-meson.c @@ -470,7 +470,7 @@ static int meson_pwm_init_channels(struct meson_pwm *meson, init.name = name; init.ops = &clk_mux_ops; - init.flags = CLK_IS_BASIC; + init.flags = 0; init.parent_names = meson->data->parent_names; init.num_parents = meson->data->num_parents; diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index b7cf80a71293..9245a377295b 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -24,7 +24,7 @@ #define CLK_SET_RATE_PARENT BIT(2) /* propagate rate change up one level */ #define CLK_IGNORE_UNUSED BIT(3) /* do not gate even if unused */ /* unused */ -#define CLK_IS_BASIC BIT(5) /* Basic clk, can't do a to_clk_foo() */ + /* unused */ #define CLK_GET_RATE_NOCACHE BIT(6) /* do not use the cached clk rate */ #define CLK_SET_RATE_NO_REPARENT BIT(7) /* don't re-parent on rate change */ #define CLK_GET_ACCURACY_NOCACHE BIT(8) /* do not use the cached clk accuracy */ -- cgit v1.2.3-59-g8ed1b From 8968c67a82ab7501bc3b9439c3624a49b42fe54c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Apr 2019 21:48:21 +0200 Subject: bpf, arm64: remove prefetch insn in xadd mapping Prefetch-with-intent-to-write is currently part of the XADD mapping in the AArch64 JIT and follows the kernel's implementation of atomic_add. This may interfere with other threads executing the LDXR/STXR loop, leading to potential starvation and fairness issues. Drop the optional prefetch instruction. Fixes: 85f68fe89832 ("bpf, arm64: implement jiting of BPF_XADD") Reported-by: Will Deacon Signed-off-by: Daniel Borkmann Acked-by: Jean-Philippe Brucker Acked-by: Will Deacon Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit.h | 6 ------ arch/arm64/net/bpf_jit_comp.c | 1 - 2 files changed, 7 deletions(-) (limited to 'arch') diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index 783de51a6c4e..6c881659ee8a 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -100,12 +100,6 @@ #define A64_STXR(sf, Rt, Rn, Rs) \ A64_LSX(sf, Rt, Rn, Rs, STORE_EX) -/* Prefetch */ -#define A64_PRFM(Rn, type, target, policy) \ - aarch64_insn_gen_prefetch(Rn, AARCH64_INSN_PRFM_TYPE_##type, \ - AARCH64_INSN_PRFM_TARGET_##target, \ - AARCH64_INSN_PRFM_POLICY_##policy) - /* Add/subtract (immediate) */ #define A64_ADDSUB_IMM(sf, Rd, Rn, imm12, type) \ aarch64_insn_gen_add_sub_imm(Rd, Rn, imm12, \ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index aaddc0217e73..a1420626fca2 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -762,7 +762,6 @@ emit_cond_jmp: case BPF_STX | BPF_XADD | BPF_DW: emit_a64_mov_i(1, tmp, off, ctx); emit(A64_ADD(1, tmp, tmp, dst), ctx); - emit(A64_PRFM(tmp, PST, L1, STRM), ctx); emit(A64_LDXR(isdw, tmp2, tmp), ctx); emit(A64_ADD(isdw, tmp2, tmp2, src), ctx); emit(A64_STXR(isdw, tmp2, tmp, tmp3), ctx); -- cgit v1.2.3-59-g8ed1b From 34b8ab091f9ef57a2bb3c8c8359a0a03a8abf2f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Apr 2019 21:48:22 +0200 Subject: bpf, arm64: use more scalable stadd over ldxr / stxr loop in xadd Since the ARMv8.1 supplement introduced LSE atomic instructions back in 2016, let's add support for STADD and use it in favor of the LDXR / STXR loop for the XADD mapping if available.
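[ Illustration, not part of the patch: the same codegen choice is visible from plain C. With LSE available the compiler emits a single LDADD/STADD for a relaxed fetch-add whose result is unused; without it, an LDXR/STXR retry loop: ]

    /* arm64: gcc -O2 -march=armv8.1-a emits stadd here; with plain
     * -march=armv8-a it falls back to an ldxr/stxr loop. */
    void xadd(unsigned long *ptr, unsigned long val)
    {
            __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED);
    }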
STADD is encoded as an alias for LDADD with XZR as the destination register, therefore add LDADD to the instruction encoder along with STADD as special case and use it in the JIT for CPUs that advertise LSE atomics in CPUID register. If immediate offset in the BPF XADD insn is 0, then use dst register directly instead of temporary one. Signed-off-by: Daniel Borkmann Acked-by: Jean-Philippe Brucker Acked-by: Will Deacon Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/insn.h | 8 ++++++++ arch/arm64/kernel/insn.c | 40 ++++++++++++++++++++++++++++++++++++++++ arch/arm64/net/bpf_jit.h | 4 ++++ arch/arm64/net/bpf_jit_comp.c | 28 +++++++++++++++++++--------- 4 files changed, 71 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 9c01f04db64d..ec894de0ed4e 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -277,6 +277,7 @@ __AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0x90000000) __AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) __AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) +__AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0xB8200000) __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) @@ -394,6 +395,13 @@ u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, enum aarch64_insn_register state, enum aarch64_insn_size_type size, enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size); +u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size); u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, enum aarch64_insn_register src, int imm, enum aarch64_insn_variant variant, diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 7820a4a688fa..9e2b5882cdeb 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -734,6 +734,46 @@ u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, state); } +u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, + enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size) +{ + u32 insn = aarch64_insn_get_ldadd_value(); + + switch (size) { + case AARCH64_INSN_SIZE_32: + case AARCH64_INSN_SIZE_64: + break; + default: + pr_err("%s: unimplemented size encoding %d\n", __func__, size); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + result); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + address); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, + value); +} + +u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, + enum aarch64_insn_register value, + enum aarch64_insn_size_type size) +{ + /* + * STADD is simply encoded as an alias for LDADD with XZR as + * the destination register. 
+ */ + return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address, + value, size); +} + static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, enum aarch64_insn_prfm_target target, enum aarch64_insn_prfm_policy policy, diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index 6c881659ee8a..76606e87233f 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -100,6 +100,10 @@ #define A64_STXR(sf, Rt, Rn, Rs) \ A64_LSX(sf, Rt, Rn, Rs, STORE_EX) +/* LSE atomics */ +#define A64_STADD(sf, Rn, Rs) \ + aarch64_insn_gen_stadd(Rn, Rs, A64_SIZE(sf)) + /* Add/subtract (immediate) */ #define A64_ADDSUB_IMM(sf, Rd, Rn, imm12, type) \ aarch64_insn_gen_add_sub_imm(Rd, Rn, imm12, \ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index a1420626fca2..df845cee438e 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -365,7 +365,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, const bool is64 = BPF_CLASS(code) == BPF_ALU64 || BPF_CLASS(code) == BPF_JMP; const bool isdw = BPF_SIZE(code) == BPF_DW; - u8 jmp_cond; + u8 jmp_cond, reg; s32 jmp_offset; #define check_imm(bits, imm) do { \ @@ -756,18 +756,28 @@ emit_cond_jmp: break; } break; + /* STX XADD: lock *(u32 *)(dst + off) += src */ case BPF_STX | BPF_XADD | BPF_W: /* STX XADD: lock *(u64 *)(dst + off) += src */ case BPF_STX | BPF_XADD | BPF_DW: - emit_a64_mov_i(1, tmp, off, ctx); - emit(A64_ADD(1, tmp, tmp, dst), ctx); - emit(A64_LDXR(isdw, tmp2, tmp), ctx); - emit(A64_ADD(isdw, tmp2, tmp2, src), ctx); - emit(A64_STXR(isdw, tmp2, tmp, tmp3), ctx); - jmp_offset = -3; - check_imm19(jmp_offset); - emit(A64_CBNZ(0, tmp3, jmp_offset), ctx); + if (!off) { + reg = dst; + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_ADD(1, tmp, tmp, dst), ctx); + reg = tmp; + } + if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) { + emit(A64_STADD(isdw, reg, src), ctx); + } else { + emit(A64_LDXR(isdw, tmp2, reg), ctx); + emit(A64_ADD(isdw, tmp2, tmp2, src), ctx); + emit(A64_STXR(isdw, tmp2, reg, tmp3), ctx); + jmp_offset = -3; + check_imm19(jmp_offset); + emit(A64_CBNZ(0, tmp3, jmp_offset), ctx); + } break; default: -- cgit v1.2.3-59-g8ed1b From f341d89790b0b7f99ca7835e0cf7de1026ceae39 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 23 Apr 2019 16:10:17 +0100 Subject: powerpc/mm: fix spelling mistake "Outisde" -> "Outside" There are several identical spelling mistakes in warning messages, fix these. 
Signed-off-by: Colin Ian King Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 4 ++-- arch/powerpc/mm/pgtable-hash64.c | 2 +- arch/powerpc/mm/pgtable-radix.c | 6 +++--- arch/powerpc/mm/pgtable_64.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f727197de713..6eb89643ce58 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -784,7 +784,7 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid int rc; if (end >= H_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } @@ -932,7 +932,7 @@ static void __init htab_initialize(void) base, size, prot); if ((base + size) >= H_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); continue; } diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index d934de4e2b3a..097a3b3538b1 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -115,7 +115,7 @@ int __meminit hash__vmemmap_create_mapping(unsigned long start, int rc; if ((start + page_size) >= H_VMEMMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index e6d5065b0bc8..fcb0169e2d32 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -341,7 +341,7 @@ void __init radix_init_pgtable(void) */ if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); continue; } @@ -902,7 +902,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { if (end >= RADIX_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } @@ -934,7 +934,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, int ret; if ((start + page_size) >= RADIX_VMEMMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 72f58c076e26..95ad2a09501c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -122,7 +122,7 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_ return NULL; if ((ea + size) >= (void *)IOREMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return NULL; } -- cgit v1.2.3-59-g8ed1b From b2d3b5ee66f2a04a918cc043cec0c9ed3de58f40 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Tue, 2 Oct 2018 10:35:59 -0500 Subject: powerpc/pseries: Track LMB nid instead of using device tree When removing memory we need to remove the memory from the node it was added to instead of looking up, in the device tree, the node it should be in. During testing we have seen scenarios where the affinity for an LMB changes due to a partition migration or PRRN event. In these cases the node the LMB exists in may not match the node the device tree indicates it belongs in. This can lead to a system crash when trying to DLPAR remove the LMB after a migration or PRRN event.
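[ In outline, the fix swaps a remove-time lookup for the nid cached at add time; simplified from the hunks below: ]

    /* Before: recompute the node from the address at remove time. */
    nid = memory_add_physaddr_to_nid(lmb->base_addr);
    __remove_memory(nid, lmb->base_addr, block_sz);

    /* After: use the node recorded when the LMB was added. */
    __remove_memory(lmb->nid, lmb->base_addr, block_sz);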
The current code looks up the node in the device tree to remove the LMB from, the crash occurs when we try to offline this node and it does not have any data, i.e. node_data[nid] == NULL. 36:mon> e cpu 0x36: Vector: 300 (Data Access) at [c0000001828b7810] pc: c00000000036d08c: try_offline_node+0x2c/0x1b0 lr: c0000000003a14ec: remove_memory+0xbc/0x110 sp: c0000001828b7a90 msr: 800000000280b033 dar: 9a28 dsisr: 40000000 current = 0xc0000006329c4c80 paca = 0xc000000007a55200 softe: 0 irq_happened: 0x01 pid = 76926, comm = kworker/u320:3 36:mon> t [link register ] c0000000003a14ec remove_memory+0xbc/0x110 [c0000001828b7a90] c00000000006a1cc arch_remove_memory+0x9c/0xd0 (unreliable) [c0000001828b7ad0] c0000000003a14e0 remove_memory+0xb0/0x110 [c0000001828b7b20] c0000000000c7db4 dlpar_remove_lmb+0x94/0x160 [c0000001828b7b60] c0000000000c8ef8 dlpar_memory+0x7e8/0xd10 [c0000001828b7bf0] c0000000000bf828 handle_dlpar_errorlog+0xf8/0x160 [c0000001828b7c60] c0000000000bf8cc pseries_hp_work_fn+0x3c/0xa0 [c0000001828b7c90] c000000000128cd8 process_one_work+0x298/0x5a0 [c0000001828b7d20] c000000000129068 worker_thread+0x88/0x620 [c0000001828b7dc0] c00000000013223c kthread+0x1ac/0x1c0 [c0000001828b7e30] c00000000000b45c ret_from_kernel_thread+0x5c/0x80 To resolve this we need to track the node a LMB belongs to when it is added to the system so we can remove it from that node instead of the node that the device tree indicates it should belong to. Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/drmem.h | 21 +++++++++++++++++++++ arch/powerpc/mm/drmem.c | 6 +++++- arch/powerpc/platforms/pseries/hotplug-memory.c | 17 ++++++++--------- 3 files changed, 34 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index 7c1d8e74b25d..7f3279b014db 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -17,6 +17,9 @@ struct drmem_lmb { u32 drc_index; u32 aa_index; u32 flags; +#ifdef CONFIG_MEMORY_HOTPLUG + int nid; +#endif }; struct drmem_lmb_info { @@ -104,4 +107,22 @@ static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) lmb->aa_index = 0xffffffff; } +#ifdef CONFIG_MEMORY_HOTPLUG +static inline void lmb_set_nid(struct drmem_lmb *lmb) +{ + lmb->nid = memory_add_physaddr_to_nid(lmb->base_addr); +} +static inline void lmb_clear_nid(struct drmem_lmb *lmb) +{ + lmb->nid = -1; +} +#else +static inline void lmb_set_nid(struct drmem_lmb *lmb) +{ +} +static inline void lmb_clear_nid(struct drmem_lmb *lmb) +{ +} +#endif + #endif /* _ASM_POWERPC_LMB_H */ diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 3f1803672c9b..641891df2046 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -366,8 +366,10 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop) if (!drmem_info->lmbs) return; - for_each_drmem_lmb(lmb) + for_each_drmem_lmb(lmb) { read_drconf_v1_cell(lmb, &prop); + lmb_set_nid(lmb); + } } static void __init init_drmem_v2_lmbs(const __be32 *prop) @@ -412,6 +414,8 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop) lmb->aa_index = dr_cell.aa_index; lmb->flags = dr_cell.flags; + + lmb_set_nid(lmb); } } } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index d291b618a559..47087832f8b2 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -379,7 +379,7 @@ static int 
dlpar_add_lmb(struct drmem_lmb *); static int dlpar_remove_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int nid, rc; + int rc; if (!lmb_is_removable(lmb)) return -EINVAL; @@ -389,14 +389,14 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) return rc; block_sz = pseries_memory_block_size(); - nid = memory_add_physaddr_to_nid(lmb->base_addr); - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->nid, lmb->base_addr, block_sz); /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); + lmb_clear_nid(lmb); lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; @@ -653,7 +653,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) static int dlpar_add_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int nid, rc; + int rc; if (lmb->flags & DRCONF_MEM_ASSIGNED) return -EINVAL; @@ -664,13 +664,11 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) return rc; } + lmb_set_nid(lmb); block_sz = memory_block_size_bytes(); - /* Find the node id for this address */ - nid = memory_add_physaddr_to_nid(lmb->base_addr); - /* Add the memory */ - rc = __add_memory(nid, lmb->base_addr, block_sz); + rc = __add_memory(lmb->nid, lmb->base_addr, block_sz); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; @@ -678,8 +676,9 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->nid, lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); + lmb_clear_nid(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; } -- cgit v1.2.3-59-g8ed1b From 51c51a48de4bfda78d381616e553445a76b1d407 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 24 Apr 2019 21:14:25 +0200 Subject: powerpc/powernv/npu: Use pci_dev_id() helper Use new helper pci_dev_id() to simplify the code. Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Reviewed-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/npu-dma.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index dc23d9d2a7d9..495550432f3d 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -1213,9 +1213,8 @@ int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, * Currently we only support radix and non-zero LPCR only makes sense * for hash tables so skiboot expects the LPCR parameter to be a zero. 
*/ - ret = opal_npu_map_lpar(nphb->opal_id, - PCI_DEVID(gpdev->bus->number, gpdev->devfn), lparid, - 0 /* LPCR bits */); + ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid, + 0 /* LPCR bits */); if (ret) { dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret); return ret; @@ -1224,7 +1223,7 @@ int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n", nphb->opal_id, msr); ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr, - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + pci_dev_id(gpdev)); if (ret < 0) dev_err(&gpdev->dev, "Failed to init context: %d\n", ret); else @@ -1258,7 +1257,7 @@ int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev) dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n", nphb->opal_id); ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/, - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + pci_dev_id(gpdev)); if (ret < 0) { dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret); return ret; @@ -1266,9 +1265,8 @@ int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev) /* Set LPID to 0 anyway, just to be safe */ dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id); - ret = opal_npu_map_lpar(nphb->opal_id, - PCI_DEVID(gpdev->bus->number, gpdev->devfn), 0 /*LPID*/, - 0 /* LPCR bits */); + ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/, + 0 /* LPCR bits */); if (ret) dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret); -- cgit v1.2.3-59-g8ed1b From 7ae3f6e130e8dc6188b59e3b4ebc2f16e9c8d053 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 9 Apr 2019 14:40:05 +1000 Subject: powerpc/watchdog: Use hrtimers for per-CPU heartbeat Using a jiffies timer creates a dependency on the tick_do_timer_cpu incrementing jiffies. If that CPU has locked up and jiffies is not incrementing, the watchdog heartbeat timer for all CPUs stops and creates false positives and confusing warnings on local CPUs, and also causes the SMP detector to stop, so the root cause is never detected. Fix this by using hrtimer based timers for the watchdog heartbeat, like the generic kernel hardlockup detector. Cc: Gautham R. Shenoy Reported-by: Ravikumar Bangoria Signed-off-by: Nicholas Piggin Tested-by: Ravi Bangoria Reported-by: Ravi Bangoria Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/watchdog.c | 81 +++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 3c6ab22a0c4e..af3c15a1d41e 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -77,7 +77,7 @@ static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */ static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */ -static DEFINE_PER_CPU(struct timer_list, wd_timer); +static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer); static DEFINE_PER_CPU(u64, wd_timer_tb); /* SMP checker bits */ @@ -293,21 +293,21 @@ out: nmi_exit(); } -static void wd_timer_reset(unsigned int cpu, struct timer_list *t) -{ - t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms); - if (wd_timer_period_ms > 1000) - t->expires = __round_jiffies_up(t->expires, cpu); - add_timer_on(t, cpu); -} - -static void wd_timer_fn(struct timer_list *t) +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { int cpu = smp_processor_id(); + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return HRTIMER_NORESTART; + + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) + return HRTIMER_NORESTART; + watchdog_timer_interrupt(cpu); - wd_timer_reset(cpu, t); + hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms)); + + return HRTIMER_RESTART; } void arch_touch_nmi_watchdog(void) @@ -323,37 +323,22 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); -static void start_watchdog_timer_on(unsigned int cpu) -{ - struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); - - per_cpu(wd_timer_tb, cpu) = get_tb(); - - timer_setup(t, wd_timer_fn, TIMER_PINNED); - wd_timer_reset(cpu, t); -} - -static void stop_watchdog_timer_on(unsigned int cpu) -{ - struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); - - del_timer_sync(t); -} - -static int start_wd_on_cpu(unsigned int cpu) +static void start_watchdog(void *arg) { + struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer); + int cpu = smp_processor_id(); unsigned long flags; if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) { WARN_ON(1); - return 0; + return; } if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - return 0; + return; if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) - return 0; + return; wd_smp_lock(&flags); cpumask_set_cpu(cpu, &wd_cpus_enabled); @@ -363,27 +348,40 @@ static int start_wd_on_cpu(unsigned int cpu) } wd_smp_unlock(&flags); - start_watchdog_timer_on(cpu); + *this_cpu_ptr(&wd_timer_tb) = get_tb(); - return 0; + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms), + HRTIMER_MODE_REL_PINNED); } -static int stop_wd_on_cpu(unsigned int cpu) +static int start_watchdog_on_cpu(unsigned int cpu) { + return smp_call_function_single(cpu, start_watchdog, NULL, true); +} + +static void stop_watchdog(void *arg) +{ + struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer); + int cpu = smp_processor_id(); unsigned long flags; if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) - return 0; /* Can happen in CPU unplug case */ + return; /* Can happen in CPU unplug case */ - stop_watchdog_timer_on(cpu); + hrtimer_cancel(hrtimer); wd_smp_lock(&flags); cpumask_clear_cpu(cpu, &wd_cpus_enabled); wd_smp_unlock(&flags); wd_smp_clear_cpu_pending(cpu, get_tb()); +} - return 0; +static int stop_watchdog_on_cpu(unsigned int cpu) +{ + return 
smp_call_function_single(cpu, stop_watchdog, NULL, true); } static void watchdog_calc_timeouts(void) @@ -402,7 +400,7 @@ void watchdog_nmi_stop(void) int cpu; for_each_cpu(cpu, &wd_cpus_enabled) - stop_wd_on_cpu(cpu); + stop_watchdog_on_cpu(cpu); } void watchdog_nmi_start(void) @@ -411,7 +409,7 @@ watchdog_calc_timeouts(); for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) - start_wd_on_cpu(cpu); + start_watchdog_on_cpu(cpu); } /* @@ -423,7 +421,8 @@ int __init watchdog_nmi_probe(void) err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", - start_wd_on_cpu, stop_wd_on_cpu); + start_watchdog_on_cpu, + stop_watchdog_on_cpu); if (err < 0) { pr_warn("could not be initialized"); return err; } -- cgit v1.2.3-59-g8ed1b From 10d91611f426d4bafd2a83d966c36da811b2f7ad Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 13 Apr 2019 00:30:52 +1000 Subject: powerpc/64s: Reimplement book3s idle code in C Reimplement Book3S idle code in C, moving POWER7/8/9 implementation specific HV idle code to the powernv platform code. Book3S assembly stubs are kept in common code and used only to save the stack frame and non-volatile GPRs before executing architected idle instructions, and to restore the stack and reload GPRs before returning to C after waking from idle. The complex logic dealing with threads and subcores, locking, SPRs, HMIs, timebase resync, etc., is all done in C, which makes it more maintainable. This is not a strict translation to C code; there are some significant differences: - Idle wakeup no longer uses the ->cpu_restore call to reinit SPRs, but saves and restores them itself. - The optimisation where EC=ESL=0 idle modes did not have to save GPRs or change MSR is restored, because it's now simple to do. ESL=1 sleeps that do not lose GPRs can use this optimization too. - KVM secondary entry and cede is now more of a call/return style rather than branchy. nap_state_lost is not required because KVM always returns via the NVGPR restoring path. - KVM secondary wakeup from offline sequence is moved entirely into the offline wakeup, which avoids a hwsync in the normal idle wakeup path. Performance, measured with context switch ping-pong on different threads or cores, is possibly improved by a small amount, 1-3% depending on stop state and core vs thread test for shallow states. For deep states it's in the noise compared with other latencies. KVM improvements: - Idle sleepers now always return to caller rather than branch out to KVM first. - This allows optimisations like very fast return to caller when no state has been lost. - KVM no longer requires nap_state_lost because it controls NVGPR save/restore itself on the way in and out. - The heavy idle wakeup KVM request check can be moved out of the normal host idle code and into the not-performance-critical offline code. - KVM nap code now returns from where it is called, which makes the flow a bit easier to follow. Reviewed-by: Gautham R.
Shenoy Signed-off-by: Nicholas Piggin [mpe: Squash the KVM changes in] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cpuidle.h | 19 +- arch/powerpc/include/asm/paca.h | 40 +- arch/powerpc/include/asm/processor.h | 9 +- arch/powerpc/include/asm/reg.h | 8 +- arch/powerpc/kernel/asm-offsets.c | 18 - arch/powerpc/kernel/exceptions-64s.S | 23 +- arch/powerpc/kernel/idle_book3s.S | 1060 ++++-------------------------- arch/powerpc/kernel/setup-common.c | 4 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 118 ++-- arch/powerpc/platforms/powernv/idle.c | 862 +++++++++++++++++++----- arch/powerpc/platforms/powernv/subcore.c | 2 +- arch/powerpc/xmon/xmon.c | 24 +- 12 files changed, 969 insertions(+), 1218 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 43e5f31fe64d..9844b3ded187 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -27,10 +27,11 @@ * the THREAD_WINKLE_BITS are set, which indicate which threads have not * yet woken from the winkle state. */ -#define PNV_CORE_IDLE_LOCK_BIT 0x10000000 +#define NR_PNV_CORE_IDLE_LOCK_BIT 28 +#define PNV_CORE_IDLE_LOCK_BIT (1ULL << NR_PNV_CORE_IDLE_LOCK_BIT) +#define PNV_CORE_IDLE_WINKLE_COUNT_SHIFT 16 #define PNV_CORE_IDLE_WINKLE_COUNT 0x00010000 -#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT 0x00080000 #define PNV_CORE_IDLE_WINKLE_COUNT_BITS 0x000F0000 #define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT 8 #define PNV_CORE_IDLE_THREAD_WINKLE_BITS 0x0000FF00 @@ -68,16 +69,6 @@ #define ERR_DEEP_STATE_ESL_MISMATCH -2 #ifndef __ASSEMBLY__ -/* Additional SPRs that need to be saved/restored during stop */ -struct stop_sprs { - u64 pid; - u64 ldbar; - u64 fscr; - u64 hfscr; - u64 mmcr1; - u64 mmcr2; - u64 mmcra; -}; #define PNV_IDLE_NAME_LEN 16 struct pnv_idle_states_t { @@ -92,10 +83,6 @@ struct pnv_idle_states_t { extern struct pnv_idle_states_t *pnv_idle_states; extern int nr_pnv_idle_states; -extern u32 pnv_fastsleep_workaround_at_entry[]; -extern u32 pnv_fastsleep_workaround_at_exit[]; - -extern u64 pnv_first_deep_stop_state; unsigned long pnv_cpu_offline(unsigned int cpu); int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index e843bc5d1a0f..245d11a71784 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -173,7 +173,6 @@ struct paca_struct { u8 irq_happened; /* irq happened while soft-disabled */ u8 io_sync; /* writel() needs spin_unlock sync */ u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ - u8 nap_state_lost; /* NV GPR values lost in power7_idle */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE u8 pmcregs_in_use; /* pseries puts this in lppaca */ #endif @@ -183,23 +182,28 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_POWERNV - /* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */ - u32 *core_idle_state_ptr; - u8 thread_idle_state; /* PNV_THREAD_RUNNING/NAP/SLEEP */ - /* Mask to indicate thread id in core */ - u8 thread_mask; - /* Mask to denote subcore sibling threads */ - u8 subcore_sibling_mask; - /* Flag to request this thread not to stop */ - atomic_t dont_stop; - /* The PSSCR value that the kernel requested before going to stop */ - u64 requested_psscr; - - /* - * Save area for additional SPRs that need to be - * saved/restored during cpuidle stop. 
- */ - struct stop_sprs stop_sprs; + /* PowerNV idle fields */ + /* PNV_CORE_IDLE_* bits, all siblings work on thread 0 paca */ + unsigned long idle_state; + union { + /* P7/P8 specific fields */ + struct { + /* PNV_THREAD_RUNNING/NAP/SLEEP */ + u8 thread_idle_state; + /* Mask to denote subcore sibling threads */ + u8 subcore_sibling_mask; + }; + + /* P9 specific fields */ + struct { +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* The PSSCR value that the kernel requested before going to stop */ + u64 requested_psscr; + /* Flag to request this thread not to stop */ + atomic_t dont_stop; +#endif + }; + }; #endif #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 3351bcf42f2d..3120cca72e1f 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -411,14 +411,17 @@ static inline unsigned long get_clean_sp(unsigned long sp, int is_32) } #endif +/* asm stubs */ +extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val); +extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val); +extern unsigned long isa206_idle_insn_mayloss(unsigned long type); + extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ -extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/ + extern void power7_idle_type(unsigned long type); -extern unsigned long power9_idle_stop(unsigned long psscr_val); -extern unsigned long power9_offline_stop(unsigned long psscr_val); extern void power9_idle_type(unsigned long stop_psscr_val, unsigned long stop_psscr_mask); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c5b2aff0ce8e..10caa145f98b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -168,6 +168,7 @@ #define PSSCR_ESL 0x00200000 /* Enable State Loss */ #define PSSCR_SD 0x00400000 /* Status Disable */ #define PSSCR_PLS 0xf000000000000000 /* Power-saving Level Status */ +#define PSSCR_PLS_SHIFT 60 #define PSSCR_GUEST_VIS 0xf0000000000003ffUL /* Guest-visible PSSCR fields */ #define PSSCR_FAKE_SUSPEND 0x00000400 /* Fake-suspend bit (P9 DD2.2) */ #define PSSCR_FAKE_SUSPEND_LG 10 /* Fake-suspend bit position */ @@ -758,10 +759,9 @@ #define SRR1_WAKERESET 0x00100000 /* System reset */ #define SRR1_WAKEHDBELL 0x000c0000 /* Hypervisor doorbell on P8 */ #define SRR1_WAKESTATE 0x00030000 /* Powersave exit mask [46:47] */ -#define SRR1_WS_DEEPEST 0x00030000 /* Some resources not maintained, - * may not be recoverable */ -#define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */ -#define SRR1_WS_DEEP 0x00010000 /* All resources maintained */ +#define SRR1_WS_HVLOSS 0x00030000 /* HV resources not maintained */ +#define SRR1_WS_GPRLOSS 0x00020000 /* GPRs not maintained */ +#define SRR1_WS_NOLOSS 0x00010000 /* All resources maintained */ #define SRR1_PROGTM 0x00200000 /* TM Bad Thing */ #define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ #define SRR1_PROGILL 0x00080000 /* Illegal instruction */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 86a61e5f8285..83ad99f9f05d 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -268,7 +268,6 @@ int main(void) OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime); OFFSET(ACCOUNT_SYSTEM_TIME, paca_struct, accounting.stime); OFFSET(PACA_TRAP_SAVE, 
paca_struct, trap_save); - OFFSET(PACA_NAPSTATELOST, paca_struct, nap_state_lost); OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso); #else /* CONFIG_PPC64 */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE @@ -766,23 +765,6 @@ int main(void) OFFSET(VCPU_TIMING_LAST_ENTER_TBL, kvm_vcpu, arch.timing_last_enter.tv32.tbl); #endif -#ifdef CONFIG_PPC_POWERNV - OFFSET(PACA_CORE_IDLE_STATE_PTR, paca_struct, core_idle_state_ptr); - OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state); - OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); - OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); - OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); - OFFSET(PACA_DONT_STOP, paca_struct, dont_stop); -#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) - STOP_SPR(STOP_PID, pid); - STOP_SPR(STOP_LDBAR, ldbar); - STOP_SPR(STOP_FSCR, fscr); - STOP_SPR(STOP_HFSCR, hfscr); - STOP_SPR(STOP_MMCR1, mmcr1); - STOP_SPR(STOP_MMCR2, mmcr2); - STOP_SPR(STOP_MMCRA, mmcra); -#endif - DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a5b8fbae56a0..6247b5bbfa5c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -120,7 +120,9 @@ EXC_VIRT_NONE(0x4000, 0x100) mfspr r10,SPRN_SRR1 ; \ rlwinm. r10,r10,47-31,30,31 ; \ beq- 1f ; \ - cmpwi cr3,r10,2 ; \ + cmpwi cr1,r10,2 ; \ + mfspr r3,SPRN_SRR1 ; \ + bltlr cr1 ; /* no state loss, return to idle caller */ \ BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ KVMTEST_PR(n) ; \ @@ -144,8 +146,11 @@ TRAMP_KVM(PACA_EXNMI, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) - mfspr r12,SPRN_SRR1 - b pnv_powersave_wakeup + /* + * This must be a direct branch (without linker branch stub) because + * we can not use TOC at this point as r2 may not be restored yet. + */ + b idle_return_gpr_loss #endif /* @@ -427,17 +432,17 @@ EXC_COMMON_BEGIN(machine_check_idle_common) * Then decrement MCE nesting after finishing with the stack. */ ld r3,_MSR(r1) + ld r4,_LINK(r1) lhz r11,PACA_IN_MCE(r13) subi r11,r11,1 sth r11,PACA_IN_MCE(r13) - /* Turn off the RI bit because SRR1 is used by idle wakeup code. */ - /* Recoverability could be improved by reducing the use of SRR1. */ - li r11,0 - mtmsrd r11,1 - - b pnv_powersave_wakeup_mce + mtlr r4 + rlwinm r10,r3,47-31,30,31 + cmpwi cr1,r10,2 + bltlr cr1 /* no state loss, return to idle caller */ + b idle_return_gpr_loss #endif /* * Handle machine check early in real mode. We come here with diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 7f5ac2e8581b..2dfbd5d5b932 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -1,956 +1,188 @@ /* - * This file contains idle entry/exit functions for POWER7, - * POWER8 and POWER9 CPUs. + * Copyright 2018, IBM Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * This file contains general idle entry/exit functions to save + * and restore stack and NVGPRs which allows C code to call idle + * states that lose GPRs, and it will return transparently with + * SRR1 wakeup reason return value. 
+ * + * The platform / CPU caller must ensure SPRs and any other non-GPR + * state is saved and restored correctly, handle KVM, interrupts, etc. */ -#include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include -#include -#include -#include -#include -#include - -#undef DEBUG - -/* - * Use unused space in the interrupt stack to save and restore - * registers for winkle support. - */ -#define _MMCR0 GPR0 -#define _SDR1 GPR3 -#define _PTCR GPR3 -#define _RPR GPR4 -#define _SPURR GPR5 -#define _PURR GPR6 -#define _TSCR GPR7 -#define _DSCR GPR8 -#define _AMOR GPR9 -#define _WORT GPR10 -#define _WORC GPR11 -#define _LPCR GPR12 - -#define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16 - .text - -/* - * Used by threads before entering deep idle states. Saves SPRs - * in interrupt stack frame - */ -save_sprs_to_stack: - /* - * Note all register i.e per-core, per-subcore or per-thread is saved - * here since any thread in the core might wake up first - */ -BEGIN_FTR_SECTION - /* - * Note - SDR1 is dropped in Power ISA v3. Hence not restoring - * SDR1 here - */ - mfspr r3,SPRN_PTCR - std r3,_PTCR(r1) - mfspr r3,SPRN_LPCR - std r3,_LPCR(r1) -FTR_SECTION_ELSE - mfspr r3,SPRN_SDR1 - std r3,_SDR1(r1) -ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) - mfspr r3,SPRN_RPR - std r3,_RPR(r1) - mfspr r3,SPRN_SPURR - std r3,_SPURR(r1) - mfspr r3,SPRN_PURR - std r3,_PURR(r1) - mfspr r3,SPRN_TSCR - std r3,_TSCR(r1) - mfspr r3,SPRN_DSCR - std r3,_DSCR(r1) - mfspr r3,SPRN_AMOR - std r3,_AMOR(r1) - mfspr r3,SPRN_WORT - std r3,_WORT(r1) - mfspr r3,SPRN_WORC - std r3,_WORC(r1) /* - * On POWER9, there are idle states such as stop4, invoked via cpuidle, - * that lose hypervisor resources. In such cases, we need to save - * additional SPRs before entering those idle states so that they can - * be restored to their older values on wakeup from the idle state. + * Desired PSSCR in r3 * - * On POWER8, the only such deep idle state is winkle which is used - * only in the context of CPU-Hotplug, where these additional SPRs are - * reinitiazed to a sane value. Hence there is no need to save/restore - * these SPRs. + * No state will be lost regardless of wakeup mechanism (interrupt or NIA). + * + * An EC=0 type wakeup will return with a value of 0. SRESET wakeup (which can + * happen with xscom SRESET and possibly MCE) may clobber volatiles except LR, + * and must blr, to return to caller with r3 set according to caller's expected + * return code (for Book3S/64 that is SRR1). 
*/ -BEGIN_FTR_SECTION - blr -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) - -power9_save_additional_sprs: - mfspr r3, SPRN_PID - mfspr r4, SPRN_LDBAR - std r3, STOP_PID(r13) - std r4, STOP_LDBAR(r13) - - mfspr r3, SPRN_FSCR - mfspr r4, SPRN_HFSCR - std r3, STOP_FSCR(r13) - std r4, STOP_HFSCR(r13) - - mfspr r3, SPRN_MMCRA - mfspr r4, SPRN_MMCR0 - std r3, STOP_MMCRA(r13) - std r4, _MMCR0(r1) - - mfspr r3, SPRN_MMCR1 - mfspr r4, SPRN_MMCR2 - std r3, STOP_MMCR1(r13) - std r4, STOP_MMCR2(r13) - blr - -power9_restore_additional_sprs: - ld r3,_LPCR(r1) - ld r4, STOP_PID(r13) - mtspr SPRN_LPCR,r3 - mtspr SPRN_PID, r4 - - ld r3, STOP_LDBAR(r13) - ld r4, STOP_FSCR(r13) - mtspr SPRN_LDBAR, r3 - mtspr SPRN_FSCR, r4 - - ld r3, STOP_HFSCR(r13) - ld r4, STOP_MMCRA(r13) - mtspr SPRN_HFSCR, r3 - mtspr SPRN_MMCRA, r4 - - ld r3, _MMCR0(r1) - ld r4, STOP_MMCR1(r13) - mtspr SPRN_MMCR0, r3 - mtspr SPRN_MMCR1, r4 - - ld r3, STOP_MMCR2(r13) - ld r4, PACA_SPRG_VDSO(r13) - mtspr SPRN_MMCR2, r3 - mtspr SPRN_SPRG3, r4 +_GLOBAL(isa300_idle_stop_noloss) + mtspr SPRN_PSSCR,r3 + PPC_STOP + li r3,0 blr /* - * Used by threads when the lock bit of core_idle_state is set. - * Threads will spin in HMT_LOW until the lock bit is cleared. - * r14 - pointer to core_idle_state - * r15 - used to load contents of core_idle_state - * r9 - used as a temporary variable + * Desired PSSCR in r3 + * + * GPRs may be lost, so they are saved here. Wakeup is by interrupt only. + * The SRESET wakeup returns to this function's caller by calling + * idle_return_gpr_loss with r3 set to desired return value. + * + * A wakeup without GPR loss may alteratively be handled as in + * isa300_idle_stop_noloss and blr directly, as an optimisation. + * + * The caller is responsible for saving/restoring SPRs, MSR, timebase, + * etc. */ - -core_idle_lock_held: - HMT_LOW -3: lwz r15,0(r14) - andis. r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - bne 3b - HMT_MEDIUM - lwarx r15,0,r14 - andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h - bne- core_idle_lock_held - blr +_GLOBAL(isa300_idle_stop_mayloss) + mtspr SPRN_PSSCR,r3 + std r1,PACAR1(r13) + mflr r4 + mfcr r5 + /* use stack red zone rather than a new frame for saving regs */ + std r2,-8*0(r1) + std r14,-8*1(r1) + std r15,-8*2(r1) + std r16,-8*3(r1) + std r17,-8*4(r1) + std r18,-8*5(r1) + std r19,-8*6(r1) + std r20,-8*7(r1) + std r21,-8*8(r1) + std r22,-8*9(r1) + std r23,-8*10(r1) + std r24,-8*11(r1) + std r25,-8*12(r1) + std r26,-8*13(r1) + std r27,-8*14(r1) + std r28,-8*15(r1) + std r29,-8*16(r1) + std r30,-8*17(r1) + std r31,-8*18(r1) + std r4,-8*19(r1) + std r5,-8*20(r1) + /* 168 bytes */ + PPC_STOP + b . /* catch bugs */ /* - * Pass requested state in r3: - * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 - * - Requested PSSCR value in POWER9 + * Desired return value in r3 + * + * The idle wakeup SRESET interrupt can call this after calling + * to return to the idle sleep function caller with r3 as the return code. * - * Address of idle handler to branch to in realmode in r4 + * This must not be used if idle was entered via a _noloss function (use + * a simple blr instead). */ -pnv_powersave_common: - /* Use r3 to pass state nap/sleep/winkle */ - /* NAP is a state loss, we create a regs frame on the - * stack, fill it up with the state we care about and - * stick a pointer to it in PACAR1. We really only - * need to save PC, some CR bits and the NV GPRs, - * but for now an interrupt frame will do. 
- */ - mtctr r4 - - mflr r0 - std r0,16(r1) - stdu r1,-INT_FRAME_SIZE(r1) - std r0,_LINK(r1) - std r0,_NIP(r1) - - /* We haven't lost state ... yet */ - li r0,0 - stb r0,PACA_NAPSTATELOST(r13) - - /* Continue saving state */ - SAVE_GPR(2, r1) - SAVE_NVGPRS(r1) - mfcr r5 - std r5,_CCR(r1) - std r1,PACAR1(r13) - -BEGIN_FTR_SECTION - /* - * POWER9 does not require real mode to stop, and presently does not - * set hwthread_state for KVM (threads don't share MMU context), so - * we can remain in virtual mode for this. - */ - bctr -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - /* - * POWER8 - * Go to real mode to do the nap, as required by the architecture. - * Also, we need to be in real mode before setting hwthread_state, - * because as soon as we do that, another thread can switch - * the MMU context to the guest. - */ - LOAD_REG_IMMEDIATE(r7, MSR_IDLE) - mtmsrd r7,0 - bctr +_GLOBAL(idle_return_gpr_loss) + ld r1,PACAR1(r13) + ld r4,-8*19(r1) + ld r5,-8*20(r1) + mtlr r4 + mtcr r5 + /* + * KVM nap requires r2 to be saved, rather than just restoring it + * from PACATOC. This could be avoided for that less common case + * if KVM saved its r2. + */ + ld r2,-8*0(r1) + ld r14,-8*1(r1) + ld r15,-8*2(r1) + ld r16,-8*3(r1) + ld r17,-8*4(r1) + ld r18,-8*5(r1) + ld r19,-8*6(r1) + ld r20,-8*7(r1) + ld r21,-8*8(r1) + ld r22,-8*9(r1) + ld r23,-8*10(r1) + ld r24,-8*11(r1) + ld r25,-8*12(r1) + ld r26,-8*13(r1) + ld r27,-8*14(r1) + ld r28,-8*15(r1) + ld r29,-8*16(r1) + ld r30,-8*17(r1) + ld r31,-8*18(r1) + blr /* * This is the sequence required to execute idle instructions, as * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0. + * + * The 0(r1) slot is used to save r2 in isa206, so use that here. */ #define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ - std r0,0(r1); \ + std r2,0(r1); \ ptesync; \ - ld r0,0(r1); \ -236: cmpd cr0,r0,r0; \ + ld r2,0(r1); \ +236: cmpd cr0,r2,r2; \ bne 236b; \ - IDLE_INST; - - - .globl pnv_enter_arch207_idle_mode -pnv_enter_arch207_idle_mode: -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* Tell KVM we're entering idle */ - li r4,KVM_HWTHREAD_IN_IDLE - /******************************************************/ - /* N O T E W E L L ! ! ! N O T E W E L L */ - /* The following store to HSTATE_HWTHREAD_STATE(r13) */ - /* MUST occur in real mode, i.e. with the MMU off, */ - /* and the MMU must stay off until we clear this flag */ - /* and test HSTATE_HWTHREAD_REQ(r13) in */ - /* pnv_powersave_wakeup in this file. */ - /* The reason is that another thread can switch the */ - /* MMU to a guest context whenever this flag is set */ - /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */ - /* that would potentially cause this thread to start */ - /* executing instructions from guest memory in */ - /* hypervisor mode, leading to a host crash or data */ - /* corruption, or worse. */ - /******************************************************/ - stb r4,HSTATE_HWTHREAD_STATE(r13) -#endif - stb r3,PACA_THREAD_IDLE_STATE(r13) - cmpwi cr3,r3,PNV_THREAD_SLEEP - bge cr3,2f - IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) - /* No return */ -2: - /* Sleep or winkle */ - lbz r7,PACA_THREAD_MASK(r13) - ld r14,PACA_CORE_IDLE_STATE_PTR(r13) - li r5,0 - beq cr3,3f - lis r5,PNV_CORE_IDLE_WINKLE_COUNT@h -3: -lwarx_loop1: - lwarx r15,0,r14 - - andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h - bnel- core_idle_lock_held - - add r15,r15,r5 /* Add if winkle */ - andc r15,r15,r7 /* Clear thread bit */ - - andi. 
r9,r15,PNV_CORE_IDLE_THREAD_BITS - -/* - * If cr0 = 0, then current thread is the last thread of the core entering - * sleep. Last thread needs to execute the hardware bug workaround code if - * required by the platform. - * Make the workaround call unconditionally here. The below branch call is - * patched out when the idle states are discovered if the platform does not - * require it. - */ -.global pnv_fastsleep_workaround_at_entry -pnv_fastsleep_workaround_at_entry: - beq fastsleep_workaround_at_entry - - stwcx. r15,0,r14 - bne- lwarx_loop1 - isync - -common_enter: /* common code for all the threads entering sleep or winkle */ - bgt cr3,enter_winkle - IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) - -fastsleep_workaround_at_entry: - oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - stwcx. r15,0,r14 - bne- lwarx_loop1 - isync - - /* Fast sleep workaround */ - li r3,1 - li r4,1 - bl opal_config_cpu_idle_state - - /* Unlock */ - xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - lwsync - stw r15,0(r14) - b common_enter - -enter_winkle: - bl save_sprs_to_stack - - IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) - -/* - * r3 - PSSCR value corresponding to the requested stop state. - */ -power_enter_stop: -/* - * Check if we are executing the lite variant with ESL=EC=0 - */ - andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED - clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ - bne .Lhandle_esl_ec_set - PPC_STOP - li r3,0 /* Since we didn't lose state, return 0 */ - std r3, PACA_REQ_PSSCR(r13) - - /* - * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so - * it can determine if the wakeup reason is an HMI in - * CHECK_HMI_INTERRUPT. - * - * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup - * reason, so there is no point setting r12 to SRR1. - * - * Further, we clear r12 here, so that we don't accidentally enter the - * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI. - */ - li r12, 0 - b pnv_wakeup_noloss - -.Lhandle_esl_ec_set: -BEGIN_FTR_SECTION - /* - * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after - * a state-loss idle. Saving and restoring MMCR0 over idle is a - * workaround. - */ - mfspr r4,SPRN_MMCR0 - std r4,_MMCR0(r1) -END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) + IDLE_INST; \ + b . /* catch bugs */ /* - * Check if the requested state is a deep idle state. - */ - LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) - ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) - cmpd r3,r4 - bge .Lhandle_deep_stop - PPC_STOP /* Does not return (system reset interrupt) */ - -.Lhandle_deep_stop: -/* - * Entering deep idle state. - * Clear thread bit in PACA_CORE_IDLE_STATE, save SPRs to - * stack and enter stop - */ - lbz r7,PACA_THREAD_MASK(r13) - ld r14,PACA_CORE_IDLE_STATE_PTR(r13) - -lwarx_loop_stop: - lwarx r15,0,r14 - andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h - bnel- core_idle_lock_held - andc r15,r15,r7 /* Clear thread bit */ - - stwcx. r15,0,r14 - bne- lwarx_loop_stop - isync - - bl save_sprs_to_stack - - PPC_STOP /* Does not return (system reset interrupt) */ - -/* - * Entered with MSR[EE]=0 and no soft-masked interrupts pending. - * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE). 
- */ -_GLOBAL(power7_idle_insn) - /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - -#define CHECK_HMI_INTERRUPT \ -BEGIN_FTR_SECTION_NESTED(66); \ - rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \ -FTR_SECTION_ELSE_NESTED(66); \ - rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ -ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ - cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ - bne+ 20f; \ - /* Invoke opal call to handle hmi */ \ - ld r2,PACATOC(r13); \ - ld r1,PACAR1(r13); \ - std r3,ORIG_GPR3(r1); /* Save original r3 */ \ - li r3,0; /* NULL argument */ \ - bl hmi_exception_realmode; \ - nop; \ - ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \ -20: nop; - -/* - * Entered with MSR[EE]=0 and no soft-masked interrupts pending. - * r3 contains desired PSSCR register value. + * Desired instruction type in r3 * - * Offline (CPU unplug) case also must notify KVM that the CPU is - * idle. - */ -_GLOBAL(power9_offline_stop) -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* - * Tell KVM we're entering idle. - * This does not have to be done in real mode because the P9 MMU - * is independent per-thread. Some steppings share radix/hash mode - * between threads, but in that case KVM has a barrier sync in real - * mode before and after switching between radix and hash. - */ - li r4,KVM_HWTHREAD_IN_IDLE - stb r4,HSTATE_HWTHREAD_STATE(r13) -#endif - /* fall through */ - -_GLOBAL(power9_idle_stop) - std r3, PACA_REQ_PSSCR(r13) -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -BEGIN_FTR_SECTION - sync - lwz r5, PACA_DONT_STOP(r13) - cmpwi r5, 0 - bne 1f -END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) -#endif - mtspr SPRN_PSSCR,r3 - LOAD_REG_ADDR(r4,power_enter_stop) - b pnv_powersave_common - /* No return */ -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -1: - /* - * We get here when TM / thread reconfiguration bug workaround - * code wants to get the CPU into SMT4 mode, and therefore - * we are being asked not to stop. - */ - li r3, 0 - std r3, PACA_REQ_PSSCR(r13) - blr /* return 0 for wakeup cause / SRR1 value */ -#endif - -/* - * Called from machine check handler for powersave wakeups. - * Low level machine check processing has already been done. Now just - * go through the wake up path to get everything in order. + * GPRs may be lost, so they are saved here. Wakeup is by interrupt only. + * The SRESET wakeup returns to this function's caller by calling + * idle_return_gpr_loss with r3 set to desired return value. * - * r3 - The original SRR1 value. - * Original SRR[01] have been clobbered. - * MSR_RI is clear. - */ -.global pnv_powersave_wakeup_mce -pnv_powersave_wakeup_mce: - /* Set cr3 for pnv_powersave_wakeup */ - rlwinm r11,r3,47-31,30,31 - cmpwi cr3,r11,2 - - /* - * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake - * reason into r12, which allows reuse of the system reset wakeup - * code without being mistaken for another type of wakeup. - */ - oris r12,r3,SRR1_WAKEMCE_RESVD@h - - b pnv_powersave_wakeup - -/* - * Called from reset vector for powersave wakeups. 
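CHECK_HMI_INTERRUPT's rlwinm dance extracts the SRR1[42:45] wake-reason field and compares it against 0xa (hypervisor maintenance). Later in this series the same test is done in C; a standalone sketch, with the constants assumed to match asm/reg.h:

#define SRR1_WAKEMASK_P8        0x003c0000UL    /* SRR1[42:45], POWER8+ */
#define SRR1_WAKEHMI            0x00280000UL    /* wake reason 0xa: HMI */

extern void hmi_exception_realmode(void *regs);

static void handle_hmi_wakeup(unsigned long srr1)
{
        /* Same test the new C idle code performs after the idle insn */
        if ((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)
                hmi_exception_realmode(NULL);
}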
- * cr3 - set to gt if waking up with partial/complete hypervisor state loss
- * r12 - SRR1
- */
-.global pnv_powersave_wakeup
-pnv_powersave_wakeup:
- ld r2, PACATOC(r13)
-
-BEGIN_FTR_SECTION
- bl pnv_restore_hyp_resource_arch300
-FTR_SECTION_ELSE
- bl pnv_restore_hyp_resource_arch207
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
-
- li r0,PNV_THREAD_RUNNING
- stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */
-
- mr r3,r12
-
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- lbz r0,HSTATE_HWTHREAD_STATE(r13)
- cmpwi r0,KVM_HWTHREAD_IN_KERNEL
- beq 0f
- li r0,KVM_HWTHREAD_IN_KERNEL
- stb r0,HSTATE_HWTHREAD_STATE(r13)
- /* Order setting hwthread_state vs. testing hwthread_req */
- sync
-0: lbz r0,HSTATE_HWTHREAD_REQ(r13)
- cmpwi r0,0
- beq 1f
- b kvm_start_guest
-1:
-#endif
-
- /* Return SRR1 from power7_nap() */
- blt cr3,pnv_wakeup_noloss
- b pnv_wakeup_loss
-
-/*
- * Check whether we have woken up with hypervisor state loss.
- * If yes, restore hypervisor state and return back to link.
+ * A wakeup without GPR loss may alternatively be handled as in
+ * isa300_idle_stop_noloss and blr directly, as an optimisation.
 *
- * cr3 - set to gt if waking up with partial/complete hypervisor state loss
- */
-pnv_restore_hyp_resource_arch300:
- /*
- * Workaround for POWER9, if we lost resources, the ERAT
- * might have been mixed up and needs flushing. We also need
- * to reload MMCR0 (see comment above). We also need to set
- * then clear bit 60 in MMCRA to ensure the PMU starts running.
- */
- blt cr3,1f
-BEGIN_FTR_SECTION
- PPC_INVALIDATE_ERAT
- ld r1,PACAR1(r13)
- ld r4,_MMCR0(r1)
- mtspr SPRN_MMCR0,r4
-END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
- mfspr r4,SPRN_MMCRA
- ori r4,r4,(1 << (63-60))
- mtspr SPRN_MMCRA,r4
- xori r4,r4,(1 << (63-60))
- mtspr SPRN_MMCRA,r4
-1:
- /*
- * POWER ISA 3. Use PSSCR to determine if we
- * are waking up from deep idle state
- */
- LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
- ld r4,ADDROFF(pnv_first_deep_stop_state)(r5)
-
- /*
- * 0-3 bits correspond to Power-Saving Level Status
- * which indicates the idle state we are waking up from
- */
- mfspr r5, SPRN_PSSCR
- rldicl r5,r5,4,60
- li r0, 0 /* clear requested_psscr to say we're awake */
- std r0, PACA_REQ_PSSCR(r13)
- cmpd cr4,r5,r4
- bge cr4,pnv_wakeup_tb_loss /* returns to caller */
-
- blr /* Waking up without hypervisor state loss. */
-
-/* Same calling convention as arch300 */
-pnv_restore_hyp_resource_arch207:
- /*
- * POWER ISA 2.07 or less.
- * Check if we slept with sleep or winkle.
- */
- lbz r4,PACA_THREAD_IDLE_STATE(r13)
- cmpwi cr2,r4,PNV_THREAD_NAP
- bgt cr2,pnv_wakeup_tb_loss /* Either sleep or Winkle */
-
- /*
- * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
- * up from nap. At this stage CR3 shouldn't contains 'gt' since that
- * indicates we are waking with hypervisor state loss from nap.
- */
- bgt cr3,.
-
- blr /* Waking up without hypervisor state loss */
-
-/*
- * Called if waking up from idle state which can cause either partial or
- * complete hyp state loss.
- * In POWER8, called if waking up from fastsleep or winkle
- * In POWER9, called if waking up from stop state >= pnv_first_deep_stop_state
- *
- * r13 - PACA
- * cr3 - gt if waking up with partial/complete hypervisor state loss
- *
- * If ISA300:
- * cr4 - gt or eq if waking up from complete hypervisor state loss.
+ * The caller is responsible for saving/restoring SPRs, MSR, timebase,
+ * etc.
 *
- * If ISA207:
- * r4 - PACA_THREAD_IDLE_STATE
+ * This must be called in real-mode (MSR_IDLE).
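The rldicl r5,r5,4,60 in the removed arch300 wakeup path extracts PSSCR's Power-Saving Level Status field; the C replacement later in this patch does the same with a mask and shift. As a sketch, with the field definitions assumed to match asm/reg.h:

#define PSSCR_PLS       0xf000000000000000UL    /* Power-saving Level Status */
#define PSSCR_PLS_SHIFT 60

/* What 'rldicl r5,r5,4,60' computed: PSSCR[0:3] as a small integer */
static unsigned long psscr_pls(unsigned long psscr)
{
        return (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
}

Comparing this value against the first deep (now: SPR-loss) level is what decides whether the slow restore path runs.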
*/ -pnv_wakeup_tb_loss: - ld r1,PACAR1(r13) - /* - * Before entering any idle state, the NVGPRs are saved in the stack. - * If there was a state loss, or PACA_NAPSTATELOST was set, then the - * NVGPRs are restored. If we are here, it is likely that state is lost, - * but not guaranteed -- neither ISA207 nor ISA300 tests to reach - * here are the same as the test to restore NVGPRS: - * PACA_THREAD_IDLE_STATE test for ISA207, PSSCR test for ISA300, - * and SRR1 test for restoring NVGPRs. - * - * We are about to clobber NVGPRs now, so set NAPSTATELOST to - * guarantee they will always be restored. This might be tightened - * with careful reading of specs (particularly for ISA300) but this - * is already a slow wakeup path and it's simpler to be safe. - */ - li r0,1 - stb r0,PACA_NAPSTATELOST(r13) - - /* - * - * Save SRR1 and LR in NVGPRs as they might be clobbered in - * opal_call() (called in CHECK_HMI_INTERRUPT). SRR1 is required - * to determine the wakeup reason if we branch to kvm_start_guest. LR - * is required to return back to reset vector after hypervisor state - * restore is complete. - */ - mr r19,r12 - mr r18,r4 - mflr r17 -BEGIN_FTR_SECTION - CHECK_HMI_INTERRUPT -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - - ld r14,PACA_CORE_IDLE_STATE_PTR(r13) - lbz r7,PACA_THREAD_MASK(r13) - - /* - * Take the core lock to synchronize against other threads. - * - * Lock bit is set in one of the 2 cases- - * a. In the sleep/winkle enter path, the last thread is executing - * fastsleep workaround code. - * b. In the wake up path, another thread is executing fastsleep - * workaround undo code or resyncing timebase or restoring context - * In either case loop until the lock bit is cleared. - */ -1: - lwarx r15,0,r14 - andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h - bnel- core_idle_lock_held - oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - stwcx. r15,0,r14 - bne- 1b - isync - - andi. r9,r15,PNV_CORE_IDLE_THREAD_BITS - cmpwi cr2,r9,0 - - /* - * At this stage - * cr2 - eq if first thread to wakeup in core - * cr3- gt if waking up with partial/complete hypervisor state loss - * ISA300: - * cr4 - gt or eq if waking up from complete hypervisor state loss. - */ - -BEGIN_FTR_SECTION - /* - * Were we in winkle? - * If yes, check if all threads were in winkle, decrement our - * winkle count, set all thread winkle bits if all were in winkle. - * Check if our thread has a winkle bit set, and set cr4 accordingly - * (to match ISA300, above). 
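The lwarx/stwcx. lock loop above becomes ordinary bitops in the C rewrite (atomic_lock_thread_idle and friends, further down in this patch). A standalone model using GCC atomics in place of the kernel's bitops; the lock bit number is an assumption, not taken from this patch:

#define NR_PNV_CORE_IDLE_LOCK_BIT       28      /* assumed bit number */
#define PNV_CORE_IDLE_LOCK_MASK         (1UL << NR_PNV_CORE_IDLE_LOCK_BIT)

static void core_idle_lock(unsigned long *state)
{
        /* test_and_set_bit_lock() equivalent: acquire semantics */
        while (__atomic_fetch_or(state, PNV_CORE_IDLE_LOCK_MASK,
                                 __ATOMIC_ACQUIRE) & PNV_CORE_IDLE_LOCK_MASK)
                ;       /* spin, as the HMT_LOW loop did */
}

static void core_idle_unlock(unsigned long *state)
{
        /* clear_bit_unlock() equivalent: release semantics */
        __atomic_fetch_and(state, ~PNV_CORE_IDLE_LOCK_MASK, __ATOMIC_RELEASE);
}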
Pseudo-code for core idle state - * transitions for ISA207 is as follows (everything happens atomically - * due to store conditional and/or lock bit): - * - * nap_idle() { } - * nap_wake() { } - * - * sleep_idle() - * { - * core_idle_state &= ~thread_in_core - * } - * - * sleep_wake() - * { - * bool first_in_core, first_in_subcore; - * - * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0; - * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0; - * - * core_idle_state |= thread_in_core; - * } - * - * winkle_idle() - * { - * core_idle_state &= ~thread_in_core; - * core_idle_state += 1 << WINKLE_COUNT_SHIFT; - * } - * - * winkle_wake() - * { - * bool first_in_core, first_in_subcore, winkle_state_lost; - * - * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0; - * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0; - * - * core_idle_state |= thread_in_core; - * - * if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT)) - * core_idle_state |= THREAD_WINKLE_BITS; - * core_idle_state -= 1 << WINKLE_COUNT_SHIFT; - * - * winkle_state_lost = core_idle_state & - * (thread_in_core << WINKLE_THREAD_SHIFT); - * core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT); - * } - * - */ - cmpwi r18,PNV_THREAD_WINKLE +_GLOBAL(isa206_idle_insn_mayloss) + std r1,PACAR1(r13) + mflr r4 + mfcr r5 + /* use stack red zone rather than a new frame for saving regs */ + std r2,-8*0(r1) + std r14,-8*1(r1) + std r15,-8*2(r1) + std r16,-8*3(r1) + std r17,-8*4(r1) + std r18,-8*5(r1) + std r19,-8*6(r1) + std r20,-8*7(r1) + std r21,-8*8(r1) + std r22,-8*9(r1) + std r23,-8*10(r1) + std r24,-8*11(r1) + std r25,-8*12(r1) + std r26,-8*13(r1) + std r27,-8*14(r1) + std r28,-8*15(r1) + std r29,-8*16(r1) + std r30,-8*17(r1) + std r31,-8*18(r1) + std r4,-8*19(r1) + std r5,-8*20(r1) + cmpwi r3,PNV_THREAD_NAP + bne 1f + IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) +1: cmpwi r3,PNV_THREAD_SLEEP bne 2f - andis. r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h - subis r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h - beq 2f - ori r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */ -2: - /* Shift thread bit to winkle mask, then test if this thread is set, - * and remove it from the winkle bits */ - slwi r8,r7,8 - and r8,r8,r15 - andc r15,r15,r8 - cmpwi cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */ - - lbz r4,PACA_SUBCORE_SIBLING_MASK(r13) - and r4,r4,r15 - cmpwi r4,0 /* Check if first in subcore */ - - or r15,r15,r7 /* Set thread bit */ - beq first_thread_in_subcore -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) - - or r15,r15,r7 /* Set thread bit */ - beq cr2,first_thread_in_core - - /* Not first thread in core or subcore to wake up */ - b clear_lock - -first_thread_in_subcore: - /* - * If waking up from sleep, subcore state is not lost. Hence - * skip subcore state restore - */ - blt cr4,subcore_state_restored - - /* Restore per-subcore state */ - ld r4,_SDR1(r1) - mtspr SPRN_SDR1,r4 - - ld r4,_RPR(r1) - mtspr SPRN_RPR,r4 - ld r4,_AMOR(r1) - mtspr SPRN_AMOR,r4 - -subcore_state_restored: - /* - * Check if the thread is also the first thread in the core. If not, - * skip to clear_lock. - */ - bne cr2,clear_lock - -first_thread_in_core: - - /* - * First thread in the core waking up from any state which can cause - * partial or complete hypervisor state loss. It needs to - * call the fastsleep workaround code if the platform requires it. - * Call it unconditionally here. 
The below branch instruction will - * be patched out if the platform does not have fastsleep or does not - * require the workaround. Patching will be performed during the - * discovery of idle-states. - */ -.global pnv_fastsleep_workaround_at_exit -pnv_fastsleep_workaround_at_exit: - b fastsleep_workaround_at_exit - -timebase_resync: - /* - * Use cr3 which indicates that we are waking up with atleast partial - * hypervisor state loss to determine if TIMEBASE RESYNC is needed. - */ - ble cr3,.Ltb_resynced - /* Time base re-sync */ - bl opal_resync_timebase; - /* - * If waking up from sleep (POWER8), per core state - * is not lost, skip to clear_lock. - */ -.Ltb_resynced: - blt cr4,clear_lock - - /* - * First thread in the core to wake up and its waking up with - * complete hypervisor state loss. Restore per core hypervisor - * state. - */ -BEGIN_FTR_SECTION - ld r4,_PTCR(r1) - mtspr SPRN_PTCR,r4 - ld r4,_RPR(r1) - mtspr SPRN_RPR,r4 - ld r4,_AMOR(r1) - mtspr SPRN_AMOR,r4 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - - ld r4,_TSCR(r1) - mtspr SPRN_TSCR,r4 - ld r4,_WORC(r1) - mtspr SPRN_WORC,r4 - -clear_lock: - xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - lwsync - stw r15,0(r14) - -common_exit: - /* - * Common to all threads. - * - * If waking up from sleep, hypervisor state is not lost. Hence - * skip hypervisor state restore. - */ - blt cr4,hypervisor_state_restored - - /* Waking up from winkle */ - -BEGIN_MMU_FTR_SECTION - b no_segments -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) - /* Restore SLB from PACA */ - ld r8,PACA_SLBSHADOWPTR(r13) - - .rept SLB_NUM_BOLTED - li r3, SLBSHADOW_SAVEAREA - LDX_BE r5, r8, r3 - addi r3, r3, 8 - LDX_BE r6, r8, r3 - andis. r7,r5,SLB_ESID_V@h - beq 1f - slbmte r6,r5 -1: addi r8,r8,16 - .endr -no_segments: - - /* Restore per thread state */ - - ld r4,_SPURR(r1) - mtspr SPRN_SPURR,r4 - ld r4,_PURR(r1) - mtspr SPRN_PURR,r4 - ld r4,_DSCR(r1) - mtspr SPRN_DSCR,r4 - ld r4,_WORT(r1) - mtspr SPRN_WORT,r4 - - /* Call cur_cpu_spec->cpu_restore() */ - LOAD_REG_ADDR(r4, cur_cpu_spec) - ld r4,0(r4) - ld r12,CPU_SPEC_RESTORE(r4) -#ifdef PPC64_ELF_ABI_v1 - ld r12,0(r12) -#endif - mtctr r12 - bctrl - -/* - * On POWER9, we can come here on wakeup from a cpuidle stop state. - * Hence restore the additional SPRs to the saved value. - * - * On POWER8, we come here only on winkle. Since winkle is used - * only in the case of CPU-Hotplug, we don't need to restore - * the additional SPRs. - */ -BEGIN_FTR_SECTION - bl power9_restore_additional_sprs -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) -hypervisor_state_restored: - - mr r12,r19 - mtlr r17 - blr /* return to pnv_powersave_wakeup */ - -fastsleep_workaround_at_exit: - li r3,1 - li r4,0 - bl opal_config_cpu_idle_state - b timebase_resync - -/* - * R3 here contains the value that will be returned to the caller - * of power7_nap. - * R12 contains SRR1 for CHECK_HMI_INTERRUPT. - */ -.global pnv_wakeup_loss -pnv_wakeup_loss: - ld r1,PACAR1(r13) -BEGIN_FTR_SECTION - CHECK_HMI_INTERRUPT -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - REST_NVGPRS(r1) - REST_GPR(2, r1) - ld r4,PACAKMSR(r13) - ld r5,_LINK(r1) - ld r6,_CCR(r1) - addi r1,r1,INT_FRAME_SIZE - mtlr r5 - mtcr r6 - mtmsrd r4 - blr + IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) +2: IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) -/* - * R3 here contains the value that will be returned to the caller - * of power7_nap. - * R12 contains SRR1 for CHECK_HMI_INTERRUPT. 
- */ -pnv_wakeup_noloss: - lbz r0,PACA_NAPSTATELOST(r13) - cmpwi r0,0 - bne pnv_wakeup_loss - ld r1,PACAR1(r13) -BEGIN_FTR_SECTION - CHECK_HMI_INTERRUPT -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ld r4,PACAKMSR(r13) - ld r5,_NIP(r1) - ld r6,_CCR(r1) - addi r1,r1,INT_FRAME_SIZE - mtlr r5 - mtcr r6 - mtmsrd r4 - blr diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2e5dfb6e0823..8b4858f82229 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -401,8 +401,8 @@ void __init check_for_initrd(void) #ifdef CONFIG_SMP -int threads_per_core, threads_per_subcore, threads_shift; -cpumask_t threads_core_mask; +int threads_per_core, threads_per_subcore, threads_shift __read_mostly; +cpumask_t threads_core_mask __read_mostly; EXPORT_SYMBOL_GPL(threads_per_core); EXPORT_SYMBOL_GPL(threads_per_subcore); EXPORT_SYMBOL_GPL(threads_shift); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 139027c62dc2..dd014308f065 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -35,6 +35,7 @@ #include #include #include +#include /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ @@ -45,6 +46,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* Values in HSTATE_NAPPING(r13) */ #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 +#define NAPPING_UNSPLIT 3 /* Stack frame offsets for kvmppc_hv_entry */ #define SFS 208 @@ -290,17 +292,19 @@ kvm_novcpu_exit: b kvmhv_switch_to_host /* - * We come in here when wakened from nap mode. - * Relocation is off and most register values are lost. - * r13 points to the PACA. + * We come in here when wakened from Linux offline idle code. + * Relocation is off * r3 contains the SRR1 wakeup value, SRR1 is trashed. */ - .globl kvm_start_guest -kvm_start_guest: - /* Set runlatch bit the minute you wake up from nap */ - mfspr r0, SPRN_CTRLF - ori r0, r0, 1 - mtspr SPRN_CTRLT, r0 +_GLOBAL(idle_kvm_start_guest) + ld r4,PACAEMERGSP(r13) + mfcr r5 + mflr r0 + std r1,0(r4) + std r5,8(r4) + std r0,16(r4) + subi r1,r4,STACK_FRAME_OVERHEAD + SAVE_NVGPRS(r1) /* * Could avoid this and pass it through in r3. For now, @@ -308,27 +312,23 @@ kvm_start_guest: */ mtspr SPRN_SRR1,r3 - ld r2,PACATOC(r13) - li r0,0 stb r0,PACA_FTRACE_ENABLED(r13) li r0,KVM_HWTHREAD_IN_KVM stb r0,HSTATE_HWTHREAD_STATE(r13) - /* NV GPR values from power7_idle() will no longer be valid */ - li r0,1 - stb r0,PACA_NAPSTATELOST(r13) - - /* were we napping due to cede? */ + /* kvm cede / napping does not come through here */ lbz r0,HSTATE_NAPPING(r13) - cmpwi r0,NAPPING_CEDE - beq kvm_end_cede - cmpwi r0,NAPPING_NOVCPU - beq kvm_novcpu_wakeup + twnei r0,0 + + b 1f - ld r1,PACAEMERGSP(r13) - subi r1,r1,STACK_FRAME_OVERHEAD +kvm_unsplit_wakeup: + li r0, 0 + stb r0, HSTATE_NAPPING(r13) + +1: /* * We weren't napping due to cede, so this must be a secondary @@ -437,19 +437,25 @@ kvm_no_guest: lbz r3, HSTATE_HWTHREAD_REQ(r13) cmpwi r3, 0 bne 54f -/* - * We jump to pnv_wakeup_loss, which will return to the caller - * of power7_nap in the powernv cpu offline loop. The value we - * put in r3 becomes the return value for power7_nap. pnv_wakeup_loss - * requires SRR1 in r12. - */ + + /* + * Jump to idle_return_gpr_loss, which returns to the + * idle_kvm_start_guest caller. 
+ */ li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 - li r3, 0 - mfspr r12,SPRN_SRR1 - b pnv_wakeup_loss + /* set up r3 for return */ + mfspr r3,SPRN_SRR1 + REST_NVGPRS(r1) + addi r1, r1, STACK_FRAME_OVERHEAD + ld r0, 16(r1) + ld r5, 8(r1) + ld r1, 0(r1) + mtlr r0 + mtcr r5 + blr 53: HMT_LOW ld r5, HSTATE_KVM_VCORE(r13) @@ -534,6 +540,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) lbz r0, KVM_SPLIT_DO_NAP(r3) cmpwi r0, 0 beq 57f + li r3, NAPPING_UNSPLIT + stb r3, HSTATE_NAPPING(r13) li r3, (LPCR_PECEDH | LPCR_PECE0) >> 4 mfspr r5, SPRN_LPCR rlwimi r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1) @@ -2657,6 +2665,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */ + /* Go back to host stack */ + ld r1, HSTATE_HOST_R1(r13) + /* * Take a nap until a decrementer or external or doobell interrupt * occurs, with PECE1 and PECE0 set in LPCR. @@ -2685,26 +2696,42 @@ BEGIN_FTR_SECTION * requested level = 0 (just stop dispatching) */ lis r3, (PSSCR_EC | PSSCR_ESL)@h - mtspr SPRN_PSSCR, r3 /* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */ li r4, LPCR_PECE_HVEE@higher sldi r4, r4, 32 or r5, r5, r4 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +FTR_SECTION_ELSE + li r3, PNV_THREAD_NAP +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mtspr SPRN_LPCR,r5 isync - li r0, 0 - std r0, HSTATE_SCRATCH0(r13) - ptesync - ld r0, HSTATE_SCRATCH0(r13) -1: cmpd r0, r0 - bne 1b + BEGIN_FTR_SECTION - nap + bl isa300_idle_stop_mayloss FTR_SECTION_ELSE - PPC_STOP -ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) - b . + bl isa206_idle_insn_mayloss +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) + + mfspr r0, SPRN_CTRLF + ori r0, r0, 1 + mtspr SPRN_CTRLT, r0 + + mtspr SPRN_SRR1, r3 + + li r0, 0 + stb r0, PACA_FTRACE_ENABLED(r13) + + li r0, KVM_HWTHREAD_IN_KVM + stb r0, HSTATE_HWTHREAD_STATE(r13) + + lbz r0, HSTATE_NAPPING(r13) + cmpwi r0, NAPPING_CEDE + beq kvm_end_cede + cmpwi r0, NAPPING_NOVCPU + beq kvm_novcpu_wakeup + cmpwi r0, NAPPING_UNSPLIT + beq kvm_unsplit_wakeup + twi 31,0,0 /* Nap state must not be zero */ 33: mr r4, r3 li r3, 0 @@ -2712,12 +2739,11 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) b 34f kvm_end_cede: + /* Woken by external or decrementer interrupt */ + /* get vcpu pointer */ ld r4, HSTATE_KVM_VCPU(r13) - /* Woken by external or decrementer interrupt */ - ld r1, HSTATE_HOST_R1(r13) - #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r4, VCPU_TB_RMINTR bl kvmhv_accumulate_time diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index e52f9b06dd9c..87f5f4ae60ca 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -48,10 +49,10 @@ static u64 pnv_default_stop_mask; static bool default_stop_found; /* - * First deep stop state. Used to figure out when to save/restore - * hypervisor context. + * First stop state levels when SPR and TB loss can occur. */ -u64 pnv_first_deep_stop_state = MAX_STOP_STATE; +static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1; +static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1; /* * psscr value and mask of the deepest stop idle state. 
@@ -62,6 +63,8 @@ static u64 pnv_deepest_stop_psscr_mask; static u64 pnv_deepest_stop_flag; static bool deepest_stop_found; +static unsigned long power7_offline_type; + static int pnv_save_sprs_for_deep_states(void) { int cpu; @@ -72,12 +75,12 @@ static int pnv_save_sprs_for_deep_states(void) * all cpus at boot. Get these reg values of current cpu and use the * same across all cpus. */ - uint64_t lpcr_val = mfspr(SPRN_LPCR); - uint64_t hid0_val = mfspr(SPRN_HID0); - uint64_t hid1_val = mfspr(SPRN_HID1); - uint64_t hid4_val = mfspr(SPRN_HID4); - uint64_t hid5_val = mfspr(SPRN_HID5); - uint64_t hmeer_val = mfspr(SPRN_HMEER); + uint64_t lpcr_val = mfspr(SPRN_LPCR); + uint64_t hid0_val = mfspr(SPRN_HID0); + uint64_t hid1_val = mfspr(SPRN_HID1); + uint64_t hid4_val = mfspr(SPRN_HID4); + uint64_t hid5_val = mfspr(SPRN_HID5); + uint64_t hmeer_val = mfspr(SPRN_HMEER); uint64_t msr_val = MSR_IDLE; uint64_t psscr_val = pnv_deepest_stop_psscr_val; @@ -137,89 +140,6 @@ static int pnv_save_sprs_for_deep_states(void) return 0; } -static void pnv_alloc_idle_core_states(void) -{ - int i, j; - int nr_cores = cpu_nr_cores(); - u32 *core_idle_state; - - /* - * core_idle_state - The lower 8 bits track the idle state of - * each thread of the core. - * - * The most significant bit is the lock bit. - * - * Initially all the bits corresponding to threads_per_core - * are set. They are cleared when the thread enters deep idle - * state like sleep and winkle/stop. - * - * Initially the lock bit is cleared. The lock bit has 2 - * purposes: - * a. While the first thread in the core waking up from - * idle is restoring core state, it prevents other - * threads in the core from switching to process - * context. - * b. While the last thread in the core is saving the - * core state, it prevents a different thread from - * waking up. - */ - for (i = 0; i < nr_cores; i++) { - int first_cpu = i * threads_per_core; - int node = cpu_to_node(first_cpu); - size_t paca_ptr_array_size; - - core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); - *core_idle_state = (1 << threads_per_core) - 1; - paca_ptr_array_size = (threads_per_core * - sizeof(struct paca_struct *)); - - for (j = 0; j < threads_per_core; j++) { - int cpu = first_cpu + j; - - paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state; - paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING; - paca_ptrs[cpu]->thread_mask = 1 << j; - } - } - - update_subcore_sibling_mask(); - - if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) { - int rc = pnv_save_sprs_for_deep_states(); - - if (likely(!rc)) - return; - - /* - * The stop-api is unable to restore hypervisor - * resources on wakeup from platform idle states which - * lose full context. So disable such states. - */ - supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT; - pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n"); - pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n"); - - if (cpu_has_feature(CPU_FTR_ARCH_300) && - (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) { - /* - * Use the default stop state for CPU-Hotplug - * if available. 
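Replacing the patched-out branches with plain bools means the idle entry and exit paths now just test a flag before the OPAL call. A distilled sketch of the entry-side logic; only the shape mirrors the new power7_idle_insn, and the OPAL token values and prototype here are stand-ins:

extern int opal_config_cpu_idle_state(unsigned long type, unsigned long on);

#define OPAL_CONFIG_IDLE_FASTSLEEP      1       /* assumed token value */
#define OPAL_CONFIG_IDLE_APPLY          1       /* assumed */

static int fastsleep_workaround_entry = 1;      /* was a patched branch */

static void apply_fastsleep_workaround(unsigned long state,
                                       unsigned long core_thread_mask)
{
        /* Only the last thread of the core going idle applies it */
        if (fastsleep_workaround_entry && (state & core_thread_mask) == 0)
                opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
                                           OPAL_CONFIG_IDLE_APPLY);
}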
- */ - if (default_stop_found) { - pnv_deepest_stop_psscr_val = - pnv_default_stop_val; - pnv_deepest_stop_psscr_mask = - pnv_default_stop_mask; - pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n", - pnv_deepest_stop_psscr_val); - } else { /* Fallback to snooze loop for CPU-Hotplug */ - deepest_stop_found = false; - pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n"); - } - } - } -} - u32 pnv_get_supported_cpuidle_states(void) { return supported_cpuidle_states; @@ -238,6 +158,9 @@ static void pnv_fastsleep_workaround_apply(void *info) *err = 1; } +static bool power7_fastsleep_workaround_entry = true; +static bool power7_fastsleep_workaround_exit = true; + /* * Used to store fastsleep workaround state * 0 - Workaround applied/undone at fastsleep entry/exit path (Default) @@ -269,21 +192,15 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, * fastsleep_workaround_applyonce = 1 implies * fastsleep workaround needs to be left in 'applied' state on all * the cores. Do this by- - * 1. Patching out the call to 'undo' workaround in fastsleep exit path - * 2. Sending ipi to all the cores which have at least one online thread - * 3. Patching out the call to 'apply' workaround in fastsleep entry - * path + * 1. Disable the 'undo' workaround in fastsleep exit path + * 2. Sendi IPIs to all the cores which have at least one online thread + * 3. Disable the 'apply' workaround in fastsleep entry path + * * There is no need to send ipi to cores which have all threads * offlined, as last thread of the core entering fastsleep or deeper * state would have applied workaround. */ - err = patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_exit, - PPC_INST_NOP); - if (err) { - pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit"); - goto fail; - } + power7_fastsleep_workaround_exit = false; get_online_cpus(); primary_thread_mask = cpu_online_cores_map(); @@ -296,13 +213,7 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, goto fail; } - err = patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_entry, - PPC_INST_NOP); - if (err) { - pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry"); - goto fail; - } + power7_fastsleep_workaround_entry = false; fastsleep_workaround_applyonce = 1; @@ -315,27 +226,323 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); -static unsigned long __power7_idle_type(unsigned long type) +static inline void atomic_start_thread_idle(void) { + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + int thread_nr = cpu_thread_in_core(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + + clear_bit(thread_nr, state); +} + +static inline void atomic_stop_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + int thread_nr = cpu_thread_in_core(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + + set_bit(thread_nr, state); +} + +static inline void atomic_lock_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + + while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, state))) + barrier(); +} + +static inline void atomic_unlock_and_stop_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int 
first = cpu_first_thread_sibling(cpu); + unsigned long thread = 1UL << cpu_thread_in_core(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + u64 s = READ_ONCE(*state); + u64 new, tmp; + + BUG_ON(!(s & PNV_CORE_IDLE_LOCK_BIT)); + BUG_ON(s & thread); + +again: + new = (s | thread) & ~PNV_CORE_IDLE_LOCK_BIT; + tmp = cmpxchg(state, s, new); + if (unlikely(tmp != s)) { + s = tmp; + goto again; + } +} + +static inline void atomic_unlock_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + + BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, state)); + clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, state); +} + +/* P7 and P8 */ +struct p7_sprs { + /* per core */ + u64 tscr; + u64 worc; + + /* per subcore */ + u64 sdr1; + u64 rpr; + u64 amor; + + /* per thread */ + u64 lpcr; + u64 hfscr; + u64 fscr; + u64 purr; + u64 spurr; + u64 dscr; + u64 wort; +}; + +static unsigned long power7_idle_insn(unsigned long type) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long thread = 1UL << cpu_thread_in_core(cpu); + unsigned long core_thread_mask = (1UL << threads_per_core) - 1; unsigned long srr1; + bool full_winkle; + struct p7_sprs sprs = {}; /* avoid false use-uninitialised */ + bool sprs_saved = false; + int rc; - if (!prep_irq_for_idle_irqsoff()) - return 0; + if (unlikely(type != PNV_THREAD_NAP)) { + atomic_lock_thread_idle(); + + BUG_ON(!(*state & thread)); + *state &= ~thread; + + if (power7_fastsleep_workaround_entry) { + if ((*state & core_thread_mask) == 0) { + rc = opal_config_cpu_idle_state( + OPAL_CONFIG_IDLE_FASTSLEEP, + OPAL_CONFIG_IDLE_APPLY); + BUG_ON(rc); + } + } + + if (type == PNV_THREAD_WINKLE) { + sprs.tscr = mfspr(SPRN_TSCR); + sprs.worc = mfspr(SPRN_WORC); + + sprs.sdr1 = mfspr(SPRN_SDR1); + sprs.rpr = mfspr(SPRN_RPR); + sprs.amor = mfspr(SPRN_AMOR); + + sprs.lpcr = mfspr(SPRN_LPCR); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + sprs.hfscr = mfspr(SPRN_HFSCR); + sprs.fscr = mfspr(SPRN_FSCR); + } + sprs.purr = mfspr(SPRN_PURR); + sprs.spurr = mfspr(SPRN_SPURR); + sprs.dscr = mfspr(SPRN_DSCR); + sprs.wort = mfspr(SPRN_WORT); + + sprs_saved = true; + + /* + * Increment winkle counter and set all winkle bits if + * all threads are winkling. This allows wakeup side to + * distinguish between fast sleep and winkle state + * loss. Fast sleep still has to resync the timebase so + * this may not be a really big win. 
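The counting scheme that comment describes, pulled out as a standalone helper; the shift and field widths are assumptions standing in for the asm/cpuidle.h definitions:

#define PNV_CORE_IDLE_WINKLE_COUNT_SHIFT        16      /* assumed */
#define PNV_CORE_IDLE_WINKLE_COUNT_BITS         (0xfUL << 16)   /* assumed */
#define PNV_CORE_IDLE_THREAD_WINKLE_BITS        (0xffUL << 8)   /* assumed */

static unsigned long note_thread_winkle(unsigned long state,
                                        int threads_per_core)
{
        state += 1UL << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
        /* If every thread is now winkling, mark them all as winkled */
        if (((state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) >>
              PNV_CORE_IDLE_WINKLE_COUNT_SHIFT) == threads_per_core)
                state |= PNV_CORE_IDLE_THREAD_WINKLE_BITS;
        return state;
}

On wakeup the count is decremented and a set per-thread winkle bit tells that thread it took a full winkle state loss.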
+ */ + *state += 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + if ((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) + >> PNV_CORE_IDLE_WINKLE_COUNT_SHIFT + == threads_per_core) + *state |= PNV_CORE_IDLE_THREAD_WINKLE_BITS; + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + } + + atomic_unlock_thread_idle(); + } + + local_paca->thread_idle_state = type; + srr1 = isa206_idle_insn_mayloss(type); /* go idle */ + local_paca->thread_idle_state = PNV_THREAD_RUNNING; + + WARN_ON_ONCE(!srr1); + WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) + hmi_exception_realmode(NULL); + + if (likely((srr1 & SRR1_WAKESTATE) != SRR1_WS_HVLOSS)) { + if (unlikely(type != PNV_THREAD_NAP)) { + atomic_lock_thread_idle(); + if (type == PNV_THREAD_WINKLE) { + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + *state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + *state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT); + } + atomic_unlock_and_stop_thread_idle(); + } + return srr1; + } + + /* HV state loss */ + BUG_ON(type == PNV_THREAD_NAP); + + atomic_lock_thread_idle(); + + full_winkle = false; + if (type == PNV_THREAD_WINKLE) { + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + *state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + if (*state & (thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT)) { + *state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT); + full_winkle = true; + BUG_ON(!sprs_saved); + } + } + + WARN_ON(*state & thread); + + if ((*state & core_thread_mask) != 0) + goto core_woken; + + /* Per-core SPRs */ + if (full_winkle) { + mtspr(SPRN_TSCR, sprs.tscr); + mtspr(SPRN_WORC, sprs.worc); + } + + if (power7_fastsleep_workaround_exit) { + rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP, + OPAL_CONFIG_IDLE_UNDO); + BUG_ON(rc); + } + + /* TB */ + if (opal_resync_timebase() != OPAL_SUCCESS) + BUG(); + +core_woken: + if (!full_winkle) + goto subcore_woken; + + if ((*state & local_paca->subcore_sibling_mask) != 0) + goto subcore_woken; + + /* Per-subcore SPRs */ + mtspr(SPRN_SDR1, sprs.sdr1); + mtspr(SPRN_RPR, sprs.rpr); + mtspr(SPRN_AMOR, sprs.amor); + +subcore_woken: + /* + * isync after restoring shared SPRs and before unlocking. Unlock + * only contains hwsync which does not necessarily do the right + * thing for SPRs. + */ + isync(); + atomic_unlock_and_stop_thread_idle(); + + /* Fast sleep does not lose SPRs */ + if (!full_winkle) + return srr1; + + /* Per-thread SPRs */ + mtspr(SPRN_LPCR, sprs.lpcr); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + mtspr(SPRN_HFSCR, sprs.hfscr); + mtspr(SPRN_FSCR, sprs.fscr); + } + mtspr(SPRN_PURR, sprs.purr); + mtspr(SPRN_SPURR, sprs.spurr); + mtspr(SPRN_DSCR, sprs.dscr); + mtspr(SPRN_WORT, sprs.wort); + + mtspr(SPRN_SPRG3, local_paca->sprg_vdso); + + /* + * The SLB has to be restored here, but it sometimes still + * contains entries, so the __ variant must be used to prevent + * multi hits. + */ + __slb_restore_bolted_realmode(); + + return srr1; +} + +extern unsigned long idle_kvm_start_guest(unsigned long srr1); + +#ifdef CONFIG_HOTPLUG_CPU +static unsigned long power7_offline(void) +{ + unsigned long srr1; + + mtmsr(MSR_IDLE); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* Tell KVM we're entering idle. */ + /******************************************************/ + /* N O T E W E L L ! ! ! N O T E W E L L */ + /* The following store to HSTATE_HWTHREAD_STATE(r13) */ + /* MUST occur in real mode, i.e. 
with the MMU off, */ + /* and the MMU must stay off until we clear this flag */ + /* and test HSTATE_HWTHREAD_REQ(r13) in */ + /* pnv_powersave_wakeup in this file. */ + /* The reason is that another thread can switch the */ + /* MMU to a guest context whenever this flag is set */ + /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */ + /* that would potentially cause this thread to start */ + /* executing instructions from guest memory in */ + /* hypervisor mode, leading to a host crash or data */ + /* corruption, or worse. */ + /******************************************************/ + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; +#endif __ppc64_runlatch_off(); - srr1 = power7_idle_insn(type); + srr1 = power7_idle_insn(power7_offline_type); __ppc64_runlatch_on(); - fini_irq_for_idle_irqsoff(); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; + /* Order setting hwthread_state vs. testing hwthread_req */ + smp_mb(); + if (local_paca->kvm_hstate.hwthread_req) + srr1 = idle_kvm_start_guest(srr1); +#endif + + mtmsr(MSR_KERNEL); return srr1; } +#endif void power7_idle_type(unsigned long type) { unsigned long srr1; - srr1 = __power7_idle_type(type); + if (!prep_irq_for_idle_irqsoff()) + return; + + mtmsr(MSR_IDLE); + __ppc64_runlatch_off(); + srr1 = power7_idle_insn(type); + __ppc64_runlatch_on(); + mtmsr(MSR_KERNEL); + + fini_irq_for_idle_irqsoff(); irq_set_pending_from_srr1(srr1); } @@ -347,33 +554,275 @@ void power7_idle(void) power7_idle_type(PNV_THREAD_NAP); } -static unsigned long __power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask) +struct p9_sprs { + /* per core */ + u64 ptcr; + u64 rpr; + u64 tscr; + u64 ldbar; + u64 amor; + + /* per thread */ + u64 lpcr; + u64 hfscr; + u64 fscr; + u64 pid; + u64 purr; + u64 spurr; + u64 dscr; + u64 wort; + + u64 mmcra; + u32 mmcr0; + u32 mmcr1; + u64 mmcr2; +}; + +static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) { - unsigned long psscr; + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long core_thread_mask = (1UL << threads_per_core) - 1; unsigned long srr1; + unsigned long pls; + unsigned long mmcr0 = 0; + struct p9_sprs sprs = {}; /* avoid false used-uninitialised */ + bool sprs_saved = false; - if (!prep_irq_for_idle_irqsoff()) - return 0; + if (!(psscr & (PSSCR_EC|PSSCR_ESL))) { + /* EC=ESL=0 case */ + + BUG_ON(!mmu_on); + + /* + * Wake synchronously. SRESET via xscom may still cause + * a 0x100 powersave wakeup with SRR1 reason! + */ + srr1 = isa300_idle_stop_noloss(psscr); /* go idle */ + if (likely(!srr1)) + return 0; + + /* + * Registers not saved, can't recover! + * This would be a hardware bug + */ + BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS); + + goto out; + } + + /* EC=ESL=1 case */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (cpu_has_feature(CPU_FTR_P9_TM_XER_SO_BUG)) { + local_paca->requested_psscr = psscr; + /* order setting requested_psscr vs testing dont_stop */ + smp_mb(); + if (atomic_read(&local_paca->dont_stop)) { + local_paca->requested_psscr = 0; + return 0; + } + } +#endif + + if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) { + /* + * POWER9 DD2 can incorrectly set PMAO when waking up + * after a state-loss idle. Saving and restoring MMCR0 + * over idle is a workaround. 
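The requested_psscr/dont_stop exchange above pairs with pnv_power9_force_smt4_catch() further down (whose mb() becomes smp_mb() in this patch). A portable model of why both sides need a full barrier, using C11 atomics in place of the kernel primitives:

#include <stdatomic.h>

static _Atomic unsigned long requested_psscr;
static atomic_int dont_stop;

/* Idle side: publish intent, then check whether stopping is forbidden */
static int may_enter_stop(unsigned long psscr)
{
        atomic_store(&requested_psscr, psscr);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        if (atomic_load(&dont_stop)) {
                atomic_store(&requested_psscr, 0);
                return 0;       /* mirrors the early 'return 0' above */
        }
        return 1;
}

/* SMT4-catch side: forbid stopping, then see who already committed */
static int thread_committed_to_stop(void)
{
        atomic_fetch_add(&dont_stop, 1);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        return atomic_load(&requested_psscr) != 0;
}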
+ */ + mmcr0 = mfspr(SPRN_MMCR0); + } + if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) { + sprs.lpcr = mfspr(SPRN_LPCR); + sprs.hfscr = mfspr(SPRN_HFSCR); + sprs.fscr = mfspr(SPRN_FSCR); + sprs.pid = mfspr(SPRN_PID); + sprs.purr = mfspr(SPRN_PURR); + sprs.spurr = mfspr(SPRN_SPURR); + sprs.dscr = mfspr(SPRN_DSCR); + sprs.wort = mfspr(SPRN_WORT); + + sprs.mmcra = mfspr(SPRN_MMCRA); + sprs.mmcr0 = mfspr(SPRN_MMCR0); + sprs.mmcr1 = mfspr(SPRN_MMCR1); + sprs.mmcr2 = mfspr(SPRN_MMCR2); + + sprs.ptcr = mfspr(SPRN_PTCR); + sprs.rpr = mfspr(SPRN_RPR); + sprs.tscr = mfspr(SPRN_TSCR); + sprs.ldbar = mfspr(SPRN_LDBAR); + sprs.amor = mfspr(SPRN_AMOR); + + sprs_saved = true; + + atomic_start_thread_idle(); + } + + srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */ + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + local_paca->requested_psscr = 0; +#endif psscr = mfspr(SPRN_PSSCR); - psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + WARN_ON_ONCE(!srr1); + WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + + if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { + unsigned long mmcra; + + /* + * Workaround for POWER9 DD2.0, if we lost resources, the ERAT + * might have been corrupted and needs flushing. We also need + * to reload MMCR0 (see mmcr0 comment above). + */ + if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) { + asm volatile(PPC_INVALIDATE_ERAT); + mtspr(SPRN_MMCR0, mmcr0); + } + + /* + * DD2.2 and earlier need to set then clear bit 60 in MMCRA + * to ensure the PMU starts running. + */ + mmcra = mfspr(SPRN_MMCRA); + mmcra |= PPC_BIT(60); + mtspr(SPRN_MMCRA, mmcra); + mmcra &= ~PPC_BIT(60); + mtspr(SPRN_MMCRA, mmcra); + } + + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) + hmi_exception_realmode(NULL); + + /* + * On POWER9, SRR1 bits do not match exactly as expected. + * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so + * just always test PSSCR for SPR/TB state loss. + */ + pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT; + if (likely(pls < pnv_first_spr_loss_level)) { + if (sprs_saved) + atomic_stop_thread_idle(); + goto out; + } + + /* HV state loss */ + BUG_ON(!sprs_saved); + + atomic_lock_thread_idle(); + + if ((*state & core_thread_mask) != 0) + goto core_woken; + + /* Per-core SPRs */ + mtspr(SPRN_PTCR, sprs.ptcr); + mtspr(SPRN_RPR, sprs.rpr); + mtspr(SPRN_TSCR, sprs.tscr); + mtspr(SPRN_LDBAR, sprs.ldbar); + mtspr(SPRN_AMOR, sprs.amor); + + if (pls >= pnv_first_tb_loss_level) { + /* TB loss */ + if (opal_resync_timebase() != OPAL_SUCCESS) + BUG(); + } + + /* + * isync after restoring shared SPRs and before unlocking. Unlock + * only contains hwsync which does not necessarily do the right + * thing for SPRs. 
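For reference while reading the wake-state tests above, the SRR1[46:47] encodings, copied here with values assumed to match asm/reg.h:

#define SRR1_WAKESTATE  0x00030000UL    /* SRR1[46:47] */
#define SRR1_WS_NOLOSS  0x00010000UL    /* 01: nothing lost */
#define SRR1_WS_GPRLOSS 0x00020000UL    /* 10: GPRs lost; SPRs maybe too */
#define SRR1_WS_HVLOSS  0x00030000UL    /* 11: full hyp state loss */

/*
 * Why the code checks PSSCR[PLS] rather than trusting SRR1 alone:
 * only "no loss" is decisive; both loss encodings may mean SPR loss.
 */
static int must_check_pls(unsigned long srr1)
{
        return (srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS;
}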
+ */ + isync(); + +core_woken: + atomic_unlock_and_stop_thread_idle(); + + /* Per-thread SPRs */ + mtspr(SPRN_LPCR, sprs.lpcr); + mtspr(SPRN_HFSCR, sprs.hfscr); + mtspr(SPRN_FSCR, sprs.fscr); + mtspr(SPRN_PID, sprs.pid); + mtspr(SPRN_PURR, sprs.purr); + mtspr(SPRN_SPURR, sprs.spurr); + mtspr(SPRN_DSCR, sprs.dscr); + mtspr(SPRN_WORT, sprs.wort); + + mtspr(SPRN_MMCRA, sprs.mmcra); + mtspr(SPRN_MMCR0, sprs.mmcr0); + mtspr(SPRN_MMCR1, sprs.mmcr1); + mtspr(SPRN_MMCR2, sprs.mmcr2); + + mtspr(SPRN_SPRG3, local_paca->sprg_vdso); + + if (!radix_enabled()) + __slb_restore_bolted_realmode(); + +out: + if (mmu_on) + mtmsr(MSR_KERNEL); + + return srr1; +} + +#ifdef CONFIG_HOTPLUG_CPU +static unsigned long power9_offline_stop(unsigned long psscr) +{ + unsigned long srr1; + +#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE + __ppc64_runlatch_off(); + srr1 = power9_idle_stop(psscr, true); + __ppc64_runlatch_on(); +#else + /* + * Tell KVM we're entering idle. + * This does not have to be done in real mode because the P9 MMU + * is independent per-thread. Some steppings share radix/hash mode + * between threads, but in that case KVM has a barrier sync in real + * mode before and after switching between radix and hash. + * + * kvm_start_guest must still be called in real mode though, hence + * the false argument. + */ + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr); + srr1 = power9_idle_stop(psscr, false); __ppc64_runlatch_on(); - fini_irq_for_idle_irqsoff(); + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; + /* Order setting hwthread_state vs. testing hwthread_req */ + smp_mb(); + if (local_paca->kvm_hstate.hwthread_req) + srr1 = idle_kvm_start_guest(srr1); + mtmsr(MSR_KERNEL); +#endif return srr1; } +#endif void power9_idle_type(unsigned long stop_psscr_val, unsigned long stop_psscr_mask) { + unsigned long psscr; unsigned long srr1; - srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask); + if (!prep_irq_for_idle_irqsoff()) + return; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + __ppc64_runlatch_off(); + srr1 = power9_idle_stop(psscr, true); + __ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + irq_set_pending_from_srr1(srr1); } @@ -409,7 +858,7 @@ void pnv_power9_force_smt4_catch(void) atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop); } /* order setting dont_stop vs testing requested_psscr */ - mb(); + smp_mb(); for (thr = 0; thr < threads_per_core; ++thr) { if (!paca_ptrs[cpu0+thr]->requested_psscr) ++awake_threads; @@ -481,7 +930,6 @@ void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; - u32 idle_states = pnv_get_supported_cpuidle_states(); __ppc64_runlatch_off(); @@ -492,15 +940,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu) psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | pnv_deepest_stop_psscr_val; srr1 = power9_offline_stop(psscr); - - } else if ((idle_states & OPAL_PM_WINKLE_ENABLED) && - (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) { - srr1 = power7_idle_insn(PNV_THREAD_WINKLE); - } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_idle_insn(PNV_THREAD_SLEEP); - } else if (idle_states & OPAL_PM_NAP_ENABLED) { - srr1 = power7_idle_insn(PNV_THREAD_NAP); + } else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) { + srr1 = power7_offline(); } else { /* This is the fallback method. 
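power9_offline_stop's hwthread_state handshake above is the C rendering of the old NOTE WELL protocol: publish IN_IDLE before idling, restore IN_KERNEL afterwards, then a full barrier before testing hwthread_req. Shape only, with the struct and enum values stubbed as assumptions:

#define KVM_HWTHREAD_IN_KERNEL  0       /* assumed values */
#define KVM_HWTHREAD_IN_IDLE    1

struct hstate { unsigned char hwthread_state, hwthread_req; };

extern unsigned long idle_kvm_start_guest(unsigned long srr1);
extern void smp_mb(void);       /* stand-in for the kernel barrier */

static unsigned long offline_wakeup(struct hstate *hs, unsigned long srr1)
{
        hs->hwthread_state = KVM_HWTHREAD_IN_KERNEL;
        smp_mb();       /* order hwthread_state store vs. hwthread_req load */
        if (hs->hwthread_req)
                srr1 = idle_kvm_start_guest(srr1);
        return srr1;
}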
We emulate snooze */
 while (!generic_check_cpu_restart(cpu)) {
@@ -596,33 +1037,44 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
 * @dt_idle_states: Number of idle state entries
 * Returns 0 on success
 */
-static int __init pnv_power9_idle_init(void)
+static void __init pnv_power9_idle_init(void)
 {
 u64 max_residency_ns = 0;
 int i;

 /*
- * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
- * and the pnv_default_stop_{val,mask}.
- *
- * pnv_first_deep_stop_state should be set to the first stop
- * level to cause hypervisor state loss.
- *
 * pnv_deepest_stop_{val,mask} should be set to values corresponding to
 * the deepest stop state.
 *
 * pnv_default_stop_{val,mask} should be set to values corresponding to
- * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
+ * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
 */
- pnv_first_deep_stop_state = MAX_STOP_STATE;
+ pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
+ pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
 for (i = 0; i < nr_pnv_idle_states; i++) {
 int err;
 struct pnv_idle_states_t *state = &pnv_idle_states[i];
 u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;

+ if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+ (pnv_first_tb_loss_level > psscr_rl))
+ pnv_first_tb_loss_level = psscr_rl;
+
 if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
- pnv_first_deep_stop_state > psscr_rl)
- pnv_first_deep_stop_state = psscr_rl;
+ (pnv_first_spr_loss_level > psscr_rl))
+ pnv_first_spr_loss_level = psscr_rl;
+
+ /*
+ * The idle code does not deal with TB loss occurring
+ * in a shallower state than SPR loss, so force it to
+ * behave like SPRs are lost if TB is lost. POWER9 would
+ * never encounter this, but a POWER8 core would if it
+ * implemented the stop instruction. So this is for forward
+ * compatibility.
+ */
+ if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+ (pnv_first_spr_loss_level > psscr_rl))
+ pnv_first_spr_loss_level = psscr_rl;

 err = validate_psscr_val_mask(&state->psscr_val,
 &state->psscr_mask,
@@ -647,6 +1099,7 @@ static int __init pnv_power9_idle_init(void)
 pnv_default_stop_val = state->psscr_val;
 pnv_default_stop_mask = state->psscr_mask;
 default_stop_found = true;
+ WARN_ON(state->flags & OPAL_PM_LOSE_FULL_CONTEXT);
 }
 }
@@ -666,10 +1119,40 @@
 pnv_deepest_stop_psscr_mask);
 }

- pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
- pnv_first_deep_stop_state);
+ pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n",
+ pnv_first_spr_loss_level);

- return 0;
+ pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n",
+ pnv_first_tb_loss_level);
+}
+
+static void __init pnv_disable_deep_states(void)
+{
+ /*
+ * The stop-api is unable to restore hypervisor
+ * resources on wakeup from platform idle states which
+ * lose full context. So disable such states.
+ */
+ supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+ pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+ pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+ (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+ /*
+ * Use the default stop state for CPU-Hotplug
+ * if available.
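The two thresholds initialised above are plain minima over the idle-state table. Distilled into a standalone helper; the struct and the flag values here are stand-ins for the pnv/OPAL definitions, only the logic mirrors pnv_power9_idle_init:

#define PSSCR_RL_MASK           0x0000000fUL
#define OPAL_PM_TIMEBASE_STOP   0x1UL   /* stand-in flag bits */
#define OPAL_PM_LOSE_FULL_CONTEXT 0x2UL /* stand-in */

struct stop_state { unsigned long flags, psscr_val; };

static void first_loss_levels(const struct stop_state *s, int n,
                              unsigned long *tb, unsigned long *spr)
{
        int i;

        for (i = 0; i < n; i++) {
                unsigned long rl = s[i].psscr_val & PSSCR_RL_MASK;

                if ((s[i].flags & OPAL_PM_TIMEBASE_STOP) && *tb > rl)
                        *tb = rl;
                /* TB loss is treated as implying SPR loss (see comment) */
                if ((s[i].flags & (OPAL_PM_LOSE_FULL_CONTEXT |
                                   OPAL_PM_TIMEBASE_STOP)) && *spr > rl)
                        *spr = rl;
        }
}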
+ */ + if (default_stop_found) { + pnv_deepest_stop_psscr_val = pnv_default_stop_val; + pnv_deepest_stop_psscr_mask = pnv_default_stop_mask; + pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n", + pnv_deepest_stop_psscr_val); + } else { /* Fallback to snooze loop for CPU-Hotplug */ + deepest_stop_found = false; + pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n"); + } + } } /* @@ -684,10 +1167,8 @@ static void __init pnv_probe_idle_states(void) return; } - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (pnv_power9_idle_init()) - return; - } + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pnv_power9_idle_init(); for (i = 0; i < nr_pnv_idle_states; i++) supported_cpuidle_states |= pnv_idle_states[i].flags; @@ -807,11 +1288,33 @@ out: static int __init pnv_init_idle_states(void) { + int cpu; int rc = 0; - supported_cpuidle_states = 0; + + /* Set up PACA fields */ + for_each_present_cpu(cpu) { + struct paca_struct *p = paca_ptrs[cpu]; + + p->idle_state = 0; + if (cpu == cpu_first_thread_sibling(cpu)) + p->idle_state = (1 << threads_per_core) - 1; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* P7/P8 nap */ + p->thread_idle_state = PNV_THREAD_RUNNING; + } else { + /* P9 stop */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + p->requested_psscr = 0; + atomic_set(&p->dont_stop, 0); +#endif + } + } /* In case we error out nr_pnv_idle_states will be zero */ nr_pnv_idle_states = 0; + supported_cpuidle_states = 0; + if (cpuidle_disable != IDLE_NO_OVERRIDE) goto out; rc = pnv_parse_cpuidle_dt(); @@ -819,27 +1322,40 @@ static int __init pnv_init_idle_states(void) return rc; pnv_probe_idle_states(); - if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_entry, - PPC_INST_NOP); - patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_exit, - PPC_INST_NOP); - } else { - /* - * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that - * workaround is needed to use fastsleep. Provide sysfs - * control to choose how this workaround has to be applied. - */ - device_create_file(cpu_subsys.dev_root, + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + power7_fastsleep_workaround_entry = false; + power7_fastsleep_workaround_exit = false; + } else { + /* + * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that + * workaround is needed to use fastsleep. Provide sysfs + * control to choose how this workaround has to be + * applied. 
+ */ + device_create_file(cpu_subsys.dev_root, &dev_attr_fastsleep_workaround_applyonce); - } + } + + update_subcore_sibling_mask(); + + if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) { + ppc_md.power_save = power7_idle; + power7_offline_type = PNV_THREAD_NAP; + } - pnv_alloc_idle_core_states(); + if ((supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) && + (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)) + power7_offline_type = PNV_THREAD_WINKLE; + else if ((supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED) || + (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) + power7_offline_type = PNV_THREAD_SLEEP; + } - if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) - ppc_md.power_save = power7_idle; + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) { + if (pnv_save_sprs_for_deep_states()) + pnv_disable_deep_states(); + } out: return 0; diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 45563004feda..1d7a9fd30dd1 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -183,7 +183,7 @@ static void unsplit_core(void) cpu = smp_processor_id(); if (cpu_thread_in_core(cpu) != 0) { while (mfspr(SPRN_HID0) & mask) - power7_idle_insn(PNV_THREAD_NAP); + power7_idle_type(PNV_THREAD_NAP); per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; return; diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index a0f44f992360..e583ed3f6b93 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2431,7 +2431,6 @@ static void dump_one_paca(int cpu) DUMP(p, irq_happened, "%#-*x"); DUMP(p, io_sync, "%#-*x"); DUMP(p, irq_work_pending, "%#-*x"); - DUMP(p, nap_state_lost, "%#-*x"); DUMP(p, sprg_vdso, "%#-*llx"); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -2439,19 +2438,16 @@ static void dump_one_paca(int cpu) #endif #ifdef CONFIG_PPC_POWERNV - DUMP(p, core_idle_state_ptr, "%-*px"); - DUMP(p, thread_idle_state, "%#-*x"); - DUMP(p, thread_mask, "%#-*x"); - DUMP(p, subcore_sibling_mask, "%#-*x"); - DUMP(p, requested_psscr, "%#-*llx"); - DUMP(p, stop_sprs.pid, "%#-*llx"); - DUMP(p, stop_sprs.ldbar, "%#-*llx"); - DUMP(p, stop_sprs.fscr, "%#-*llx"); - DUMP(p, stop_sprs.hfscr, "%#-*llx"); - DUMP(p, stop_sprs.mmcr1, "%#-*llx"); - DUMP(p, stop_sprs.mmcr2, "%#-*llx"); - DUMP(p, stop_sprs.mmcra, "%#-*llx"); - DUMP(p, dont_stop.counter, "%#-*x"); + DUMP(p, idle_state, "%#-*lx"); + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + DUMP(p, thread_idle_state, "%#-*x"); + DUMP(p, subcore_sibling_mask, "%#-*x"); + } else { +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + DUMP(p, requested_psscr, "%#-*llx"); + DUMP(p, dont_stop.counter, "%#-*x"); +#endif + } #endif DUMP(p, accounting.utime, "%#-*lx"); -- cgit v1.2.3-59-g8ed1b From e9cef0189c5b217fcd4788562862defc27632a01 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 30 Apr 2019 14:28:17 +1000 Subject: powerpc/powernv/idle: Restore AMR/UAMOR/AMOR/IAMR after idle This is an implementation of commits 53a712bae5dd ("powerpc/powernv/idle: Restore AMR/UAMOR/AMOR after idle") and a3f3072db6ca ("powerpc/powernv/idle: Restore IAMR after idle") using the new C-based idle code. 
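In outline, the change saves the keyed-memory-protection SPRs before the idle instruction and restores them only when the wakeup reports state loss. A minimal sketch of that pattern in C (the struct and function names here are illustrative, not from the patch; the SPR constants, the mfspr/mtspr accessors, the isa206_idle_insn_mayloss() helper, and the SRR1 wake-state test are the ones used in the diff below):

struct key_sprs {	/* hypothetical container for the four SPRs */
	u64 amr, iamr, amor, uamor;
};

static unsigned long idle_with_key_sprs(unsigned long type)
{
	struct key_sprs s;
	unsigned long srr1;

	/* save the per-thread keyed-protection SPRs before going idle */
	s.amr   = mfspr(SPRN_AMR);
	s.iamr  = mfspr(SPRN_IAMR);
	s.amor  = mfspr(SPRN_AMOR);
	s.uamor = mfspr(SPRN_UAMOR);

	srr1 = isa206_idle_insn_mayloss(type);	/* go idle */

	/* restore only if the wakeup indicates state was lost */
	if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
		mtspr(SPRN_AMR,   s.amr);
		mtspr(SPRN_IAMR,  s.iamr);
		mtspr(SPRN_AMOR,  s.amor);
		mtspr(SPRN_UAMOR, s.uamor);
	}
	return srr1;
}

The POWER7/8 and POWER9 paths differ only in which additional SPRs join the save/restore set, which is why the diff adds the same four fields to both struct p7_sprs and struct p9_sprs.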
Signed-off-by: Nicholas Piggin [mpe: Extract from Nick's patch] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 52 +++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 87f5f4ae60ca..c9133f7908ca 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -296,7 +296,6 @@ struct p7_sprs { /* per subcore */ u64 sdr1; u64 rpr; - u64 amor; /* per thread */ u64 lpcr; @@ -306,6 +305,12 @@ struct p7_sprs { u64 spurr; u64 dscr; u64 wort; + + /* per thread SPRs that get lost in shallow states */ + u64 amr; + u64 iamr; + u64 amor; + u64 uamor; }; static unsigned long power7_idle_insn(unsigned long type) @@ -342,7 +347,6 @@ static unsigned long power7_idle_insn(unsigned long type) sprs.sdr1 = mfspr(SPRN_SDR1); sprs.rpr = mfspr(SPRN_RPR); - sprs.amor = mfspr(SPRN_AMOR); sprs.lpcr = mfspr(SPRN_LPCR); if (cpu_has_feature(CPU_FTR_ARCH_207S)) { @@ -374,6 +378,13 @@ static unsigned long power7_idle_insn(unsigned long type) atomic_unlock_thread_idle(); } + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + sprs.amr = mfspr(SPRN_AMR); + sprs.iamr = mfspr(SPRN_IAMR); + sprs.amor = mfspr(SPRN_AMOR); + sprs.uamor = mfspr(SPRN_UAMOR); + } + local_paca->thread_idle_state = type; srr1 = isa206_idle_insn_mayloss(type); /* go idle */ local_paca->thread_idle_state = PNV_THREAD_RUNNING; @@ -381,6 +392,19 @@ static unsigned long power7_idle_insn(unsigned long type) WARN_ON_ONCE(!srr1); WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { + /* + * We don't need an isync after the mtsprs here because + * the upcoming mtmsrd is execution synchronizing. + */ + mtspr(SPRN_AMR, sprs.amr); + mtspr(SPRN_IAMR, sprs.iamr); + mtspr(SPRN_AMOR, sprs.amor); + mtspr(SPRN_UAMOR, sprs.uamor); + } + } + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) hmi_exception_realmode(NULL); @@ -444,7 +468,6 @@ core_woken: /* Per-subcore SPRs */ mtspr(SPRN_SDR1, sprs.sdr1); mtspr(SPRN_RPR, sprs.rpr); - mtspr(SPRN_AMOR, sprs.amor); subcore_woken: /* @@ -560,7 +583,6 @@ struct p9_sprs { u64 rpr; u64 tscr; u64 ldbar; - u64 amor; /* per thread */ u64 lpcr; @@ -576,6 +598,12 @@ struct p9_sprs { u32 mmcr0; u32 mmcr1; u64 mmcr2; + + /* per thread SPRs that get lost in shallow states */ + u64 amr; + u64 iamr; + u64 amor; + u64 uamor; }; static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) @@ -652,13 +680,17 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) sprs.rpr = mfspr(SPRN_RPR); sprs.tscr = mfspr(SPRN_TSCR); sprs.ldbar = mfspr(SPRN_LDBAR); - sprs.amor = mfspr(SPRN_AMOR); sprs_saved = true; atomic_start_thread_idle(); } + sprs.amr = mfspr(SPRN_AMR); + sprs.iamr = mfspr(SPRN_IAMR); + sprs.amor = mfspr(SPRN_AMOR); + sprs.uamor = mfspr(SPRN_UAMOR); + srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -673,6 +705,15 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { unsigned long mmcra; + /* + * We don't need an isync after the mtsprs here because the + * upcoming mtmsrd is execution synchronizing. 
+ */ + mtspr(SPRN_AMR, sprs.amr); + mtspr(SPRN_IAMR, sprs.iamr); + mtspr(SPRN_AMOR, sprs.amor); + mtspr(SPRN_UAMOR, sprs.uamor); + /* * Workaround for POWER9 DD2.0, if we lost resources, the ERAT * might have been corrupted and needs flushing. We also need @@ -722,7 +763,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) mtspr(SPRN_RPR, sprs.rpr); mtspr(SPRN_TSCR, sprs.tscr); mtspr(SPRN_LDBAR, sprs.ldbar); - mtspr(SPRN_AMOR, sprs.amor); if (pls >= pnv_first_tb_loss_level) { /* TB loss */ -- cgit v1.2.3-59-g8ed1b From b511cdd1c12d1e450baeba5373dd3a8897396e2b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 10 Apr 2019 16:48:00 +1000 Subject: powerpc/powernv/ioda: Handle failures correctly in pnv_pci_ioda_iommu_bypass_supported() When the return value type was changed from int to bool, a few places were left unchanged; this fixes them. We did not hit these failures, as the first one is not happening at all, and the second one is a little more likely to happen if the user switches a 33..58bit DMA capable device between the VFIO and vendor drivers, and there are not so many of these. Fixes: 2d6ad41b2c21 ("powerpc/powernv: use the generic iommu bypass code") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3ead4c237ed0..9a9076a5686c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1836,7 +1836,7 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, struct pnv_ioda_pe *pe; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return -ENODEV; + return false; pe = &phb->ioda.pe_array[pdn->pe_number]; if (pe->tce_bypass_enabled) { @@ -1859,7 +1859,7 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, /* Configure the bypass mode */ s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe); if (rc) - return rc; + return false; /* 4GB offset bypasses 32-bit space */ pdev->dev.archdata.dma_offset = (1ULL << 32); return true; -- cgit v1.2.3-59-g8ed1b From 33dda8c32714c1a8f318450af4d1f9f123e2ed24 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 23 Apr 2019 14:11:14 -0700 Subject: powerpc/vdso: Drop unnecessary cc-ldoption Towards the goal of removing cc-ldoption, it seems that --hash-style= was added to binutils 2.17.50.0.2 in 2006. The minimum required version of binutils for the kernel according to Documentation/process/changes.rst is 2.20.
Suggested-by: Masahiro Yamada Signed-off-by: Nick Desaulniers Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vdso32/Makefile | 5 ++--- arch/powerpc/kernel/vdso64/Makefile | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index ce199f6e4256..06f54d947057 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -26,9 +26,8 @@ GCOV_PROFILE := n KCOV_INSTRUMENT := n UBSAN_SANITIZE := n -ccflags-y := -shared -fno-common -fno-builtin -ccflags-y += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) +ccflags-y := -shared -fno-common -fno-builtin -nostdlib \ + -Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both asflags-y := -D__VDSO32__ -s obj-y += vdso32_wrapper.o diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index 28e7d112aa2f..32ebb3522ea1 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -12,9 +12,8 @@ GCOV_PROFILE := n KCOV_INSTRUMENT := n UBSAN_SANITIZE := n -ccflags-y := -shared -fno-common -fno-builtin -ccflags-y += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) +ccflags-y := -shared -fno-common -fno-builtin -nostdlib \ + -Wl,-soname=linux-vdso64.so.1 -Wl,--hash-style=both asflags-y := -D__VDSO64__ -s obj-y += vdso64_wrapper.o -- cgit v1.2.3-59-g8ed1b From 7e8039795a80bdf1418964b9cabef6168bc5d9a4 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Tue, 30 Apr 2019 11:09:23 +1000 Subject: powerpc/cacheinfo: Fix kobject memleak Currently an error return from kobject_init_and_add() is not followed by a call to kobject_put(). This means there is a memory leak. Add a call to kobject_put() in the error path of kobject_init_and_add(). Signed-off-by: Tobin C. Harding Reviewed-by: Greg Kroah-Hartman Reviewed-by: Tyrel Datwyler Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/cacheinfo.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 53102764fd2f..f2ed3ef4b129 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -759,23 +759,22 @@ static void cacheinfo_create_index_dir(struct cache *cache, int index, index_dir = kzalloc(sizeof(*index_dir), GFP_KERNEL); if (!index_dir) - goto err; + return; index_dir->cache = cache; rc = kobject_init_and_add(&index_dir->kobj, &cache_index_type, cache_dir->kobj, "index%d", index); - if (rc) - goto err; + if (rc) { + kobject_put(&index_dir->kobj); + kfree(index_dir); + return; + } index_dir->next = cache_dir->index; cache_dir->index = index_dir; cacheinfo_create_index_opt_attrs(index_dir); - - return; -err: - kfree(index_dir); } static void cacheinfo_sysfs_populate(unsigned int cpu_id, -- cgit v1.2.3-59-g8ed1b From a5ae043de7678f189303559782f6057078459a41 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 13 Mar 2019 21:00:30 +0100 Subject: powerpc/64s: Remove 'dummy_copy_buffer' In commit 2bf1071a8d50 ("powerpc/64s: Remove POWER9 DD1 support") usage of 'dummy_copy_buffer' was removed from the function __switch_to(). Since it is not used anywhere else, remove it completely.
This removes the following warning: arch/powerpc/kernel/process.c:1156:17: error: 'dummy_copy_buffer' defined but not used Suggested-by: Christophe Leroy Signed-off-by: Mathieu Malaterre Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 64e1494d3a1d..0c2017357073 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1152,11 +1152,6 @@ static inline void restore_sprs(struct thread_struct *old_thread, thread_pkey_regs_restore(new_thread, old_thread); } -#ifdef CONFIG_PPC_BOOK3S_64 -#define CP_SIZE 128 -static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE))); -#endif - struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { -- cgit v1.2.3-59-g8ed1b From 5b2a15296210d3b70e06d0f09a8e701ff74ccbe8 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 4 Oct 2018 16:23:37 +1000 Subject: powerpc: Add doorbell tracepoints When analysing sources of OS jitter, I noticed that doorbells cannot be traced. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/trace.h | 16 ++++++++++++++++ arch/powerpc/kernel/dbell.c | 3 +++ 2 files changed, 19 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index 58ef8c43a89d..08cd60cd70b7 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -54,6 +54,22 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit, TP_ARGS(regs) ); +#ifdef CONFIG_PPC_DOORBELL +DEFINE_EVENT(ppc64_interrupt_class, doorbell_entry, + + TP_PROTO(struct pt_regs *regs), + + TP_ARGS(regs) +); + +DEFINE_EVENT(ppc64_interrupt_class, doorbell_exit, + + TP_PROTO(struct pt_regs *regs), + + TP_ARGS(regs) +); +#endif + #ifdef CONFIG_PPC_PSERIES extern int hcall_tracepoint_regfunc(void); extern void hcall_tracepoint_unregfunc(void); diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index b6fe883b1016..5ec3b3835925 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_SMP @@ -81,6 +82,7 @@ void doorbell_exception(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); irq_enter(); + trace_doorbell_entry(regs); ppc_msgsync(); @@ -91,6 +93,7 @@ void doorbell_exception(struct pt_regs *regs) smp_ipi_demux_relaxed(); /* already performed the barrier */ + trace_doorbell_exit(regs); irq_exit(); set_irq_regs(old_regs); } -- cgit v1.2.3-59-g8ed1b From d6e8a150850601277039a548ffcdddd1bfe3e365 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 29 Apr 2019 23:45:48 +0530 Subject: powerpc/powernv/mce: Reduce MCE console logs to fewer lines. Also add the cpu number while displaying the MCE log. This will help keep logs cleaner when an MCE hits on multiple cpus simultaneously.
Before the changes the MCE output was: Severe Machine check interrupt [Recovered] NIP [d00000000ba80280]: insert_slb_entry.constprop.0+0x278/0x2c0 [mcetest_slb] Initiator: CPU Error type: SLB [Multihit] Effective address: d00000000ba80280 After this patch series changes the MCE output will be: MCE: CPU80: machine check (Warning) Host SLB Multihit [Recovered] MCE: CPU80: NIP: [d00000000b550280] insert_slb_entry.constprop.0+0x278/0x2c0 [mcetest_slb] MCE: CPU80: Probable software error (some chance of hardware cause) UE in host application: MCE: CPU48: machine check (Severe) Host UE Load/Store DAR: 00007fffc6079a80 paddr: 0000000f8e260000 [Not recovered] MCE: CPU48: PID: 4584 Comm: find NIP: [0000000010023368] MCE: CPU48: Hardware error and for MCE in Guest: MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered] MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60] MCE: CPU80: Probable software error (some chance of hardware cause) Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/kernel/mce.c | 89 +++++++++++++++++++++++------------------- 2 files changed, 49 insertions(+), 42 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index ad47fa865324..c888ef9a3eaf 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -116,7 +116,7 @@ struct machine_check_event { enum MCE_Initiator initiator:8; /* 0x03 */ enum MCE_ErrorType error_type:8; /* 0x04 */ enum MCE_Disposition disposition:8; /* 0x05 */ - uint8_t reserved_1[2]; /* 0x06 */ + uint16_t cpu; /* 0x06 */ uint64_t gpr3; /* 0x08 */ uint64_t srr0; /* 0x10 */ uint64_t srr1; /* 0x18 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index b5fec1f9751a..25a8b20cbbdc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -112,6 +112,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->srr1 = regs->msr; mce->gpr3 = regs->gpr[3]; mce->in_use = 1; + mce->cpu = get_paca()->paca_index; /* Mark it recovered if we have handled it and MSR(RI=1). */ if (handled && (regs->msr & MSR_RI)) @@ -310,7 +311,11 @@ static void machine_check_process_queued_event(struct irq_work *work) void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest) { - const char *level, *sevstr, *subtype; + const char *level, *sevstr, *subtype, *err_type; + uint64_t ea = 0, pa = 0; + int n = 0; + char dar_str[50]; + char pa_str[50]; static const char *mc_ue_types[] = { "Indeterminate", "Instruction fetch", @@ -384,101 +389,103 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, - evt->disposition == MCE_DISPOSITION_RECOVERED ? - "Recovered" : "Not recovered"); - - if (in_guest) { - printk("%s Guest NIP: %016llx\n", level, evt->srr0); - } else if (user_mode) { - printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level, - evt->srr0, current->pid, current->comm); - } else { - printk("%s NIP [%016llx]: %pS\n", level, evt->srr0, - (void *)evt->srr0); - } - - printk("%s Initiator: %s\n", level, - evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); switch (evt->error_type) { case MCE_ERROR_TYPE_UE: + err_type = "UE"; subtype = evt->u.ue_error.ue_error_type < ARRAY_SIZE(mc_ue_types) ? 
mc_ue_types[evt->u.ue_error.ue_error_type] : "Unknown"; - printk("%s Error type: UE [%s]\n", level, subtype); if (evt->u.ue_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.ue_error.effective_address); + ea = evt->u.ue_error.effective_address; if (evt->u.ue_error.physical_address_provided) - printk("%s Physical address: %016llx\n", - level, evt->u.ue_error.physical_address); + pa = evt->u.ue_error.physical_address; break; case MCE_ERROR_TYPE_SLB: + err_type = "SLB"; subtype = evt->u.slb_error.slb_error_type < ARRAY_SIZE(mc_slb_types) ? mc_slb_types[evt->u.slb_error.slb_error_type] : "Unknown"; - printk("%s Error type: SLB [%s]\n", level, subtype); if (evt->u.slb_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.slb_error.effective_address); + ea = evt->u.slb_error.effective_address; break; case MCE_ERROR_TYPE_ERAT: + err_type = "ERAT"; subtype = evt->u.erat_error.erat_error_type < ARRAY_SIZE(mc_erat_types) ? mc_erat_types[evt->u.erat_error.erat_error_type] : "Unknown"; - printk("%s Error type: ERAT [%s]\n", level, subtype); if (evt->u.erat_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.erat_error.effective_address); + ea = evt->u.erat_error.effective_address; break; case MCE_ERROR_TYPE_TLB: + err_type = "TLB"; subtype = evt->u.tlb_error.tlb_error_type < ARRAY_SIZE(mc_tlb_types) ? mc_tlb_types[evt->u.tlb_error.tlb_error_type] : "Unknown"; - printk("%s Error type: TLB [%s]\n", level, subtype); if (evt->u.tlb_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.tlb_error.effective_address); + ea = evt->u.tlb_error.effective_address; break; case MCE_ERROR_TYPE_USER: + err_type = "User"; subtype = evt->u.user_error.user_error_type < ARRAY_SIZE(mc_user_types) ? mc_user_types[evt->u.user_error.user_error_type] : "Unknown"; - printk("%s Error type: User [%s]\n", level, subtype); if (evt->u.user_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.user_error.effective_address); + ea = evt->u.user_error.effective_address; break; case MCE_ERROR_TYPE_RA: + err_type = "Real address"; subtype = evt->u.ra_error.ra_error_type < ARRAY_SIZE(mc_ra_types) ? mc_ra_types[evt->u.ra_error.ra_error_type] : "Unknown"; - printk("%s Error type: Real address [%s]\n", level, subtype); if (evt->u.ra_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.ra_error.effective_address); + ea = evt->u.ra_error.effective_address; break; case MCE_ERROR_TYPE_LINK: + err_type = "Link"; subtype = evt->u.link_error.link_error_type < ARRAY_SIZE(mc_link_types) ? mc_link_types[evt->u.link_error.link_error_type] : "Unknown"; - printk("%s Error type: Link [%s]\n", level, subtype); if (evt->u.link_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.link_error.effective_address); + ea = evt->u.link_error.effective_address; break; default: case MCE_ERROR_TYPE_UNKNOWN: - printk("%s Error type: Unknown\n", level); + err_type = "Unknown"; + subtype = ""; break; } + + dar_str[0] = pa_str[0] = '\0'; + if (ea && evt->srr0 != ea) { + /* Load/Store address */ + n = sprintf(dar_str, "DAR: %016llx ", ea); + if (pa) + sprintf(dar_str + n, "paddr: %016llx ", pa); + } else if (pa) { + sprintf(pa_str, " paddr: %016llx", pa); + } + + printk("%sMCE: CPU%d: machine check (%s) %s %s %s %s[%s]\n", + level, evt->cpu, sevstr, in_guest ? 
"Guest" : "Host", + err_type, subtype, dar_str, + evt->disposition == MCE_DISPOSITION_RECOVERED ? + "Recovered" : "Not recovered"); + + if (in_guest || user_mode) { + printk("%sMCE: CPU%d: PID: %d Comm: %s %sNIP: [%016llx]%s\n", + level, evt->cpu, current->pid, current->comm, + in_guest ? "Guest " : "", evt->srr0, pa_str); + } else { + printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n", + level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str); + } } EXPORT_SYMBOL_GPL(machine_check_print_event_info); -- cgit v1.2.3-59-g8ed1b From cda6618d060b5e8afc93e691d4bcd987f3dd4393 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 29 Apr 2019 23:45:55 +0530 Subject: powerpc/powernv/mce: Print correct severity for MCE error. Currently all machine check errors are printed as severe errors which isn't correct. Print soft errors as warning instead of severe errors. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 86 ++++++++++---------- arch/powerpc/kernel/mce.c | 5 +- arch/powerpc/kernel/mce_power.c | 144 ++++++++++++++++++---------------- arch/powerpc/platforms/powernv/opal.c | 2 +- 4 files changed, 123 insertions(+), 114 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index c888ef9a3eaf..d6dc75f9e9bb 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -31,7 +31,7 @@ enum MCE_Version { enum MCE_Severity { MCE_SEV_NO_ERROR = 0, MCE_SEV_WARNING = 1, - MCE_SEV_ERROR_SYNC = 2, + MCE_SEV_SEVERE = 2, MCE_SEV_FATAL = 3, }; @@ -110,73 +110,74 @@ enum MCE_LinkErrorType { }; struct machine_check_event { - enum MCE_Version version:8; /* 0x00 */ - uint8_t in_use; /* 0x01 */ - enum MCE_Severity severity:8; /* 0x02 */ - enum MCE_Initiator initiator:8; /* 0x03 */ - enum MCE_ErrorType error_type:8; /* 0x04 */ - enum MCE_Disposition disposition:8; /* 0x05 */ - uint16_t cpu; /* 0x06 */ - uint64_t gpr3; /* 0x08 */ - uint64_t srr0; /* 0x10 */ - uint64_t srr1; /* 0x18 */ - union { /* 0x20 */ + enum MCE_Version version:8; + u8 in_use; + enum MCE_Severity severity:8; + enum MCE_Initiator initiator:8; + enum MCE_ErrorType error_type:8; + enum MCE_Disposition disposition:8; + bool sync_error; + u16 cpu; + u64 gpr3; + u64 srr0; + u64 srr1; + union { struct { enum MCE_UeErrorType ue_error_type:8; - uint8_t effective_address_provided; - uint8_t physical_address_provided; - uint8_t reserved_1[5]; - uint64_t effective_address; - uint64_t physical_address; - uint8_t reserved_2[8]; + u8 effective_address_provided; + u8 physical_address_provided; + u8 reserved_1[5]; + u64 effective_address; + u64 physical_address; + u8 reserved_2[8]; } ue_error; struct { enum MCE_SlbErrorType slb_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } slb_error; struct { enum MCE_EratErrorType erat_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } erat_error; struct { enum MCE_TlbErrorType tlb_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } tlb_error; struct { enum 
MCE_UserErrorType user_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } user_error; struct { enum MCE_RaErrorType ra_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } ra_error; struct { enum MCE_LinkErrorType link_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } link_error; } u; }; @@ -194,6 +195,7 @@ struct mce_error_info { } u; enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; + bool sync_error; }; #define MAX_MC_EVT 100 diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 25a8b20cbbdc..71d245a387ab 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -122,6 +122,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->initiator = mce_err->initiator; mce->severity = mce_err->severity; + mce->sync_error = mce_err->sync_error; /* * Populate the mce error_type and type-specific error_type. @@ -376,9 +377,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; case MCE_SEV_WARNING: level = KERN_WARNING; - sevstr = ""; + sevstr = "Warning"; break; - case MCE_SEV_ERROR_SYNC: + case MCE_SEV_SEVERE: level = KERN_ERR; sevstr = "Severe"; break; diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 367fbfa2e835..6647a31b85b2 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -133,106 +133,107 @@ struct mce_ierror_table { unsigned int error_subtype; unsigned int initiator; unsigned int severity; + bool sync_error; }; static const struct mce_ierror_table mce_p7_ierror_table[] = { { 0x00000000001c0000, 0x0000000000040000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x00000000000c0000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000140000, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x00000000001c0000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, 0, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, 0, 0, 0, 0, 0, 0 } }; static const struct mce_ierror_table mce_p8_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, 
MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000000c0000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000140000, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000001c0000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008040000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, 0, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, 0, 0, 0, 0, 0, 0 } }; static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000000c0000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000140000, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000001c0000, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008040000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000080c0000, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + 
MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008100000, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008140000, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, - MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */ + MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ { 0x00000000081c0000, 0x0000000008180000, false, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */ + MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ { 0x00000000081c0000, 0x00000000081c0000, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, 0, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, 0, 0, 0, 0, 0, 0 } }; struct mce_derror_table { unsigned long dsisr_value; @@ -241,103 +242,104 @@ struct mce_derror_table { unsigned int error_subtype; unsigned int initiator; unsigned int severity; + bool sync_error; }; static const struct mce_derror_table mce_p7_derror_table[] = { { 0x00008000, false, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000040, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, false, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0, false, 0, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p8_derror_table[] = { { 0x00008000, false, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00002000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00001000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000200, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + 
MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, false, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, false, 0, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p9_derror_table[] = { { 0x00008000, false, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00002000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00001000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000200, false, MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000040, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000020, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000010, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000008, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, false, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, false, 0, 0, 0, 0, 0 } }; static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, uint64_t *phys_addr) @@ -427,11 +429,12 @@ static int mce_handle_ierror(struct pt_regs *regs, mce_err->u.link_error_type = table[i].error_subtype; break; } + mce_err->sync_error = table[i].sync_error; mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; if (table[i].nip_valid) { *addr = regs->nip; - if (mce_err->severity == MCE_SEV_ERROR_SYNC && + if (mce_err->sync_error && table[i].error_type == MCE_ERROR_TYPE_UE) { unsigned long pfn; @@ -448,8 +451,9 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; - mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->severity = MCE_SEV_SEVERE; mce_err->initiator = MCE_INITIATOR_CPU; + mce_err->sync_error = true; return 0; } @@ -519,11 +523,12 @@ static int mce_handle_derror(struct pt_regs 
*regs, mce_err->u.link_error_type = table[i].error_subtype; break; } + mce_err->sync_error = table[i].sync_error; mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; if (table[i].dar_valid) *addr = regs->dar; - else if (mce_err->severity == MCE_SEV_ERROR_SYNC && + else if (mce_err->sync_error && table[i].error_type == MCE_ERROR_TYPE_UE) { /* * We do a maximum of 4 nested MCE calls, see @@ -539,8 +544,9 @@ static int mce_handle_derror(struct pt_regs *regs, return handled; mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; - mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->severity = MCE_SEV_SEVERE; mce_err->initiator = MCE_INITIATOR_CPU; + mce_err->sync_error = true; return 0; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 2b0eca104f86..737c51d63480 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -505,7 +505,7 @@ static int opal_recover_mce(struct pt_regs *regs, recovered = 0; } - if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) { + if (!recovered && evt->sync_error) { /* * Try to kill processes if we get a synchronous machine check * (e.g., one caused by execution of this instruction). This -- cgit v1.2.3-59-g8ed1b From 50dbabe06a6e1c35980154ea1fac2ed6ad28652b Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 29 Apr 2019 23:46:02 +0530 Subject: powerpc/powernv/mce: Print additional information about MCE error. Print more information about an MCE error: whether it is a hardware or a software error. Some MCE errors can be easily categorized as hardware or software errors, e.g. UEs are due to hardware errors, whereas an error triggered by invalid usage of tlbie is a pure software bug. But not all MCE errors can be easily categorized as either software or hardware. There are errors like multihit errors, which are usually the result of a software bug, but in some rare cases a hardware failure can cause a multihit error. In the past, we have seen cases where, after replacing a faulty chip, multihit errors stopped occurring. The same holds for parity errors, which are usually due to faulty hardware, but there is a chance that a multihit can also cause a parity error. For such errors it is difficult to determine what really caused them. Hence this patch classifies MCE errors into the following four categories (summarized in the sketch after this list): 1. Hardware error: UE and link timeout failure errors. 2. Probable hardware error (some chance of software cause): SLB/ERAT/TLB parity errors. 3. Software error: invalid tlbie form. 4. Probable software error (some chance of hardware cause): SLB/ERAT/TLB multihit errors.
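The classification rule that the tables below encode can be condensed roughly as follows (a sketch only; the classify() helper is illustrative and not in the patch, the enum values are the ones this patch introduces, and RA errors, whose class varies by subtype, are omitted):

static enum MCE_ErrorClass classify(unsigned int type, unsigned int subtype)
{
	switch (type) {
	case MCE_ERROR_TYPE_UE:		/* 1. hardware: UEs */
	case MCE_ERROR_TYPE_LINK:	/* 1. hardware: link timeouts */
		return MCE_ECLASS_HARDWARE;
	case MCE_ERROR_TYPE_USER:	/* 3. software: invalid tlbie form */
		return MCE_ECLASS_SOFTWARE;
	case MCE_ERROR_TYPE_SLB:	/* 2. parity vs. 4. multihit */
		return subtype == MCE_SLB_ERROR_PARITY ?
			MCE_ECLASS_HARD_INDETERMINATE :
			MCE_ECLASS_SOFT_INDETERMINATE;
	case MCE_ERROR_TYPE_ERAT:	/* 4. multihit */
	case MCE_ERROR_TYPE_TLB:
		return MCE_ECLASS_SOFT_INDETERMINATE;
	default:			/* RA and unknown errors not covered here */
		return MCE_ECLASS_UNKNOWN;
	}
}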
Sample output: MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered] MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60] MCE: CPU80: Probable Software error (some chance of hardware cause) Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 10 ++++ arch/powerpc/kernel/mce.c | 12 +++++ arch/powerpc/kernel/mce_power.c | 107 ++++++++++++++++++++++++---------------- 3 files changed, 86 insertions(+), 43 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index d6dc75f9e9bb..23247a132ce8 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -56,6 +56,14 @@ enum MCE_ErrorType { MCE_ERROR_TYPE_LINK = 7, }; +enum MCE_ErrorClass { + MCE_ECLASS_UNKNOWN = 0, + MCE_ECLASS_HARDWARE, + MCE_ECLASS_HARD_INDETERMINATE, + MCE_ECLASS_SOFTWARE, + MCE_ECLASS_SOFT_INDETERMINATE, +}; + enum MCE_UeErrorType { MCE_UE_ERROR_INDETERMINATE = 0, MCE_UE_ERROR_IFETCH = 1, @@ -115,6 +123,7 @@ struct machine_check_event { enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; enum MCE_ErrorType error_type:8; + enum MCE_ErrorClass error_class:8; enum MCE_Disposition disposition:8; bool sync_error; u16 cpu; @@ -195,6 +204,7 @@ struct mce_error_info { } u; enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; + enum MCE_ErrorClass error_class:8; bool sync_error; }; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 71d245a387ab..4581377cfc98 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -123,6 +123,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->initiator = mce_err->initiator; mce->severity = mce_err->severity; mce->sync_error = mce_err->sync_error; + mce->error_class = mce_err->error_class; /* * Populate the mce error_type and type-specific error_type. @@ -363,6 +364,13 @@ void machine_check_print_event_info(struct machine_check_event *evt, "Store (timeout)", "Page table walk Load/Store (timeout)", }; + static const char *mc_error_class[] = { + "Unknown", + "Hardware error", + "Probable Hardware error (some chance of software cause)", + "Software error", + "Probable Software error (some chance of hardware cause)", + }; /* Print things out */ if (evt->version != MCE_V1) { @@ -487,6 +495,10 @@ void machine_check_print_event_info(struct machine_check_event *evt, printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n", level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str); } + + subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ? 
+ mc_error_class[evt->error_class] : "Unknown"; + printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); } EXPORT_SYMBOL_GPL(machine_check_print_event_info); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 6647a31b85b2..b5e876efe864 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -131,6 +131,7 @@ struct mce_ierror_table { bool nip_valid; /* nip is a valid indicator of faulting address */ unsigned int error_type; unsigned int error_subtype; + unsigned int error_class; unsigned int initiator; unsigned int severity; bool sync_error; @@ -138,99 +139,103 @@ struct mce_ierror_table { static const struct mce_ierror_table mce_p7_ierror_table[] = { { 0x00000000001c0000, 0x0000000000040000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x0000000000080000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ + MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000140000, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000180000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x00000000001c0000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, 0, 0, 0, 0, 0, 0 } }; static const struct mce_ierror_table mce_p8_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000000080000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000100000, true, - MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000140000, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000001c0000, true, - MCE_ERROR_TYPE_UE, 
MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008000000, true, - MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008040000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, 0, 0, 0, 0, 0, 0 } }; static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000000080000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000100000, true, - MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000140000, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000180000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000001c0000, true, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, MCE_ECLASS_SOFTWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008000000, true, - MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008040000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000080c0000, true, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, MCE_ECLASS_SOFTWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008100000, true, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_SOFTWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008140000, false, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ { 0x00000000081c0000, 0x0000000008180000, false, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT, MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ -{ 0x00000000081c0000, 0x00000000081c0000, true, +{ 0x00000000081c0000, 0x00000000081c0000, true, MCE_ECLASS_HARDWARE, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, 0, 0, 0, 
0, 0, 0 } }; @@ -240,6 +245,7 @@ struct mce_derror_table { bool dar_valid; /* dar is a valid indicator of faulting address */ unsigned int error_type; unsigned int error_subtype; + unsigned int error_class; unsigned int initiator; unsigned int severity; bool sync_error; @@ -247,97 +253,108 @@ struct mce_derror_table { static const struct mce_derror_table mce_p7_derror_table[] = { { 0x00008000, false, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, - MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000040, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ + MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0, false, 0, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p8_derror_table[] = { { 0x00008000, false, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00002000, true, - MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00001000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, - MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000200, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */ + MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, false, 0, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p9_derror_table[] = { { 0x00008000, false, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 
0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00002000, true, - MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00001000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, - MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000200, false, - MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, + MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, MCE_ECLASS_SOFTWARE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000040, true, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000020, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000010, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000008, false, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, false, 0, 0, 0, 0, 0 } }; @@ -406,6 +423,7 @@ static int mce_handle_ierror(struct pt_regs *regs, /* now fill in mce_error_info */ mce_err->error_type = table[i].error_type; + mce_err->error_class = table[i].error_class; switch (table[i].error_type) { case MCE_ERROR_TYPE_UE: mce_err->u.ue_error_type = table[i].error_subtype; @@ -451,6 +469,7 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err->error_class = MCE_ECLASS_UNKNOWN; mce_err->severity = MCE_SEV_SEVERE; mce_err->initiator = MCE_INITIATOR_CPU; mce_err->sync_error = true; @@ -500,6 +519,7 @@ static int mce_handle_derror(struct pt_regs *regs, /* now fill in mce_error_info */ mce_err->error_type = table[i].error_type; + mce_err->error_class = table[i].error_class; switch (table[i].error_type) { case MCE_ERROR_TYPE_UE: mce_err->u.ue_error_type = table[i].error_subtype; @@ -544,6 +564,7 @@ static int mce_handle_derror(struct pt_regs *regs, return handled; mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err->error_class = MCE_ECLASS_UNKNOWN; mce_err->severity = MCE_SEV_SEVERE; mce_err->initiator = MCE_INITIATOR_CPU; mce_err->sync_error = true; -- cgit v1.2.3-59-g8ed1b From 2c474c03505677cfd987d52e8bf42abe8c270529 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 30 Apr 2019 13:29:07 +0530 Subject: powerpc/mm/radix: Fix kernel crash when running subpage protect test This patch fixes the below crash by making sure we touch the subpage protection related 
structures only if we know they are allocated on the platform. With radix translation we don't allocate a hash context at all, and trying to access subpage_prot_table results in: Faulting instruction address: 0xc00000000008bdb4 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV .... NIP [c00000000008bdb4] sys_subpage_prot+0x74/0x590 LR [c00000000000b688] system_call+0x5c/0x70 Call Trace: [c00020002c6b7d30] [c00020002c6b7d90] 0xc00020002c6b7d90 (unreliable) [c00020002c6b7e20] [c00000000000b688] system_call+0x5c/0x70 Instruction dump: fb61ffd8 fb81ffe0 fba1ffe8 fbc1fff0 fbe1fff8 f821ff11 e92d1178 f9210068 39200000 e92d0968 ebe90630 e93f03e8 60000000 3860fffe e9410068 We also move the subpage_prot_table lookup under mmap_sem, to avoid races between two parallel subpage_prot syscalls. Fixes: 701101865f5d ("powerpc/mm: Reduce memory usage for mm_context_t for radix") Reported-by: Sachin Sant Signed-off-by: Aneesh Kumar K.V Tested-by: Sachin Sant Signed-off-by: Michael Ellerman --- arch/powerpc/mm/subpage-prot.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index c9dff4e1f295..473dd430e306 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -90,16 +90,18 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; unsigned long next, limit; + down_write(&mm->mmap_sem); + + spt = mm_ctx_subpage_prot(&mm->context); if (!spt) - return ; + goto err_out; - down_write(&mm->mmap_sem); limit = addr + len; if (limit > spt->maxaddr) limit = spt->maxaddr; @@ -127,6 +129,8 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) /* now flush any existing HPTEs for the range */ hpte_flush_range(mm, addr, nw); } + +err_out: up_write(&mm->mmap_sem); } @@ -189,7 +193,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; @@ -219,6 +223,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, down_write(&mm->mmap_sem); + spt = mm_ctx_subpage_prot(&mm->context); if (!spt) { /* * Allocate subpage prot table if not already done. -- cgit v1.2.3-59-g8ed1b From e620d45065c7b5b8d6ae11217c09c09380103b83 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 16 Jan 2019 14:47:44 -0200 Subject: powerpc/tm: Avoid machine crash on rt_sigreturn() There is a kernel crash that happens if rt_sigreturn() is called inside a transactional block. This crash happens if the kernel hits an in-kernel page fault when accessing userspace memory, usually through copy_ckvsx_to_user(). A major page fault calls the might_sleep() function, which can cause a task reschedule. A task reschedule (switch_to()) reclaims and recheckpoints the TM state, but in the signal return path the checkpointed memory has already been reclaimed, so the exception stack holds an MSR with MSR[TS]=0. When the code returns from might_sleep() after a task reschedule, the task resumes with its memory recheckpointed and CPU MSR[TS] = suspended.
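To make the window concrete, here is a simplified sketch of the failing sequence (illustrative pseudocode only, not the exact kernel code; tm_reclaim_current(), copy_ckvsx_to_user(), MSR_TM_SUSPENDED() and mfmsr() are the real names, everything else is condensed):

	/* rt_sigreturn(), entered inside a transaction */
	if (MSR_TM_SUSPENDED(mfmsr()))
		tm_reclaim_current(0);		/* CPU MSR[TS] is now 0 */

	/* regs->msr still has MSR[TS] != 0 at this point */
	err = copy_ckvsx_to_user(...);		/* may fault and might_sleep() */

	/*
	 * If the fault reschedules, switch_to() recheckpoints the task:
	 * the CPU comes back with MSR[TS] = suspended, while the exception
	 * stack still holds MSR[TS] = 0, the value RFID will use.
	 */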
This means that might_sleep() has a side effect when it is called with CPU MSR[TS] = 0 while the task has regs->msr[TS] != 0. This side effect can cause a TM Bad Thing: at the exception entrance the stack saves MSR[TS]=0, and this is what will be used at RFID, but the processor now has MSR[TS] = Suspended, so the transition is invalid and a TM Bad Thing is raised, causing the following crash: Unexpected TM Bad Thing exception at c00000000000e9ec (msr 0x8000000302a03031) tm_scratch=800000010280b033 cpu 0xc: Vector: 700 (Program Check) at [c00000003ff1fd70] pc: c00000000000e9ec: fast_exception_return+0x100/0x1bc lr: c000000000032948: handle_rt_signal64+0xb8/0xaf0 sp: c0000004263ebc40 msr: 8000000302a03031 current = 0xc000000415050300 paca = 0xc00000003ffc4080 irqmask: 0x03 irq_happened: 0x01 pid = 25006, comm = sigfuz Linux version 5.0.0-rc1-00001-g3bd6e94bec12 (breno@debian) (gcc version 8.2.0 (Debian 8.2.0-3)) #899 SMP Mon Jan 7 11:30:07 EST 2019 WARNING: exception is not recoverable, can't continue enter ? for help [c0000004263ebc40] c000000000032948 handle_rt_signal64+0xb8/0xaf0 (unreliable) [c0000004263ebd30] c000000000022780 do_notify_resume+0x2f0/0x430 [c0000004263ebe20] c00000000000e844 ret_from_except_lite+0x70/0x74 --- Exception: c00 (System Call) at 00007fffbaac400c SP (7fffeca90f40) is in userspace The solution for this problem is to run the sigreturn code with regs->msr[TS] disabled, thus avoiding the side effect above. This does not seem to be a problem, since regs->msr will be replaced by the ucontext value anyway, so it is already being flushed; in this case, it is simply flushed earlier. Signed-off-by: Breno Leitao Acked-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/signal_64.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 6794466f6420..06c299ef6132 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -565,7 +565,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, preempt_disable(); /* pull in MSR TS bits from user context */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + regs->msr |= msr & MSR_TS_MASK; /* * Ensure that TM is enabled in regs->msr before we leave the signal @@ -745,6 +745,31 @@ SYSCALL_DEFINE0(rt_sigreturn) if (MSR_TM_SUSPENDED(mfmsr())) tm_reclaim_current(0); + /* + * Disable MSR[TS] bit also, so, if there is an exception in the + * code below (as a page fault in copy_ckvsx_to_user()), it does + * not recheckpoint this task if there was a context switch inside + * the exception. + * + * A major page fault can indirectly call schedule(). A reschedule + * process in the middle of an exception can have a side effect + * (Changing the CPU MSR[TS] state), since schedule() is called + * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended + * (switch_to() calls tm_recheckpoint() for the 'new' process). In + * this case, the process continues to be the same in the CPU, but + * the CPU state just changed. + * + * This can cause a TM Bad Thing, since the MSR in the stack will + * have the MSR[TS]=0, and this is what will be used to RFID. + * + * Clearing MSR[TS] state here will avoid a recheckpoint if there + * is any process reschedule in kernel space.
The MSR[TS] state + * does not need to be saved also, since it will be replaced with + * the MSR[TS] that came from user context later, at + * restore_tm_sigcontexts. + */ + regs->msr &= ~MSR_TS_MASK; + if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) goto badframe; if (MSR_TM_ACTIVE(msr)) { -- cgit v1.2.3-59-g8ed1b From 711aef1bbf88212a21f7103e88f397b47a528805 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Sat, 27 Apr 2019 16:28:26 +0800 Subject: bpf, x32: Fix bug for BPF_JMP | {BPF_JSGT, BPF_JSLE, BPF_JSLT, BPF_JSGE} The current method to compare 64-bit numbers for a conditional jump is: 1) Compare the high 32 bits first. 2) If the high 32 bits differ, go to step 4. 3) Compare the low 32 bits. 4) Check the desired condition. This method is right for unsigned comparisons, but it is buggy for signed comparisons, because it performs a signed comparison on the low 32 bits too. A 64-bit number has only one sign bit, the MSB of the 64-bit value, so it is wrong to treat the low 32 bits as a signed number and do a signed comparison on them. This patch fixes the bug and adds a testcase in selftests/bpf for it. Signed-off-by: Wang YanQing Signed-off-by: Daniel Borkmann --- arch/x86/net/bpf_jit_comp32.c | 217 ++++++++++++++++++++++------- tools/testing/selftests/bpf/verifier/jit.c | 19 +++ 2 files changed, 185 insertions(+), 51 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 0d9cdffce6ac..8097b88d744f 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -117,6 +117,8 @@ static bool is_simm32(s64 value) #define IA32_JLE 0x7E #define IA32_JG 0x7F +#define COND_JMP_OPCODE_INVALID (0xFF) + /* * Map eBPF registers to IA32 32bit registers or stack scratch space.
* @@ -1613,6 +1615,75 @@ static inline void emit_push_r64(const u8 src[], u8 **pprog) *pprog = prog; } +static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo) +{ + u8 jmp_cond; + + /* Convert BPF opcode to x86 */ + switch (op) { + case BPF_JEQ: + jmp_cond = IA32_JE; + break; + case BPF_JSET: + case BPF_JNE: + jmp_cond = IA32_JNE; + break; + case BPF_JGT: + /* GT is unsigned '>', JA in x86 */ + jmp_cond = IA32_JA; + break; + case BPF_JLT: + /* LT is unsigned '<', JB in x86 */ + jmp_cond = IA32_JB; + break; + case BPF_JGE: + /* GE is unsigned '>=', JAE in x86 */ + jmp_cond = IA32_JAE; + break; + case BPF_JLE: + /* LE is unsigned '<=', JBE in x86 */ + jmp_cond = IA32_JBE; + break; + case BPF_JSGT: + if (!is_cmp_lo) + /* Signed '>', GT in x86 */ + jmp_cond = IA32_JG; + else + /* GT is unsigned '>', JA in x86 */ + jmp_cond = IA32_JA; + break; + case BPF_JSLT: + if (!is_cmp_lo) + /* Signed '<', LT in x86 */ + jmp_cond = IA32_JL; + else + /* LT is unsigned '<', JB in x86 */ + jmp_cond = IA32_JB; + break; + case BPF_JSGE: + if (!is_cmp_lo) + /* Signed '>=', GE in x86 */ + jmp_cond = IA32_JGE; + else + /* GE is unsigned '>=', JAE in x86 */ + jmp_cond = IA32_JAE; + break; + case BPF_JSLE: + if (!is_cmp_lo) + /* Signed '<=', LE in x86 */ + jmp_cond = IA32_JLE; + else + /* LE is unsigned '<=', JBE in x86 */ + jmp_cond = IA32_JBE; + break; + default: /* to silence GCC warning */ + jmp_cond = COND_JMP_OPCODE_INVALID; + break; + } + + return jmp_cond; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -2069,10 +2140,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JLE | BPF_X: - case BPF_JMP | BPF_JSGT | BPF_X: - case BPF_JMP | BPF_JSLE | BPF_X: - case BPF_JMP | BPF_JSLT | BPF_X: - case BPF_JMP | BPF_JSGE | BPF_X: case BPF_JMP32 | BPF_JEQ | BPF_X: case BPF_JMP32 | BPF_JNE | BPF_X: case BPF_JMP32 | BPF_JGT | BPF_X: @@ -2118,6 +2185,40 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo)); goto emit_cond_jmp; } + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: { + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + u8 sreg_lo = sstk ? IA32_ECX : src_lo; + u8 sreg_hi = sstk ? 
IA32_EBX : src_hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, + add_2reg(0x40, IA32_EBP, + IA32_EDX), + STACK_VAR(dst_hi)); + } + + if (sstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), + STACK_VAR(src_lo)); + EMIT3(0x8B, + add_2reg(0x40, IA32_EBP, + IA32_EBX), + STACK_VAR(src_hi)); + } + + /* cmp dreg_hi,sreg_hi */ + EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi)); + EMIT2(IA32_JNE, 10); + /* cmp dreg_lo,sreg_lo */ + EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo)); + goto emit_cond_jmp_signed; + } case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP32 | BPF_JSET | BPF_X: { bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP; @@ -2194,10 +2295,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JLE | BPF_K: - case BPF_JMP | BPF_JSGT | BPF_K: - case BPF_JMP | BPF_JSLE | BPF_K: - case BPF_JMP | BPF_JSLT | BPF_K: - case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JEQ | BPF_K: case BPF_JMP32 | BPF_JNE | BPF_K: case BPF_JMP32 | BPF_JGT | BPF_K: @@ -2238,50 +2335,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* cmp dreg_lo,sreg_lo */ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo)); -emit_cond_jmp: /* Convert BPF opcode to x86 */ - switch (BPF_OP(code)) { - case BPF_JEQ: - jmp_cond = IA32_JE; - break; - case BPF_JSET: - case BPF_JNE: - jmp_cond = IA32_JNE; - break; - case BPF_JGT: - /* GT is unsigned '>', JA in x86 */ - jmp_cond = IA32_JA; - break; - case BPF_JLT: - /* LT is unsigned '<', JB in x86 */ - jmp_cond = IA32_JB; - break; - case BPF_JGE: - /* GE is unsigned '>=', JAE in x86 */ - jmp_cond = IA32_JAE; - break; - case BPF_JLE: - /* LE is unsigned '<=', JBE in x86 */ - jmp_cond = IA32_JBE; - break; - case BPF_JSGT: - /* Signed '>', GT in x86 */ - jmp_cond = IA32_JG; - break; - case BPF_JSLT: - /* Signed '<', LT in x86 */ - jmp_cond = IA32_JL; - break; - case BPF_JSGE: - /* Signed '>=', GE in x86 */ - jmp_cond = IA32_JGE; - break; - case BPF_JSLE: - /* Signed '<=', LE in x86 */ - jmp_cond = IA32_JLE; - break; - default: /* to silence GCC warning */ +emit_cond_jmp: jmp_cond = get_cond_jmp_opcode(BPF_OP(code), false); + if (jmp_cond == COND_JMP_OPCODE_INVALID) return -EFAULT; - } jmp_offset = addrs[i + insn->off] - addrs[i]; if (is_imm8(jmp_offset)) { EMIT2(jmp_cond, jmp_offset); @@ -2291,7 +2347,66 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ pr_err("cond_jmp gen bug %llx\n", jmp_offset); return -EFAULT; } + break; + } + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: { + u8 dreg_lo = dstk ? IA32_EAX : dst_lo; + u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + u8 sreg_lo = IA32_ECX; + u8 sreg_hi = IA32_EBX; + u32 hi; + + if (dstk) { + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), + STACK_VAR(dst_lo)); + EMIT3(0x8B, + add_2reg(0x40, IA32_EBP, + IA32_EDX), + STACK_VAR(dst_hi)); + } + + /* mov ecx,imm32 */ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32); + hi = imm32 & (1 << 31) ? (u32)~0 : 0; + /* mov ebx,imm32 */ + EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi); + /* cmp dreg_hi,sreg_hi */ + EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi)); + EMIT2(IA32_JNE, 10); + /* cmp dreg_lo,sreg_lo */ + EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo)); + + /* + * For simplicity of branch offset computation, + * let's use fixed jump coding here. 
+ */ +emit_cond_jmp_signed: /* Check the condition for low 32-bit comparison */ + jmp_cond = get_cond_jmp_opcode(BPF_OP(code), true); + if (jmp_cond == COND_JMP_OPCODE_INVALID) + return -EFAULT; + jmp_offset = addrs[i + insn->off] - addrs[i] + 8; + if (is_simm32(jmp_offset)) { + EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); + } else { + pr_err("cond_jmp gen bug %llx\n", jmp_offset); + return -EFAULT; + } + EMIT2(0xEB, 6); + /* Check the condition for high 32-bit comparison */ + jmp_cond = get_cond_jmp_opcode(BPF_OP(code), false); + if (jmp_cond == COND_JMP_OPCODE_INVALID) + return -EFAULT; + jmp_offset = addrs[i + insn->off] - addrs[i]; + if (is_simm32(jmp_offset)) { + EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); + } else { + pr_err("cond_jmp gen bug %llx\n", jmp_offset); + return -EFAULT; + } break; } case BPF_JMP | BPF_JA: diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c index be488b4495a3..c33adf344fae 100644 --- a/tools/testing/selftests/bpf/verifier/jit.c +++ b/tools/testing/selftests/bpf/verifier/jit.c @@ -86,3 +86,22 @@ .result = ACCEPT, .retval = 2, }, +{ + "jit: jsgt, jslt", + .insns = { + BPF_LD_IMM64(BPF_REG_1, 0x80000000ULL), + BPF_LD_IMM64(BPF_REG_2, 0x0ULL), + BPF_JMP_REG(BPF_JSGT, BPF_REG_1, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + + BPF_JMP_REG(BPF_JSLT, BPF_REG_2, BPF_REG_1, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, +}, -- cgit v1.2.3-59-g8ed1b From b9aa0b35d878dff9ed19f94101fe353a4de00cc4 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Sun, 28 Apr 2019 10:33:02 +0800 Subject: bpf, x32: Fix bug for BPF_ALU64 | BPF_NEG The current implementation has two errors: 1: The second xor instruction clears the carry flag, which the following sbb instruction depends on. 2: The operand encoding chosen for the sbb instruction is wrong: it encodes "sbb dreg_hi,ecx", but what we need is "sbb ecx,dreg_hi". This patch rewrites the implementation and fixes the errors. This patch fixes the errors below, reported by bpf/test_verifier on the x32 platform when the jit is enabled: " 0: (b4) w1 = 4 1: (b4) w2 = 4 2: (1f) r2 -= r1 3: (4f) r2 |= r1 4: (87) r2 = -r2 5: (c7) r2 s>>= 63 6: (5f) r1 &= r2 7: (bf) r0 = r1 8: (95) exit processed 9 insns (limit 131072), stack depth 0 0: (b4) w1 = 4 1: (b4) w2 = 4 2: (1f) r2 -= r1 3: (4f) r2 |= r1 4: (87) r2 = -r2 5: (c7) r2 s>>= 63 6: (5f) r1 &= r2 7: (bf) r0 = r1 8: (95) exit processed 9 insns (limit 131072), stack depth 0 ......
Summary: 1189 PASSED, 125 SKIPPED, 15 FAILED " Signed-off-by: Wang YanQing Signed-off-by: Daniel Borkmann --- arch/x86/net/bpf_jit_comp32.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 8097b88d744f..b29e82f190c7 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -700,19 +700,12 @@ static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog) STACK_VAR(dst_hi)); } - /* xor ecx,ecx */ - EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - /* sub dreg_lo,ecx */ - EMIT2(0x2B, add_2reg(0xC0, dreg_lo, IA32_ECX)); - /* mov dreg_lo,ecx */ - EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX)); - - /* xor ecx,ecx */ - EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX)); - /* sbb dreg_hi,ecx */ - EMIT2(0x19, add_2reg(0xC0, dreg_hi, IA32_ECX)); - /* mov dreg_hi,ecx */ - EMIT2(0x89, add_2reg(0xC0, dreg_hi, IA32_ECX)); + /* neg dreg_lo */ + EMIT2(0xF7, add_1reg(0xD8, dreg_lo)); + /* adc dreg_hi,0x0 */ + EMIT3(0x83, add_1reg(0xD0, dreg_hi), 0x00); + /* neg dreg_hi */ + EMIT2(0xF7, add_1reg(0xD8, dreg_hi)); if (dstk) { /* mov dword ptr [ebp+off],dreg_lo */ -- cgit v1.2.3-59-g8ed1b From 1dfbf334f12361ebe6269c5918328b755ee960c7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 20 Apr 2019 13:05:59 +0200 Subject: spi: ep93xx: Convert to use CS GPIO descriptors This converts the EP93xx SPI master driver to use GPIO descriptors for chip select handling. EP93xx was using platform data to pass in GPIO lines; by converting all board files to use GPIO descriptor tables, the core will look up the GPIO lines from the SPI device in the same manner as for device tree. Signed-off-by: Linus Walleij Signed-off-by: Mark Brown --- arch/arm/mach-ep93xx/edb93xx.c | 13 +++++++++---- arch/arm/mach-ep93xx/simone.c | 11 +++++++---- arch/arm/mach-ep93xx/ts72xx.c | 25 +++++++++++++++++-------- arch/arm/mach-ep93xx/vision_ep9307.c | 15 +++++++++------ drivers/spi/spi-ep93xx.c | 32 ++++++-------------------------- include/linux/platform_data/spi-ep93xx.h | 4 ---- 6 files changed, 48 insertions(+), 52 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-ep93xx/edb93xx.c b/arch/arm/mach-ep93xx/edb93xx.c index 8e89ec8b6f0f..34e18e9556d9 100644 --- a/arch/arm/mach-ep93xx/edb93xx.c +++ b/arch/arm/mach-ep93xx/edb93xx.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -105,13 +106,16 @@ static struct spi_board_info edb93xx_spi_board_info[] __initdata = { }, }; -static int edb93xx_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_EGPIO6, +static struct gpiod_lookup_table edb93xx_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + GPIO_LOOKUP("A", 6, "cs", GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info edb93xx_spi_info __initdata = { - .chipselect = edb93xx_spi_chipselects, - .num_chipselect = ARRAY_SIZE(edb93xx_spi_chipselects), + /* Intentionally left blank */ }; static void __init edb93xx_register_spi(void) @@ -123,6 +127,7 @@ static void __init edb93xx_register_spi(void) else if (machine_is_edb9315a()) edb93xx_cs4271_data.gpio_nreset = EP93XX_GPIO_LINE_EGPIO14; + gpiod_add_lookup_table(&edb93xx_spi_cs_gpio_table); ep93xx_register_spi(&edb93xx_spi_info, edb93xx_spi_board_info, ARRAY_SIZE(edb93xx_spi_board_info)); } diff --git a/arch/arm/mach-ep93xx/simone.c b/arch/arm/mach-ep93xx/simone.c index 80ccb984d521..f0f38c0dba52 100644 --- a/arch/arm/mach-ep93xx/simone.c +++ b/arch/arm/mach-ep93xx/simone.c @@ -77,13 +77,15 @@
static struct spi_board_info simone_spi_devices[] __initdata = { * low between multi-message command blocks. From v1.4, it uses a GPIO instead. * v1.3 parts will still work, since the signal on SFRMOUT is automatic. */ -static int simone_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_EGPIO1, +static struct gpiod_lookup_table simone_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + GPIO_LOOKUP("A", 1, "cs", GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info simone_spi_info __initdata = { - .chipselect = simone_spi_chipselects, - .num_chipselect = ARRAY_SIZE(simone_spi_chipselects), .use_dma = 1, }; @@ -113,6 +115,7 @@ static void __init simone_init_machine(void) ep93xx_register_i2c(simone_i2c_board_info, ARRAY_SIZE(simone_i2c_board_info)); gpiod_add_lookup_table(&simone_mmc_spi_gpio_table); + gpiod_add_lookup_table(&simone_spi_cs_gpio_table); ep93xx_register_spi(&simone_spi_info, simone_spi_devices, ARRAY_SIZE(simone_spi_devices)); simone_register_audio(); diff --git a/arch/arm/mach-ep93xx/ts72xx.c b/arch/arm/mach-ep93xx/ts72xx.c index 85b74ac943f0..a3a20c83c6b8 100644 --- a/arch/arm/mach-ep93xx/ts72xx.c +++ b/arch/arm/mach-ep93xx/ts72xx.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -269,13 +270,15 @@ static struct spi_board_info bk3_spi_board_info[] __initdata = { * The all work is performed automatically by !SPI_FRAME (SFRM1) and * goes through CPLD */ -static int bk3_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_F(3), +static struct gpiod_lookup_table bk3_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + GPIO_LOOKUP("F", 3, "cs", GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info bk3_spi_master __initdata = { - .chipselect = bk3_spi_chipselects, - .num_chipselect = ARRAY_SIZE(bk3_spi_chipselects), .use_dma = 1, }; @@ -316,13 +319,17 @@ static struct spi_board_info ts72xx_spi_devices[] __initdata = { }, }; -static int ts72xx_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_F(2), /* DIO_17 */ +static struct gpiod_lookup_table ts72xx_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + /* DIO_17 */ + GPIO_LOOKUP("F", 2, "cs", GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info ts72xx_spi_info __initdata = { - .chipselect = ts72xx_spi_chipselects, - .num_chipselect = ARRAY_SIZE(ts72xx_spi_chipselects), + /* Intentionally left blank */ }; static void __init ts72xx_init_machine(void) @@ -339,6 +346,7 @@ static void __init ts72xx_init_machine(void) if (board_is_ts7300()) platform_device_register(&ts73xx_fpga_device); #endif + gpiod_add_lookup_table(&ts72xx_spi_cs_gpio_table); ep93xx_register_spi(&ts72xx_spi_info, ts72xx_spi_devices, ARRAY_SIZE(ts72xx_spi_devices)); } @@ -398,6 +406,7 @@ static void __init bk3_init_machine(void) ep93xx_register_eth(&ts72xx_eth_data, 1); + gpiod_add_lookup_table(&bk3_spi_cs_gpio_table); ep93xx_register_spi(&bk3_spi_master, bk3_spi_board_info, ARRAY_SIZE(bk3_spi_board_info)); diff --git a/arch/arm/mach-ep93xx/vision_ep9307.c b/arch/arm/mach-ep93xx/vision_ep9307.c index 767ee64628dc..f95a644769e4 100644 --- a/arch/arm/mach-ep93xx/vision_ep9307.c +++ b/arch/arm/mach-ep93xx/vision_ep9307.c @@ -245,15 +245,17 @@ static struct spi_board_info vision_spi_board_info[] __initdata = { }, }; -static int vision_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_EGPIO6, - EP93XX_GPIO_LINE_EGPIO7, - EP93XX_GPIO_LINE_G(2), +static struct gpiod_lookup_table vision_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + GPIO_LOOKUP_IDX("A", 6, "cs", 
0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("A", 7, "cs", 1, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("G", 2, "cs", 2, GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info vision_spi_master __initdata = { - .chipselect = vision_spi_chipselects, - .num_chipselect = ARRAY_SIZE(vision_spi_chipselects), .use_dma = 1, }; @@ -295,6 +297,7 @@ static void __init vision_init_machine(void) ep93xx_register_i2c(vision_i2c_info, ARRAY_SIZE(vision_i2c_info)); gpiod_add_lookup_table(&vision_spi_mmc_gpio_table); + gpiod_add_lookup_table(&vision_spi_cs_gpio_table); ep93xx_register_spi(&vision_spi_master, vision_spi_board_info, ARRAY_SIZE(vision_spi_board_info)); vision_register_i2s(); diff --git a/drivers/spi/spi-ep93xx.c b/drivers/spi/spi-ep93xx.c index 79fc3940245a..47e39251bad9 100644 --- a/drivers/spi/spi-ep93xx.c +++ b/drivers/spi/spi-ep93xx.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -676,6 +675,7 @@ static int ep93xx_spi_probe(struct platform_device *pdev) if (!master) return -ENOMEM; + master->use_gpio_descriptors = true; master->prepare_transfer_hardware = ep93xx_spi_prepare_hardware; master->unprepare_transfer_hardware = ep93xx_spi_unprepare_hardware; master->prepare_message = ep93xx_spi_prepare_message; @@ -683,31 +683,11 @@ static int ep93xx_spi_probe(struct platform_device *pdev) master->bus_num = pdev->id; master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; master->bits_per_word_mask = SPI_BPW_RANGE_MASK(4, 16); - - master->num_chipselect = info->num_chipselect; - master->cs_gpios = devm_kcalloc(&master->dev, - master->num_chipselect, sizeof(int), - GFP_KERNEL); - if (!master->cs_gpios) { - error = -ENOMEM; - goto fail_release_master; - } - - for (i = 0; i < master->num_chipselect; i++) { - master->cs_gpios[i] = info->chipselect[i]; - - if (!gpio_is_valid(master->cs_gpios[i])) - continue; - - error = devm_gpio_request_one(&pdev->dev, master->cs_gpios[i], - GPIOF_OUT_INIT_HIGH, - "ep93xx-spi"); - if (error) { - dev_err(&pdev->dev, "could not request cs gpio %d\n", - master->cs_gpios[i]); - goto fail_release_master; - } - } + /* + * The SPI core will count the number of GPIO descriptors to figure + * out the number of chip selects available on the platform. 
+ */ + master->num_chipselect = 0; platform_set_drvdata(pdev, master); diff --git a/include/linux/platform_data/spi-ep93xx.h b/include/linux/platform_data/spi-ep93xx.h index eb16c6739ac2..b439f2a896e0 100644 --- a/include/linux/platform_data/spi-ep93xx.h +++ b/include/linux/platform_data/spi-ep93xx.h @@ -6,13 +6,9 @@ struct spi_device; /** * struct ep93xx_spi_info - EP93xx specific SPI descriptor - * @chipselect: array of gpio numbers to use as chip selects - * @num_chipselect: ARRAY_SIZE(chipselect) * @use_dma: use DMA for the transfers */ struct ep93xx_spi_info { - int *chipselect; - int num_chipselect; bool use_dma; }; -- cgit v1.2.3-59-g8ed1b From 6d0e0d0bb8eb673472dd08d83c8169452c50f269 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Apr 2019 14:55:32 -0400 Subject: spufs: switch to ->free_inode() Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index db329d4bf1c3..c1a75216050a 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -71,17 +71,11 @@ spufs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void spufs_i_callback(struct rcu_head *head) +static void spufs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(spufs_inode_cache, SPUFS_I(inode)); } -static void spufs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, spufs_i_callback); -} - static void spufs_init_once(void *p) { @@ -739,7 +733,7 @@ spufs_fill_super(struct super_block *sb, void *data, int silent) struct spufs_sb_info *info; static const struct super_operations s_ops = { .alloc_inode = spufs_alloc_inode, - .destroy_inode = spufs_destroy_inode, + .free_inode = spufs_free_inode, .statfs = simple_statfs, .evict_inode = spufs_evict_inode, .show_options = spufs_show_options, -- cgit v1.2.3-59-g8ed1b From a1ac2a9c4f98482e49305ab5551b7b32f9cac39b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Mar 2019 13:03:45 +0000 Subject: powerpc/book3e: drop BUG_ON() in map_kernel_page() early_alloc_pgtable() never returns NULL as it panics on failure. This patch drops the three BUG_ON() checks that verify the value returned by early_alloc_pgtable() is not NULL.
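The reasoning generalizes to any allocator that panics rather than returning NULL. A minimal standalone sketch of the idiom (hypothetical names, plain userspace C rather than the kernel code) showing why such caller-side checks are dead code:

	#include <stdio.h>
	#include <stdlib.h>

	/* Allocator that never returns NULL: it aborts on failure,
	 * mirroring early_alloc_pgtable()'s panic-on-failure behaviour. */
	static void *alloc_or_panic(size_t size)
	{
		void *p = calloc(1, size);

		if (!p) {
			fprintf(stderr, "panic: out of memory\n");
			abort();	/* never returns */
		}
		return p;
	}

	int main(void)
	{
		long *table = alloc_or_panic(64 * sizeof(*table));

		/* A BUG_ON(table == NULL) here could never fire, which is
		 * exactly why the three checks can be dropped. */
		table[0] = 1;
		free(table);
		return 0;
	}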
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-book3e.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index 1032ef7aaf62..390a6d0b216d 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -98,20 +98,17 @@ int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) #ifndef __PAGETABLE_PUD_FOLDED if (pgd_none(*pgdp)) { pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); pgd_populate(&init_mm, pgdp, pudp); } #endif /* !__PAGETABLE_PUD_FOLDED */ pudp = pud_offset(pgdp, ea); if (pud_none(*pudp)) { pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); if (!pmd_present(*pmdp)) { ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); -- cgit v1.2.3-59-g8ed1b From 71faf8145cdc20f22aa398eb7b206b33826cf2bd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Mar 2019 13:19:47 +0000 Subject: powerpc/nohash64: clean pgtable.h TRANSPARENT_HUGEPAGE is only supported by book3s. VMEMMAP_REGION_ID is never used. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/64/pgtable.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 0384a3302fb6..c8e6a9a5bc33 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -23,11 +23,7 @@ PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1) -#else #define PMD_CACHE_INDEX PMD_INDEX_SIZE -#endif #define PUD_CACHE_INDEX PUD_INDEX_SIZE /* @@ -73,7 +69,6 @@ #define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) #define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) -#define VMEMMAP_REGION_ID (0xfUL) /* Server only */ #define USER_REGION_ID (0UL) /* -- cgit v1.2.3-59-g8ed1b From 9d9f2cccde952126185e3336af0d4dc62eb254ad Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 09:59:59 +0000 Subject: powerpc/mm: change #include "mmu_decl.h" to <mm/mmu_decl.h> This patch makes inclusion of mmu_decl.h independent of the location of the file including it.
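For context, the difference between the two forms, shown with a hypothetical layout and compiler invocation (not the kernel's actual build flags): a quoted include is searched first relative to the directory of the including file, while an angle-bracket include is resolved only against the -I search path, so it keeps working from any subdirectory:

	arch/powerpc/mm/mmu_decl.h
	arch/powerpc/mm/init_64.c        /* #include "mmu_decl.h" resolves */
	arch/powerpc/mm/book3s64/slb.c   /* #include "mmu_decl.h" would fail,
	                                  * #include <mm/mmu_decl.h> still
	                                  * resolves via -I arch/powerpc */

	$ cc -I arch/powerpc -c arch/powerpc/mm/book3s64/slb.c

This is also what allows the following commit to move files into arch/powerpc/mm/book3s64/ without touching their mmu_decl.h includes.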
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/40x_mmu.c | 2 +- arch/powerpc/mm/44x_mmu.c | 2 +- arch/powerpc/mm/8xx_mmu.c | 2 +- arch/powerpc/mm/dma-noncoherent.c | 2 +- arch/powerpc/mm/fsl_booke_mmu.c | 2 +- arch/powerpc/mm/init_32.c | 2 +- arch/powerpc/mm/init_64.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/powerpc/mm/mmu_context_nohash.c | 2 +- arch/powerpc/mm/pgtable-book3e.c | 2 +- arch/powerpc/mm/pgtable-book3s64.c | 2 +- arch/powerpc/mm/pgtable-hash64.c | 2 +- arch/powerpc/mm/pgtable_32.c | 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/mm/ppc_mmu_32.c | 2 +- arch/powerpc/mm/tlb_hash32.c | 2 +- arch/powerpc/mm/tlb_nohash.c | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index b9cf6f8764b0..460459b6f53e 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -49,7 +49,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> extern int __map_without_ltlbs; /* diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c index aad127acdbaa..c07983ebc02e 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/44x_mmu.c @@ -31,7 +31,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* Used by the 44x TLB replacement exception handler. * Just needed it declared someplace. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 87648b58d295..70d55b615b62 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -17,7 +17,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index b5d2658c26af..2f6154b76328 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -36,7 +36,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * This address range defaults to a value that is safe for all diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 210cbc1faf63..71a1a36751dd 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -54,7 +54,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> unsigned int tlbcam_index; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 80cc97cd8878..3eb4cb09749c 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -47,7 +47,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) /* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a4c155af1597..45b02fa11cd8 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -66,7 +66,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> phys_addr_t memstart_addr = ~0; EXPORT_SYMBOL_GPL(memstart_addr); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e12bec98366f..105c58f8900a 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -54,7 +54,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifndef CPU_FTR_COHERENT_ICACHE #define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */ diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 1945c5f19f5e..ae4505d5b4b8 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -52,7 +52,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * The MPC8xx has only 16 contexts. We rotate through them on each task switch. diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index 390a6d0b216d..f296c2e88b09 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -15,7 +15,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifdef CONFIG_SPARSEMEM_VMEMMAP /* diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index a4341aba0af4..16bda049187a 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -17,7 +17,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #include unsigned long __pmd_frag_nr; diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 097a3b3538b1..1fd025dba4a3 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -19,7 +19,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #define CREATE_TRACE_POINTS #include diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6e56a6240bfa..c9cdbb84d31f 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -36,7 +36,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> unsigned long ioremap_bot; EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 95ad2a09501c..95ed76519411 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -52,7 +52,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index bf1de3ca39bc..1db55159031c 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -34,7 +34,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> struct hash_pte *Hash, *Hash_end; unsigned long Hash_size, Hash_mask; diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index cf8472cf3d59..8d56f0417f87 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -32,7 +32,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * Called when unmapping pages to flush entries from the TLB/hash table. diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 088e0a6b5ade..704e613a0b14 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -46,7 +46,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * This struct lists the sw-supported page sizes.
The hardawre MMU may support -- cgit v1.2.3-59-g8ed1b From 47d99948eee48a84a4b242c17915a4ff59a29b5d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 10:00:00 +0000 Subject: powerpc/mm: Move book3s64 specifics in subdirectory mm/book3s64 Many files in arch/powerpc/mm are only for book3S64. This patch creates a subdirectory for them. Signed-off-by: Christophe Leroy [mpe: Update the selftest sym links, shorten new filenames, cleanup some whitespace and formatting in the new files.] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 25 +- arch/powerpc/mm/book3s64/Makefile | 24 + arch/powerpc/mm/book3s64/hash_4k.c | 124 ++ arch/powerpc/mm/book3s64/hash_64k.c | 333 +++++ arch/powerpc/mm/book3s64/hash_hugepage.c | 191 +++ arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 152 ++ arch/powerpc/mm/book3s64/hash_native.c | 884 ++++++++++++ arch/powerpc/mm/book3s64/hash_pgtable.c | 463 ++++++ arch/powerpc/mm/book3s64/hash_tlb.c | 265 ++++ arch/powerpc/mm/book3s64/hash_utils.c | 1946 ++++++++++++++++++++++++++ arch/powerpc/mm/book3s64/iommu_api.c | 482 +++++++ arch/powerpc/mm/book3s64/mmu_context.c | 263 ++++ arch/powerpc/mm/book3s64/pgtable.c | 449 ++++++ arch/powerpc/mm/book3s64/pkeys.c | 428 ++++++ arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 110 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 1124 +++++++++++++++ arch/powerpc/mm/book3s64/radix_tlb.c | 1101 +++++++++++++++ arch/powerpc/mm/book3s64/slb.c | 833 +++++++++++ arch/powerpc/mm/book3s64/subpage_prot.c | 289 ++++ arch/powerpc/mm/book3s64/vphn.c | 73 + arch/powerpc/mm/book3s64/vphn.h | 16 + arch/powerpc/mm/hash64_4k.c | 124 -- arch/powerpc/mm/hash64_64k.c | 333 ----- arch/powerpc/mm/hash_native_64.c | 884 ------------ arch/powerpc/mm/hash_utils_64.c | 1930 ------------------------- arch/powerpc/mm/hugepage-hash64.c | 191 --- arch/powerpc/mm/hugetlbpage-hash64.c | 147 -- arch/powerpc/mm/hugetlbpage-radix.c | 110 -- arch/powerpc/mm/mmu_context_book3s64.c | 263 ---- arch/powerpc/mm/mmu_context_iommu.c | 482 ------- arch/powerpc/mm/numa.c | 2 +- arch/powerpc/mm/pgtable-book3s64.c | 449 ------ arch/powerpc/mm/pgtable-hash64.c | 463 ------ arch/powerpc/mm/pgtable-radix.c | 1124 --------------- arch/powerpc/mm/pkeys.c | 428 ------ arch/powerpc/mm/slb.c | 832 ----------- arch/powerpc/mm/subpage-prot.c | 289 ---- arch/powerpc/mm/tlb-radix.c | 1101 --------------- arch/powerpc/mm/tlb_hash64.c | 259 ---- arch/powerpc/mm/vphn.c | 71 - arch/powerpc/mm/vphn.h | 17 - tools/testing/selftests/powerpc/vphn/vphn.c | 2 +- tools/testing/selftests/powerpc/vphn/vphn.h | 2 +- 43 files changed, 9556 insertions(+), 9522 deletions(-) create mode 100644 arch/powerpc/mm/book3s64/Makefile create mode 100644 arch/powerpc/mm/book3s64/hash_4k.c create mode 100644 arch/powerpc/mm/book3s64/hash_64k.c create mode 100644 arch/powerpc/mm/book3s64/hash_hugepage.c create mode 100644 arch/powerpc/mm/book3s64/hash_hugetlbpage.c create mode 100644 arch/powerpc/mm/book3s64/hash_native.c create mode 100644 arch/powerpc/mm/book3s64/hash_pgtable.c create mode 100644 arch/powerpc/mm/book3s64/hash_tlb.c create mode 100644 arch/powerpc/mm/book3s64/hash_utils.c create mode 100644 arch/powerpc/mm/book3s64/iommu_api.c create mode 100644 arch/powerpc/mm/book3s64/mmu_context.c create mode 100644 arch/powerpc/mm/book3s64/pgtable.c create mode 100644 arch/powerpc/mm/book3s64/pkeys.c create mode 100644 arch/powerpc/mm/book3s64/radix_hugetlbpage.c create mode 100644 arch/powerpc/mm/book3s64/radix_pgtable.c create mode 100644 arch/powerpc/mm/book3s64/radix_tlb.c create 
mode 100644 arch/powerpc/mm/book3s64/slb.c create mode 100644 arch/powerpc/mm/book3s64/subpage_prot.c create mode 100644 arch/powerpc/mm/book3s64/vphn.c create mode 100644 arch/powerpc/mm/book3s64/vphn.h delete mode 100644 arch/powerpc/mm/hash64_4k.c delete mode 100644 arch/powerpc/mm/hash64_64k.c delete mode 100644 arch/powerpc/mm/hash_native_64.c delete mode 100644 arch/powerpc/mm/hash_utils_64.c delete mode 100644 arch/powerpc/mm/hugepage-hash64.c delete mode 100644 arch/powerpc/mm/hugetlbpage-hash64.c delete mode 100644 arch/powerpc/mm/hugetlbpage-radix.c delete mode 100644 arch/powerpc/mm/mmu_context_book3s64.c delete mode 100644 arch/powerpc/mm/mmu_context_iommu.c delete mode 100644 arch/powerpc/mm/pgtable-book3s64.c delete mode 100644 arch/powerpc/mm/pgtable-hash64.c delete mode 100644 arch/powerpc/mm/pgtable-radix.c delete mode 100644 arch/powerpc/mm/pkeys.c delete mode 100644 arch/powerpc/mm/slb.c delete mode 100644 arch/powerpc/mm/subpage-prot.c delete mode 100644 arch/powerpc/mm/tlb-radix.c delete mode 100644 arch/powerpc/mm/tlb_hash64.c delete mode 100644 arch/powerpc/mm/vphn.c delete mode 100644 arch/powerpc/mm/vphn.h (limited to 'arch') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3c1bd9fa23cd..a137fdf775e2 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,53 +5,34 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o -hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o \ - $(hash64-y) mmu_context_book3s64.o \ - pgtable-book3s64.o pgtable-frag.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_BOOK3S_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o -obj-$(CONFIG_PPC_BOOK3S) += tlb_hash$(BITS).o -ifdef CONFIG_PPC_BOOK3S_64 -obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o -obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o -endif +obj-$(CONFIG_PPC_BOOK3S_32) += tlb_hash32.o obj-$(CONFIG_40x) += 40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o -obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o -obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ -obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o # Disable kcov instrumentation on sensitive code # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb_nohash.o := n KCOV_INSTRUMENT_fsl_booke_mmu.o := n - -# Instrumenting the SLB fault path can lead to duplicate SLB entries -KCOV_INSTRUMENT_slb.o := n 
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile new file mode 100644 index 000000000000..974b4fc19f4f --- /dev/null +++ b/arch/powerpc/mm/book3s64/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := $(NO_MINIMAL_TOC) + +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) + +obj-y += hash_pgtable.o hash_utils.o slb.o \ + mmu_context.o pgtable.o hash_tlb.o +obj-$(CONFIG_PPC_NATIVE) += hash_native.o +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o +obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o +obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o +endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o +obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o + +# Instrumenting the SLB fault path can lead to duplicate SLB entries +KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c new file mode 100644 index 000000000000..22e787123cdf --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -0,0 +1,124 @@ +/* + * Copyright IBM Corporation, 2015 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include + +int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, int subpg_prot) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned long rflags, pa; + unsigned long old_pte, new_pte; + unsigned long vpn, hash, slot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. Since this is 4K insert of 64K page size + * also add H_PAGE_COMBO + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* + * PP bits. 
_PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + rflags = htab_convert_pte_flags(new_pte); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + vpn = hpt_vpn(ea, vsid, ssize); + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + /* + * There MIGHT be an HPTE for this pte + */ + unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize, + rpte, 0); + + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K, + MMU_PAGE_4K, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + hash = hpt_hash(vpn, shift, ssize); + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_4K, + MMU_PAGE_4K, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_4K, MMU_PAGE_4K, old_pte); + return -1; + } + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c new file mode 100644 index 000000000000..7084ce2951e6 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -0,0 +1,333 @@ +/* + * Copyright IBM Corporation, 2015 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include + +/* + * Return true, if the entry has a slot value which + * the software considers as invalid. 
+ */ +static inline bool hpte_soft_invalid(unsigned long hidx) +{ + return ((hidx & 0xfUL) == 0xfUL); +} + +/* + * index from 0 - 15 + */ +bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) +{ + return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index))); +} + +int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, int subpg_prot) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned int subpg_index; + unsigned long rflags, pa; + unsigned long old_pte, new_pte, subpg_pte; + unsigned long vpn, hash, slot, gslot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. Since this is 4K insert of 64K page size + * also add H_PAGE_COMBO + */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* + * Handle the subpage protection bits + */ + subpg_pte = new_pte & ~subpg_prot; + rflags = htab_convert_pte_flags(subpg_pte); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } + + subpg_index = (ea & (PAGE_SIZE - 1)) >> shift; + vpn = hpt_vpn(ea, vsid, ssize); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + /* + *None of the sub 4k page is hashed + */ + if (!(old_pte & H_PAGE_HASHPTE)) + goto htab_insert_hpte; + /* + * Check if the pte was already inserted into the hash table + * as a 64k HW page, and invalidate the 64k HPTE if so. + */ + if (!(old_pte & H_PAGE_COMBO)) { + flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); + /* + * clear the old slot details from the old and new pte. + * On hash insert failure we use old pte value and we don't + * want slot information there if we have a insert failure. + */ + old_pte &= ~H_PAGE_HASHPTE; + new_pte &= ~H_PAGE_HASHPTE; + goto htab_insert_hpte; + } + /* + * Check for sub page valid and update + */ + if (__rpte_sub_valid(rpte, subpg_index)) { + int ret; + + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, + subpg_index); + ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, flags); + + /* + * If we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + if (ret == -1) + goto htab_insert_hpte; + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; + } + +htab_insert_hpte: + + /* + * Initialize all hidx entries to invalid value, the first time + * the PTE is about to allocate a 4K HPTE. + */ + if (!(old_pte & H_PAGE_COMBO)) + rpte.hidx = INVALID_RPTE_HIDX; + + /* + * handle H_PAGE_4K_PFN case + */ + if (old_pte & H_PAGE_4K_PFN) { + /* + * All the sub 4k page have the same + * physical address. 
+ */ + pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT; + } else { + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + pa += (subpg_index << shift); + } + hash = hpt_hash(vpn, shift, ssize); +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_4K, MMU_PAGE_4K, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + bool soft_invalid; + + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize); + + soft_invalid = hpte_soft_invalid(slot); + if (unlikely(soft_invalid)) { + /* + * We got a valid slot from a hardware point of view. + * but we cannot use it, because we use this special + * value; as defined by hpte_soft_invalid(), to track + * invalid slots. We cannot use it. So invalidate it. + */ + gslot = slot & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn, + MMU_PAGE_4K, MMU_PAGE_4K, + ssize, 0); + } + + if (unlikely(slot == -1 || soft_invalid)) { + /* + * For soft invalid slot, let's ensure that we release a + * slot from the primary, with the hope that we will + * acquire that slot next time we try. This will ensure + * that we do not get the same soft-invalid slot. + */ + if (soft_invalid || (mftb() & 0x1)) + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_4K, MMU_PAGE_4K, old_pte); + return -1; + } + + new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); + new_pte |= H_PAGE_HASHPTE; + + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} + +int __hash_page_64K(unsigned long ea, unsigned long access, + unsigned long vsid, pte_t *ptep, unsigned long trap, + unsigned long flags, int ssize) +{ + real_pte_t rpte; + unsigned long hpte_group; + unsigned long rflags, pa; + unsigned long old_pte, new_pte; + unsigned long vpn, hash, slot; + unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift; + + /* + * atomically mark the linux large page PTE busy and dirty + */ + do { + pte_t pte = READ_ONCE(*ptep); + + old_pte = pte_val(pte); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & H_PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(!check_pte_access(access, old_pte))) + return 1; + /* + * Check if PTE has the cache-inhibit bit set + * If so, bail out and refault as a 4k page + */ + if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && + unlikely(pte_ci(pte))) + return 0; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access. 
+ */ + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) + new_pte |= _PAGE_DIRTY; + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + rflags = htab_convert_pte_flags(new_pte); + rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); + + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && + !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + vpn = hpt_vpn(ea, vsid, ssize); + if (unlikely(old_pte & H_PAGE_HASHPTE)) { + unsigned long gslot; + + /* + * There MIGHT be an HPTE for this pte + */ + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K, + MMU_PAGE_64K, ssize, + flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & H_PAGE_HASHPTE))) { + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + hash = hpt_hash(vpn, shift, ssize); + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, + MMU_PAGE_64K, MMU_PAGE_64K, + ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, + rflags, + HPTE_V_SECONDARY, + MMU_PAGE_64K, + MMU_PAGE_64K, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + mmu_hash_ops.hpte_remove(hpte_group); + /* + * FIXME!! Should be try the group from which we removed ? + */ + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + MMU_PAGE_64K, MMU_PAGE_64K, old_pte); + return -1; + } + + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/book3s64/hash_hugepage.c b/arch/powerpc/mm/book3s64/hash_hugepage.c new file mode 100644 index 000000000000..440823797de7 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -0,0 +1,191 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include
+#include
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+		    pmd_t *pmdp, unsigned long trap, unsigned long flags,
+		    int ssize, unsigned int psize)
+{
+	unsigned int index, valid;
+	unsigned char *hpte_slot_array;
+	unsigned long rflags, pa, hidx;
+	unsigned long old_pmd, new_pmd;
+	int ret, lpsize = MMU_PAGE_16M;
+	unsigned long vpn, hash, shift, slot;
+
+	/*
+	 * atomically mark the linux large page PMD busy and dirty
+	 */
+	do {
+		pmd_t pmd = READ_ONCE(*pmdp);
+
+		old_pmd = pmd_val(pmd);
+		/* If PMD busy, retry the access */
+		if (unlikely(old_pmd & H_PAGE_BUSY))
+			return 0;
+		/* If PMD permissions don't match, take page fault */
+		if (unlikely(!check_pte_access(access, old_pmd)))
+			return 1;
+		/*
+		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
+		 * a write access
+		 */
+		new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED;
+		if (access & _PAGE_WRITE)
+			new_pmd |= _PAGE_DIRTY;
+	} while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
+
+	/*
+	 * Make sure this is a THP or devmap entry
+	 */
+	if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
+		return 0;
+
+	rflags = htab_convert_pte_flags(new_pmd);
+
+#if 0
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+		/*
+		 * No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case
+		 */
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+	}
+#endif
+	/*
+	 * Find the slot index details for this ea, using base page size.
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	index = (ea & ~HPAGE_PMD_MASK) >> shift;
+	BUG_ON(index >= PTE_FRAG_SIZE);
+
+	vpn = hpt_vpn(ea, vsid, ssize);
+	hpte_slot_array = get_hpte_slot_array(pmdp);
+	if (psize == MMU_PAGE_4K) {
+		/*
+		 * invalidate the old hpte entry if we have that mapped via 64K
+		 * base page size. This is because demote_segment won't flush
+		 * hash page table entries.
+		 */
+		if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) {
+			flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
+					    ssize, flags);
+			/*
+			 * With THP, we also clear the slot information with
+			 * respect to all the 64K hash pte mappings of the
+			 * 16MB page. They are all invalid now. This makes
+			 * sure we don't find the slot valid when we fault
+			 * with 4k base page size.
+			 */
+			memset(hpte_slot_array, 0, PTE_FRAG_SIZE);
+		}
+	}
+
+	valid = hpte_valid(hpte_slot_array, index);
+	if (valid) {
+		/* update the hpte bits */
+		hash = hpt_hash(vpn, shift, ssize);
+		hidx = hpte_hash_index(hpte_slot_array, index);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
+						 psize, lpsize, ssize, flags);
+		/*
+		 * We failed to update, try to insert a new entry.
+		 */
+		if (ret == -1) {
+			/*
+			 * large pte is marked busy, so we can be sure
+			 * nobody is looking at hpte_slot_array. hence we can
+			 * safely update this here.
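+			 * Clearing hpte_slot_array[index] below drops the
+			 * stale slot, so the !valid path that follows will
+			 * insert a fresh HPTE instead.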
+			 */
+			valid = 0;
+			hpte_slot_array[index] = 0;
+		}
+	}
+
+	if (!valid) {
+		unsigned long hpte_group;
+
+		hash = hpt_hash(vpn, shift, ssize);
+		/* insert new entry */
+		pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+		new_pmd |= H_PAGE_HASHPTE;
+
+repeat:
+		hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+		/* Insert into the hash table, primary slot */
+		slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+						psize, lpsize, ssize);
+		/*
+		 * Primary is full, try the secondary
+		 */
+		if (unlikely(slot == -1)) {
+			hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+			slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+							rflags,
+							HPTE_V_SECONDARY,
+							psize, lpsize, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = (hash & htab_hash_mask) *
+							HPTES_PER_GROUP;
+
+				mmu_hash_ops.hpte_remove(hpte_group);
+				goto repeat;
+			}
+		}
+		/*
+		 * Hypervisor failure. Restore old pmd and return -1
+		 * similar to __hash_page_*
+		 */
+		if (unlikely(slot == -2)) {
+			*pmdp = __pmd(old_pmd);
+			hash_failure_debug(ea, access, vsid, trap, ssize,
+					   psize, lpsize, old_pmd);
+			return -1;
+		}
+		/*
+		 * large pte is marked busy, so we can be sure
+		 * nobody is looking at hpte_slot_array. hence we can
+		 * safely update this here.
+		 */
+		mark_hpte_slot_valid(hpte_slot_array, index, slot);
+	}
+	/*
+	 * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with
+	 * base page size 4k.
+	 */
+	if (psize == MMU_PAGE_4K)
+		new_pmd |= H_PAGE_COMBO;
+	/*
+	 * The hpte valid bit is stored in the pgtable whose address is in the
+	 * second half of the PMD. Order this against clearing of the busy
+	 * bit in the huge pmd.
+	 */
+	smp_wmb();
+	*pmdp = __pmd(new_pmd & ~H_PAGE_BUSY);
+	return 0;
+}
diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
new file mode 100644
index 000000000000..2d4e02aa15a3
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+				  unsigned long pa, unsigned long rflags,
+				  unsigned long vflags, int psize, int ssize);
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+		     pte_t *ptep, unsigned long trap, unsigned long flags,
+		     int ssize, unsigned int shift, unsigned int mmu_psize)
+{
+	real_pte_t rpte;
+	unsigned long vpn;
+	unsigned long old_pte, new_pte;
+	unsigned long rflags, pa;
+	long slot, offset;
+
+	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+	/* Search the Linux page table for a match with va */
+	vpn = hpt_vpn(ea, vsid, ssize);
+
+	/*
+	 * At this point, we have a pte (old_pte) which can be used to build
+	 * or update an HPTE. There are 2 cases:
+	 *
+	 * 1. There is a valid (present) pte with no associated HPTE (this is
+	 *	the most common case)
+	 * 2. There is a valid (present) pte with an associated HPTE. The
+	 *	current values of the pp bits in the HPTE prevent access
+	 *	because we are doing software DIRTY bit management and the
+	 *	page is currently not DIRTY.
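+	 *
+	 * In case 2 we first try mmu_hash_ops.hpte_updatepp(); only when
+	 * that returns -1 do we fall through and insert a fresh HPTE.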
+	 */
+
+
+	do {
+		old_pte = pte_val(*ptep);
+		/* If PTE busy, retry the access */
+		if (unlikely(old_pte & H_PAGE_BUSY))
+			return 0;
+		/* If PTE permissions don't match, take page fault */
+		if (unlikely(!check_pte_access(access, old_pte)))
+			return 1;
+
+		/*
+		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
+		 * a write access
+		 */
+		new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+		if (access & _PAGE_WRITE)
+			new_pte |= _PAGE_DIRTY;
+	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+	/* Make sure this is a hugetlb entry, bail out on THP and devmap PTEs */
+	if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
+		return 0;
+
+	rflags = htab_convert_pte_flags(new_pte);
+	if (unlikely(mmu_psize == MMU_PAGE_16G))
+		offset = PTRS_PER_PUD;
+	else
+		offset = PTRS_PER_PMD;
+	rpte = __real_pte(__pte(old_pte), ptep, offset);
+
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		/*
+		 * No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case
+		 */
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+	/* Check if pte already has an hpte (case 2) */
+	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+		/* There MIGHT be an HPTE for this pte */
+		unsigned long gslot;
+
+		gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+		if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
+					       mmu_psize, ssize, flags) == -1)
+			old_pte &= ~_PAGE_HPTEFLAGS;
+	}
+
+	if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(vpn, shift, ssize);
+
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+		/* clear HPTE slot information in new PTE */
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+
+		slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
+					     mmu_psize, ssize);
+
+		/*
+		 * Hypervisor failure. Restore old pte and return -1
+		 * similar to __hash_page_*
+		 */
+		if (unlikely(slot == -2)) {
+			*ptep = __pte(old_pte);
+			hash_failure_debug(ea, access, vsid, trap, ssize,
+					   mmu_psize, mmu_psize, old_pte);
+			return -1;
+		}
+
+		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
+	}
+
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
+	return 0;
+}
+
+pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+				  unsigned long addr, pte_t *ptep)
+{
+	unsigned long pte_val;
+	/*
+	 * Clear the _PAGE_PRESENT so that no hardware parallel update is
+	 * possible. Also keep the pte_present true so that we don't take
+	 * a wrong fault.
+	 */
+	pte_val = pte_update(vma->vm_mm, addr, ptep,
+			     _PAGE_PRESENT, _PAGE_INVALID, 1);
+
+	return __pte(pte_val);
+}
+
+void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+				  pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+
+	if (radix_enabled())
+		return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
+							   old_pte, pte);
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
new file mode 100644
index 000000000000..aaa28fd918fe
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -0,0 +1,884 @@
+/*
+ * native hashtable management.
+ *
+ * SMP scalability work:
+ * Copyright (C) 2001 Anton Blanchard, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG_LOW
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#ifdef __BIG_ENDIAN__
+#define HPTE_LOCK_BIT 3
+#else
+#define HPTE_LOCK_BIT (56+3)
+#endif
+
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+
+static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
+{
+	unsigned long rb;
+
+	rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+
+	asm volatile("tlbiel %0" : : "r" (rb));
+}
+
+/*
+ * tlbiel instruction for hash, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
+					unsigned int pid,
+					unsigned int ric, unsigned int prs)
+{
+	unsigned long rb;
+	unsigned long rs;
+	unsigned int r = 0; /* hash format */
+
+	rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+	rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+	asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
+		     : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
+		     : "memory");
+}
+
+
+static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+{
+	unsigned int set;
+
+	asm volatile("ptesync": : :"memory");
+
+	for (set = 0; set < num_sets; set++)
+		tlbiel_hash_set_isa206(set, is);
+
+	asm volatile("ptesync": : :"memory");
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+	unsigned int set;
+
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Flush the first set of the TLB, and any caching of partition table
+	 * entries. Then flush the remaining sets of the TLB. Hash mode uses
+	 * partition scoped TLB translations.
+	 */
+	tlbiel_hash_set_isa300(0, is, 0, 2, 0);
+	for (set = 1; set < num_sets; set++)
+		tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
+	/*
+	 * Now invalidate the process table cache.
+	 *
+	 * From ISA v3.0B p. 1078:
+	 *     The following forms are invalid.
+	 *      * PRS=1, R=0, and RIC!=2 (The only process-scoped
+	 *        HPT caching is of the Process Table.)
+	 */
+	tlbiel_hash_set_isa300(0, is, 0, 2, 1);
+
+	asm volatile("ptesync": : :"memory");
+
+	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+void hash__tlbiel_all(unsigned int action)
+{
+	unsigned int is;
+
+	switch (action) {
+	case TLB_INVAL_SCOPE_GLOBAL:
+		is = 3;
+		break;
+	case TLB_INVAL_SCOPE_LPID:
+		is = 2;
+		break;
+	default:
+		BUG();
+	}
+
+	if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+		tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
+	else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+		tlbiel_all_isa206(POWER8_TLB_SETS, is);
+	else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
+		tlbiel_all_isa206(POWER7_TLB_SETS, is);
+	else
+		WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
+}
+
+static inline unsigned long ___tlbie(unsigned long vpn, int psize,
+				     int apsize, int ssize)
+{
+	unsigned long va;
+	unsigned int penc;
+	unsigned long sllp;
+
+	/*
+	 * We need 14 to 65 bits of va for a tlbie of a 4K page.
+	 * With vpn we ignore the lower VPN_SHIFT bits already.
+	 * And top two bits are already ignored, because we can
+	 * only accommodate 76 bits in a 64 bit vpn with a VPN_SHIFT
+	 * of 12.
+	 */
+	va = vpn << VPN_SHIFT;
+	/*
+	 * clear top 16 bits of 64-bit va, non SLS segment
+	 * Older versions of the architecture (2.02 and earlier) require the
+	 * masking of the top 16 bits.
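+	 *
+	 * A rough sketch of the MMU_PAGE_4K encoding below: the low 12
+	 * bits of the va are cleared, the segment size is ORed in at
+	 * bit 8 and the SLLP page-size encoding at bit 5 before the
+	 * result is handed to tlbie.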
+	 */
+	if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
+		va &= ~(0xffffULL << 48);
+
+	switch (psize) {
+	case MMU_PAGE_4K:
+		/* clear out bits after (52) [0....52.....63] */
+		va &= ~((1ul << (64 - 52)) - 1);
+		va |= ssize << 8;
+		sllp = get_sllp_encoding(apsize);
+		va |= sllp << 5;
+		asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
+			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+			     : "memory");
+		break;
+	default:
+		/* We need 14 to 14 + i bits of va */
+		penc = mmu_psize_defs[psize].penc[apsize];
+		va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
+		va |= penc << 12;
+		va |= ssize << 8;
+		/*
+		 * AVAL bits:
+		 * We don't need all the bits, but the rest of the bits
+		 * must be ignored by the processor.
+		 * vpn covers up to 65 bits of va (0...65) and we need
+		 * 58..64 bits of va.
+		 */
+		va |= (vpn & 0xfe); /* AVAL */
+		va |= 1; /* L */
+		asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
+			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+			     : "memory");
+		break;
+	}
+	return va;
+}
+
+static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+		/* Need the extra ptesync to ensure we don't reorder tlbie */
+		asm volatile("ptesync": : :"memory");
+		___tlbie(vpn, psize, apsize, ssize);
+	}
+}
+
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+{
+	unsigned long rb;
+
+	rb = ___tlbie(vpn, psize, apsize, ssize);
+	trace_tlbie(0, 0, rb, 0, 0, 0, 0);
+}
+
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
+{
+	unsigned long va;
+	unsigned int penc;
+	unsigned long sllp;
+
+	/* VPN_SHIFT can be at most 12 */
+	va = vpn << VPN_SHIFT;
+	/*
+	 * clear top 16 bits of 64-bit va, non SLS segment
+	 * Older versions of the architecture (2.02 and earlier) require the
+	 * masking of the top 16 bits.
+	 */
+	if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
+		va &= ~(0xffffULL << 48);
+
+	switch (psize) {
+	case MMU_PAGE_4K:
+		/* clear out bits after (52) [0....52.....63] */
+		va &= ~((1ul << (64 - 52)) - 1);
+		va |= ssize << 8;
+		sllp = get_sllp_encoding(apsize);
+		va |= sllp << 5;
+		asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1)
+			     : : "r" (va), "i" (CPU_FTR_ARCH_206)
+			     : "memory");
+		break;
+	default:
+		/* We need 14 to 14 + i bits of va */
+		penc = mmu_psize_defs[psize].penc[apsize];
+		va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
+		va |= penc << 12;
+		va |= ssize << 8;
+		/*
+		 * AVAL bits:
+		 * We don't need all the bits, but the rest of the bits
+		 * must be ignored by the processor.
+		 * vpn covers up to 65 bits of va (0...65) and we need
+		 * 58..64 bits of va.
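+		 * (vpn & 0xfe) below supplies those AVAL bits, and the
+		 * low bit is the L=1 flag selecting a large-page
+		 * invalidation.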
+ */ + va |= (vpn & 0xfe); + va |= 1; /* L */ + asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) + : : "r" (va), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + } + trace_tlbie(0, 1, va, 0, 0, 0, 0); + +} + +static inline void tlbie(unsigned long vpn, int psize, int apsize, + int ssize, int local) +{ + unsigned int use_local; + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); + + if (use_local) + use_local = mmu_psize_defs[psize].tlbiel; + if (lock_tlbie && !use_local) + raw_spin_lock(&native_tlbie_lock); + asm volatile("ptesync": : :"memory"); + if (use_local) { + __tlbiel(vpn, psize, apsize, ssize); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie(vpn, psize, apsize, ssize); + fixup_tlbie(vpn, psize, apsize, ssize); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + if (lock_tlbie && !use_local) + raw_spin_unlock(&native_tlbie_lock); +} + +static inline void native_lock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = (unsigned long *)&hptep->v; + + while (1) { + if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) + break; + spin_begin(); + while(test_bit(HPTE_LOCK_BIT, word)) + spin_cpu_relax(); + spin_end(); + } +} + +static inline void native_unlock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = (unsigned long *)&hptep->v; + + clear_bit_unlock(HPTE_LOCK_BIT, word); +} + +static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int apsize, int ssize) +{ + struct hash_pte *hptep = htab_address + hpte_group; + unsigned long hpte_v, hpte_r; + int i; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," + " rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); + } + + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { + /* retry with lock held */ + native_lock_hpte(hptep); + if (! 
(be64_to_cpu(hptep->v) & HPTE_V_VALID)) + break; + native_unlock_hpte(hptep); + } + + hptep++; + } + + if (i == HPTES_PER_GROUP) + return -1; + + hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", + i, hpte_v, hpte_r); + } + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); + hpte_v = hpte_old_to_new_v(hpte_v); + } + + hptep->r = cpu_to_be64(hpte_r); + /* Guarantee the second dword is visible before the valid bit */ + eieio(); + /* + * Now set the first dword including the valid bit + * NOTE: this also unlocks the hpte + */ + hptep->v = cpu_to_be64(hpte_v); + + __asm__ __volatile__ ("ptesync" : : : "memory"); + + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); +} + +static long native_hpte_remove(unsigned long hpte_group) +{ + struct hash_pte *hptep; + int i; + int slot_offset; + unsigned long hpte_v; + + DBG_LOW(" remove(group=%lx)\n", hpte_group); + + /* pick a random entry to start at */ + slot_offset = mftb() & 0x7; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + hpte_group + slot_offset; + hpte_v = be64_to_cpu(hptep->v); + + if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { + /* retry with lock held */ + native_lock_hpte(hptep); + hpte_v = be64_to_cpu(hptep->v); + if ((hpte_v & HPTE_V_VALID) + && !(hpte_v & HPTE_V_BOLTED)) + break; + native_unlock_hpte(hptep); + } + + slot_offset++; + slot_offset &= 0x7; + } + + if (i == HPTES_PER_GROUP) + return -1; + + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + + return i; +} + +static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, + unsigned long vpn, int bpsize, + int apsize, int ssize, unsigned long flags) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v, want_v; + int ret = 0, local = 0; + + want_v = hpte_encode_avpn(vpn, bpsize, ssize); + + DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", + vpn, want_v & HPTE_V_AVPN, slot, newpp); + + hpte_v = hpte_get_old_v(hptep); + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". 
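+	 *
+	 * The lock-free read of hpte_get_old_v() below is an optimisation:
+	 * on a miss we never take the HPTE lock, and on a hit we recheck
+	 * the entry under the lock before updating it.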
+ */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { + DBG_LOW(" -> miss\n"); + ret = -1; + } else { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || + !(hpte_v & HPTE_V_VALID))) { + ret = -1; + } else { + DBG_LOW(" -> hit\n"); + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N | + HPTE_R_C))); + } + native_unlock_hpte(hptep); + } + + if (flags & HPTE_LOCAL_UPDATE) + local = 1; + /* + * Ensure it is out of the tlb too if it is not a nohpte fault + */ + if (!(flags & HPTE_NOHPTE_UPDATE)) + tlbie(vpn, bpsize, apsize, ssize, local); + + return ret; +} + +static long native_hpte_find(unsigned long vpn, int psize, int ssize) +{ + struct hash_pte *hptep; + unsigned long hash; + unsigned long i; + long slot; + unsigned long want_v, hpte_v; + + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* Bolted mappings are only ever in the primary group */ + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + + hptep = htab_address + slot; + hpte_v = hpte_get_old_v(hptep); + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* HPTE matches */ + return slot; + ++slot; + } + + return -1; +} + +/* + * Update the page protection bits. Intended to be used to create + * guard pages for kernel data structures on pages which are bolted + * in the HPT. Assumes pages being operated on will not be stolen. + * + * No need to lock here because we should be the only user. + */ +static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, + int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + panic("could not find page to bolt\n"); + hptep = htab_address + slot; + + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N))); + /* + * Ensure it is out of the tlb too. Bolted entries base and + * actual page size will be same. + */ + tlbie(vpn, psize, psize, ssize, 0); +} + +/* + * Remove a bolted kernel entry. Memory hotplug uses this. + * + * No need to lock here because we should be the only user. 
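+ *
+ * Bolted entries only ever live in the primary group, so the
+ * native_hpte_find() lookup below scans at most HPTES_PER_GROUP slots.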
+ */ +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + return -ENOENT; + + hptep = htab_address + slot; + + VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); + + /* Invalidate the hpte */ + hptep->v = 0; + + /* Invalidate the TLB */ + tlbie(vpn, psize, psize, ssize, 0); + return 0; +} + + +static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, + int bpsize, int apsize, int ssize, int local) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + + local_irq_save(flags); + + DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); + + want_v = hpte_encode_avpn(vpn, bpsize, ssize); + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + else + native_unlock_hpte(hptep); + } + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". + */ + tlbie(vpn, bpsize, apsize, ssize, local); + + local_irq_restore(flags); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + int i; + struct hash_pte *hptep; + int actual_psize = MMU_PAGE_16M; + unsigned int max_hpte_count, valid; + unsigned long flags, s_addr = addr; + unsigned long hpte_v, want_v, shift; + unsigned long hidx, vpn = 0, hash, slot; + + shift = mmu_psize_defs[psize].shift; + max_hpte_count = 1U << (PMD_SHIFT - shift); + + local_irq_save(flags); + for (i = 0; i < max_hpte_count; i++) { + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + hpte_v = hpte_get_old_v(hptep); + + /* Even if we miss, we need to invalidate the TLB */ + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* recheck with locks held */ + native_lock_hpte(hptep); + hpte_v = hpte_get_old_v(hptep); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { + /* + * Invalidate the hpte. 
NOTE: this also unlocks it + */ + + hptep->v = 0; + } else + native_unlock_hpte(hptep); + } + /* + * We need to do tlb invalidate for all the address, tlbie + * instruction compares entry_VA in tlb with the VA specified + * here + */ + tlbie(vpn, psize, actual_psize, ssize, local); + } + local_irq_restore(flags); +} +#else +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + WARN(1, "%s called without THP support\n", __func__); +} +#endif + +static void hpte_decode(struct hash_pte *hpte, unsigned long slot, + int *psize, int *apsize, int *ssize, unsigned long *vpn) +{ + unsigned long avpn, pteg, vpi; + unsigned long hpte_v = be64_to_cpu(hpte->v); + unsigned long hpte_r = be64_to_cpu(hpte->r); + unsigned long vsid, seg_off; + int size, a_size, shift; + /* Look at the 8 bit LP value */ + unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); + hpte_r = hpte_new_to_old_r(hpte_r); + } + if (!(hpte_v & HPTE_V_LARGE)) { + size = MMU_PAGE_4K; + a_size = MMU_PAGE_4K; + } else { + size = hpte_page_sizes[lp] & 0xf; + a_size = hpte_page_sizes[lp] >> 4; + } + /* This works for all page sizes, and for 256M and 1T segments */ + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + shift = mmu_psize_defs[size].shift; + + avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); + pteg = slot / HPTES_PER_GROUP; + if (hpte_v & HPTE_V_SECONDARY) + pteg = ~pteg; + + switch (*ssize) { + case MMU_SEGSIZE_256M: + /* We only have 28 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (shift < 23) { + vpi = (vsid ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + case MMU_SEGSIZE_1T: + /* We only have 40 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (shift < 23) { + vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + default: + *vpn = size = 0; + } + *psize = size; + *apsize = a_size; +} + +/* + * clear all mappings on kexec. All cpus are in real mode (or they will + * be when they isi), and we are the only one left. We rely on our kernel + * mapping being 0xC0's and the hardware ignoring those two real bits. + * + * This must be called with interrupts disabled. + * + * Taking the native_tlbie_lock is unsafe here due to the possibility of + * lockdep being on. On pre POWER5 hardware, not taking the lock could + * cause deadlock. POWER5 and newer not taking the lock is fine. This only + * gets called during boot before secondary CPUs have come up and during + * crashdump and all bets are off anyway. + * + * TODO: add batching support when enabled. remember, no dynamic memory here, + * although there is the control page available... + */ +static void native_hpte_clear(void) +{ + unsigned long vpn = 0; + unsigned long slot, slots; + struct hash_pte *hptep = htab_address; + unsigned long hpte_v; + unsigned long pteg_count; + int psize, apsize, ssize; + + pteg_count = htab_hash_mask + 1; + + slots = pteg_count * HPTES_PER_GROUP; + + for (slot = 0; slot < slots; slot++, hptep++) { + /* + * we could lock the pte here, but we are the only cpu + * running, right? 
and for crash dump, we probably + * don't want to wait for a maybe bad cpu. + */ + hpte_v = be64_to_cpu(hptep->v); + + /* + * Call __tlbie() here rather than tlbie() since we can't take the + * native_tlbie_lock. + */ + if (hpte_v & HPTE_V_VALID) { + hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); + hptep->v = 0; + ___tlbie(vpn, psize, apsize, ssize); + } + } + + asm volatile("eieio; tlbsync; ptesync":::"memory"); +} + +/* + * Batched hash table flush, we batch the tlbie's to avoid taking/releasing + * the lock all the time + */ +static void native_flush_hash_range(unsigned long number, int local) +{ + unsigned long vpn = 0; + unsigned long hash, index, hidx, shift, slot; + struct hash_pte *hptep; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + real_pte_t pte; + struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + unsigned long psize = batch->psize; + int ssize = batch->ssize; + int i; + unsigned int use_local; + + use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && + mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use(); + + local_irq_save(flags); + + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + continue; + /* lock and try again */ + native_lock_hpte(hptep); + hpte_v = hpte_get_old_v(hptep); + + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + hptep->v = 0; + + } pte_iterate_hashed_end(); + } + + if (use_local) { + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbiel(vpn, psize, psize, ssize); + } pte_iterate_hashed_end(); + } + asm volatile("ptesync":::"memory"); + } else { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbie(vpn, psize, psize, ssize); + } pte_iterate_hashed_end(); + } + /* + * Just do one more with the last used values. 
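+		 * fixup_tlbie() is a no-op unless CPU_FTR_P9_TLBIE_BUG is
+		 * set; in that case it issues a ptesync plus one more tlbie
+		 * to work around the POWER9 tlbie erratum.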
+ */ + fixup_tlbie(vpn, psize, psize, ssize); + asm volatile("eieio; tlbsync; ptesync":::"memory"); + + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } + + local_irq_restore(flags); +} + +void __init hpte_init_native(void) +{ + mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; + mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; + mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; + mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; + mmu_hash_ops.hpte_insert = native_hpte_insert; + mmu_hash_ops.hpte_remove = native_hpte_remove; + mmu_hash_ops.hpte_clear_all = native_hpte_clear; + mmu_hash_ops.flush_hash_range = native_flush_hash_range; + mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; +} diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c new file mode 100644 index 000000000000..1fd025dba4a3 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -0,0 +1,463 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#define CREATE_TRACE_POINTS +#include + +#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) +#warning Limited user VSID range means pagetable space is wasted +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * vmemmap is the starting address of the virtual address space where + * struct pages are allocated for all possible PFNs present on the system + * including holes and bad memory (hence sparse). These virtual struct + * pages are stored in sequence in this virtual address space irrespective + * of the fact whether the corresponding PFN is valid or not. This achieves + * constant relationship between address of struct page and its PFN. + * + * During boot or memory hotplug operation when a new memory section is + * added, physical memory allocation (including hash table bolting) will + * be performed for the set of struct pages which are part of the memory + * section. This saves memory by not allocating struct pages for PFNs + * which are not valid. 
+ * + * ---------------------------------------------- + * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| + * ---------------------------------------------- + * + * f000000000000000 c000000000000000 + * vmemmap +--------------+ +--------------+ + * + | page struct | +--------------> | page struct | + * | +--------------+ +--------------+ + * | | page struct | +--------------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | + +------> | page struct | + * | +--------------+ | +--------------+ + * | | page struct | | +--> | page struct | + * | +--------------+ | | +--------------+ + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | | | + * | +--------------+ | | + * | | page struct | +-------+ | + * | +--------------+ | + * | | page struct | +-----------+ + * | +--------------+ + * | | page struct | No mapping + * | +--------------+ + * | | page struct | No mapping + * v +--------------+ + * + * ----------------------------------------- + * | RELATION BETWEEN STRUCT PAGES AND PFNS| + * ----------------------------------------- + * + * vmemmap +--------------+ +---------------+ + * + | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | | + * | +--------------+ + * | | | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * | +--------------+ +---------------+ + * | | page struct | +-------------> | PFN | + * v +--------------+ +---------------+ + */ +/* + * On hash-based CPUs, the vmemmap is bolted in the hash table. 
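+ * The mapping is created with htab_bolt_mapping() below, so its
+ * translations are never evicted from the hash table.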
+ * + */ +int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int rc; + + if ((start + page_size) >= H_VMEMMAP_END) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + if (rc < 0) { + int rc2 = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int rc = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON((rc < 0) && (rc != -ENOENT)); + WARN_ON(rc == -ENOENT); +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + + smp_wmb(); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + __be64 old_be, tmp; + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + assert_spin_locked(pmd_lockptr(mm, pmdp)); +#endif + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + and. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) + : "cc" ); + + old = be64_to_cpu(old_be); + + trace_hugepage_update(addr, old, clr, set); + if (old & H_PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_devmap(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. 
Such a hash page
+	 * can result in adding new HPTE entries for normal subpages.
+	 * That means we could be modifying the page content as we
+	 * copy them to a huge page. So wait for parallel hash_page
+	 * to finish before invalidating HPTE entries. We can do this
+	 * by sending an IPI to all the cpus and executing a dummy
+	 * function there.
+	 */
+	serialize_against_pte_lookup(vma->vm_mm);
+	/*
+	 * Now invalidate the hpte entries in the range
+	 * covered by pmd. This makes sure we take a
+	 * fault and will find the pmd as none, which will
+	 * result in a major fault which takes mmap_sem and
+	 * hence wait for collapse to complete. Without this
+	 * the __collapse_huge_page_copy can result in copying
+	 * the old content.
+	 */
+	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+	return pmd;
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				      pgtable_t pgtable)
+{
+	pgtable_t *pgtable_slot;
+
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+	/*
+	 * we store the pgtable in the second half of PMD
+	 */
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	*pgtable_slot = pgtable;
+	/*
+	 * expose the deposited pgtable to other cpus.
+	 * before we set the hugepage PTE at pmd level
+	 * hash fault code looks at the deposited pgtable
+	 * to store hash index values.
+	 */
+	smp_wmb();
+}
+
+pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pgtable_t pgtable;
+	pgtable_t *pgtable_slot;
+
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	pgtable = *pgtable_slot;
+	/*
+	 * Once we withdraw, mark the entry NULL.
+	 */
+	*pgtable_slot = NULL;
+	/*
+	 * We store HPTE information in the deposited PTE fragment.
+	 * zero out the content on withdraw.
+	 */
+	memset(pgtable, 0, PTE_FRAG_SIZE);
+	return pgtable;
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * need to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+			    pmd_t *pmdp, unsigned long old_pmd)
+{
+	int ssize;
+	unsigned int psize;
+	unsigned long vsid;
+	unsigned long flags = 0;
+
+	/* get the base page size, vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
+	psize = get_slice_psize(mm, addr);
+	BUG_ON(psize == MMU_PAGE_16M);
+#endif
+	if (old_pmd & H_PAGE_COMBO)
+		psize = MMU_PAGE_4K;
+	else
+		psize = MMU_PAGE_64K;
+
+	if (!is_kernel_addr(addr)) {
+		ssize = user_segment_size(addr);
+		vsid = get_user_vsid(&mm->context, addr, ssize);
+		WARN_ON(vsid == 0);
+	} else {
+		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+		ssize = mmu_kernel_ssize;
+	}
+
+	if (mm_is_thread_local(mm))
+		flags |= HPTE_LOCAL_UPDATE;
+
+	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
+}
+
+pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+				    unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t old_pmd;
+	pgtable_t pgtable;
+	unsigned long old;
+	pgtable_t *pgtable_slot;
+
+	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+	old_pmd = __pmd(old);
+	/*
+	 * We have pmd == none and we are holding page_table_lock.
+	 * So we can safely go and clear the pgtable hash
+	 * index info.
+	 */
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	pgtable = *pgtable_slot;
+	/*
+	 * Zero out the old valid and hash index details so that a
+	 * parallel hash fault does not look at them.
+	 */
+	memset(pgtable, 0, PTE_FRAG_SIZE);
+	/*
+	 * Serialize against find_current_mm_pte variants which do lock-less
+	 * lookup in page tables with local interrupts disabled. For huge pages
+	 * it casts pmd_t to pte_t. Since format of pte_t is different from
+	 * pmd_t we want to prevent transit from pmd pointing to page table
+	 * to pmd pointing to huge page (and back) while interrupts are disabled.
+	 * We clear pmd to possibly replace it with page table pointer in
+	 * different code paths. So make sure we wait for the parallel
+	 * find_current_mm_pte to finish.
+	 */
+	serialize_against_pte_lookup(mm);
+	return old_pmd;
+}
+
+int hash__has_transparent_hugepage(void)
+{
+
+	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+		return 0;
+	/*
+	 * We support THP only if PMD_SIZE is 16MB.
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+		return 0;
+	/*
+	 * We need to make sure that we support 16MB hugepage in a segment
+	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+	 * of 64K.
+	 */
+	/*
+	 * If we have 64K HPTE, we will be using that by default
+	 */
+	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+		return 0;
+	/*
+	 * Ok we only have 4K HPTE
+	 */
+	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+		return 0;
+
+	return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+static bool hash__change_memory_range(unsigned long start, unsigned long end,
+				      unsigned long newpp)
+{
+	unsigned long idx;
+	unsigned int step, shift;
+
+	shift = mmu_psize_defs[mmu_linear_psize].shift;
+	step = 1 << shift;
+
+	start = ALIGN_DOWN(start, step);
+	end = ALIGN(end, step); // aligns up
+
+	if (start >= end)
+		return false;
+
+	pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
+		 start, end, newpp, step);
+
+	for (idx = start; idx < end; idx += step)
+		/* Not sure if we can do much with the return value */
+		mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
+						 mmu_kernel_ssize);
+
+	return true;
+}
+
+void hash__mark_rodata_ro(void)
+{
+	unsigned long start, end;
+
+	start = (unsigned long)_stext;
+	end = (unsigned long)__init_begin;
+
+	WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
+}
+
+void hash__mark_initmem_nx(void)
+{
+	unsigned long start, end, pp;
+
+	start = (unsigned long)__init_begin;
+	end = (unsigned long)__init_end;
+
+	pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+
+	WARN_ON(!hash__change_memory_range(start, end, pp));
+}
+#endif
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
new file mode 100644
index 000000000000..d4f0101447b1
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -0,0 +1,265 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ * Derived from arch/ppc64/mm/init.c:
+ *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *   Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ *   Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen
+ *   Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/*
+ * A linux PTE was changed and the corresponding hash table entry
+ * needs to be flushed. This function will either perform the flush
+ * immediately or will batch it up if the current CPU has an active
+ * batch on it.
+ */
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, unsigned long pte, int huge)
+{
+	unsigned long vpn;
+	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
+	unsigned long vsid;
+	unsigned int psize;
+	int ssize;
+	real_pte_t rpte;
+	int i, offset;
+
+	i = batch->index;
+
+	/*
+	 * Get page size (maybe move back to caller).
+	 *
+	 * NOTE: when using special 64K mappings in 4K environment like
+	 * for SPEs, we obtain the page size from the slice, which thus
+	 * must still exist (and thus the VMA not reused) at the time
+	 * of this call
+	 */
+	if (huge) {
+#ifdef CONFIG_HUGETLB_PAGE
+		psize = get_slice_psize(mm, addr);
+		/* Mask the address for the correct page size */
+		addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+		if (unlikely(psize == MMU_PAGE_16G))
+			offset = PTRS_PER_PUD;
+		else
+			offset = PTRS_PER_PMD;
+#else
+		BUG();
+		psize = pte_pagesize_index(mm, addr, pte); /* shut up gcc */
+#endif
+	} else {
+		psize = pte_pagesize_index(mm, addr, pte);
+		/*
+		 * Mask the address for the standard page size. If we
+		 * have a 64k page kernel, but the hardware does not
+		 * support 64k pages, this might be different from the
+		 * hardware page size encoded in the slice table.
+		 */
+		addr &= PAGE_MASK;
+		offset = PTRS_PER_PTE;
+	}
+
+
+	/* Build full vaddr */
+	if (!is_kernel_addr(addr)) {
+		ssize = user_segment_size(addr);
+		vsid = get_user_vsid(&mm->context, addr, ssize);
+	} else {
+		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+		ssize = mmu_kernel_ssize;
+	}
+	WARN_ON(vsid == 0);
+	vpn = hpt_vpn(addr, vsid, ssize);
+	rpte = __real_pte(__pte(pte), ptep, offset);
+
+	/*
+	 * Check if we have an active batch on this CPU. If not, just
+	 * flush now and return.
+	 */
+	if (!batch->active) {
+		flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm));
+		put_cpu_var(ppc64_tlb_batch);
+		return;
+	}
+
+	/*
+	 * This can happen when we are in the middle of a TLB batch and
+	 * we encounter memory pressure (eg copy_page_range when it tries
+	 * to allocate a new pte). If we have to reclaim memory and end
+	 * up scanning and resetting referenced bits then our batch context
+	 * will change mid stream.
+	 *
+	 * We also need to ensure only one page size is present in a given
+	 * batch
+	 */
+	if (i != 0 && (mm != batch->mm || batch->psize != psize ||
+		       batch->ssize != ssize)) {
+		__flush_tlb_pending(batch);
+		i = 0;
+	}
+	if (i == 0) {
+		batch->mm = mm;
+		batch->psize = psize;
+		batch->ssize = ssize;
+	}
+	batch->pte[i] = rpte;
+	batch->vpn[i] = vpn;
+	batch->index = ++i;
+	if (i >= PPC64_TLB_BATCH_NR)
+		__flush_tlb_pending(batch);
+	put_cpu_var(ppc64_tlb_batch);
+}
+
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
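+ *
+ * A single-entry batch takes the flush_hash_page() fast path below;
+ * larger batches go through flush_hash_range(), which the native
+ * backend implements as native_flush_hash_range().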
+ */
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+	int i, local;
+
+	i = batch->index;
+	local = mm_is_thread_local(batch->mm);
+	if (i == 1)
+		flush_hash_page(batch->vpn[0], batch->pte[0],
+				batch->psize, batch->ssize, local);
+	else
+		flush_hash_range(i, local);
+	batch->index = 0;
+}
+
+void hash__tlb_flush(struct mmu_gather *tlb)
+{
+	struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
+
+	/*
+	 * If there's a TLB batch pending, then we must flush it because the
+	 * pages are going to be freed and we really don't want to have a CPU
+	 * access a freed page because it has a stale TLB
+	 */
+	if (tlbbatch->index)
+		__flush_tlb_pending(tlbbatch);
+
+	put_cpu_var(ppc64_tlb_batch);
+}
+
+/**
+ * __flush_hash_table_range - Flush all HPTEs for a given address range
+ *                            from the hash table (and the TLB). But keeps
+ *                            the linux PTEs intact.
+ *
+ * @mm		: mm_struct of the target address space (generally init_mm)
+ * @start	: starting address
+ * @end		: ending address (not included in the flush)
+ *
+ * This function is mostly to be used by some IO hotplug code in order
+ * to remove all hash entries from a given address range used to map IO
+ * space on a removed PCI-PCI bridge without tearing down the full mapping
+ * since 64K pages may overlap with other bridges when using 64K pages
+ * with 4K HW pages on IO space.
+ *
+ * Because of that usage pattern, it is implemented for small size rather
+ * than speed.
+ */
+void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
+			      unsigned long end)
+{
+	bool is_thp;
+	int hugepage_shift;
+	unsigned long flags;
+
+	start = _ALIGN_DOWN(start, PAGE_SIZE);
+	end = _ALIGN_UP(end, PAGE_SIZE);
+
+	BUG_ON(!mm->pgd);
+
+	/*
+	 * Note: Normally, we should only ever use a batch within a
+	 * PTE locked section. This violates the rule, but will work
+	 * since we don't actually modify the PTEs, we just flush the
+	 * hash while leaving the PTEs intact (including their reference
+	 * to being hashed). This is not the most performance oriented
+	 * way to do things but is fine for our needs here.
+	 */
+	local_irq_save(flags);
+	arch_enter_lazy_mmu_mode();
+	for (; start < end; start += PAGE_SIZE) {
+		pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
+						  &hugepage_shift);
+		unsigned long pte;
+
+		if (ptep == NULL)
+			continue;
+		pte = pte_val(*ptep);
+		if (is_thp)
+			trace_hugepage_invalidate(start, pte);
+		if (!(pte & H_PAGE_HASHPTE))
+			continue;
+		if (unlikely(is_thp))
+			hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
+		else
+			hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
+	}
+	arch_leave_lazy_mmu_mode();
+	local_irq_restore(flags);
+}
+
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+	pte_t *pte;
+	pte_t *start_pte;
+	unsigned long flags;
+
+	addr = _ALIGN_DOWN(addr, PMD_SIZE);
+	/*
+	 * Note: Normally, we should only ever use a batch within a
+	 * PTE locked section. This violates the rule, but will work
+	 * since we don't actually modify the PTEs, we just flush the
+	 * hash while leaving the PTEs intact (including their reference
+	 * to being hashed). This is not the most performance oriented
+	 * way to do things but is fine for our needs here.
+ */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + start_pte = pte_offset_map(pmd, addr); + for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { + unsigned long pteval = pte_val(*pte); + if (pteval & H_PAGE_HASHPTE) + hpte_need_flush(mm, addr, pte, pteval, 0); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c new file mode 100644 index 000000000000..b21a81d42f15 --- /dev/null +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -0,0 +1,1946 @@ +/* + * PowerPC64 port by Mike Corrigan and Dave Engebretsen + * {mikejc|engebret}@us.ibm.com + * + * Copyright (c) 2000 Mike Corrigan + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard , IBM + * + * Module name: htab.c + * + * Description: + * PowerPC Hashed Page Table functions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG +#undef DEBUG_LOW + +#define pr_fmt(fmt) "hash-mmu: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#define KB (1024) +#define MB (1024*KB) +#define GB (1024L*MB) + +/* + * Note: pte --> Linux PTE + * HPTE --> PowerPC Hashed Page Table Entry + * + * Execution context: + * htab_initialize is called with the MMU off (of course), but + * the kernel has been copied down to zero so it can directly + * reference global data. At this point it is very difficult + * to print debug info. + * + */ + +static unsigned long _SDR1; +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +u8 hpte_page_sizes[1 << LP_BITS]; +EXPORT_SYMBOL_GPL(hpte_page_sizes); + +struct hash_pte *htab_address; +unsigned long htab_size_bytes; +unsigned long htab_hash_mask; +EXPORT_SYMBOL_GPL(htab_hash_mask); +int mmu_linear_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_linear_psize); +int mmu_virtual_psize = MMU_PAGE_4K; +int mmu_vmalloc_psize = MMU_PAGE_4K; +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif +int mmu_io_psize = MMU_PAGE_4K; +int mmu_kernel_ssize = MMU_SEGSIZE_256M; +EXPORT_SYMBOL_GPL(mmu_kernel_ssize); +int mmu_highuser_ssize = MMU_SEGSIZE_256M; +u16 mmu_slb_size = 64; +EXPORT_SYMBOL_GPL(mmu_slb_size); +#ifdef CONFIG_PPC_64K_PAGES +int mmu_ci_restrictions; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC +static u8 *linear_map_hash_slots; +static unsigned long linear_map_hash_count; +static DEFINE_SPINLOCK(linear_map_hash_lock); +#endif /* CONFIG_DEBUG_PAGEALLOC */ +struct mmu_hash_ops mmu_hash_ops; +EXPORT_SYMBOL(mmu_hash_ops); + +/* + * These are definitions of page sizes arrays to be used when none + * is provided by the firmware. 
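+ * Each entry records the page shift, the SLLP encoding used in SLB
+ * entries, the penc[] encodings valid for each actual page size, the
+ * AVPN mask and whether tlbiel can invalidate that size locally.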
+ */ + +/* + * Fallback (4k pages only) + */ +static struct mmu_psize_def mmu_psize_defaults[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 0, + }, +}; + +/* + * POWER4, GPUL, POWER5 + * + * Support for 16Mb large pages + */ +static struct mmu_psize_def mmu_psize_defaults_gp[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 1, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .sllp = SLB_VSID_L, + .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, + [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, + .avpnm = 0x1UL, + .tlbiel = 0, + }, +}; + +/* + * 'R' and 'C' update notes: + * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* + * create writeable HPTEs without C set, because the hcall H_PROTECT + * that we use in that case will not update C + * - The above is however not a problem, because we also don't do that + * fancy "no flush" variant of eviction and we use H_REMOVE which will + * do the right thing and thus we don't have the race I described earlier + * + * - Under bare metal, we do have the race, so we need R and C set + * - We make sure R is always set and never lost + * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping + */ +unsigned long htab_convert_pte_flags(unsigned long pteflags) +{ + unsigned long rflags = 0; + + /* _PAGE_EXEC -> NOEXEC */ + if ((pteflags & _PAGE_EXEC) == 0) + rflags |= HPTE_R_N; + /* + * PPP bits: + * Linux uses slb key 0 for kernel and 1 for user. + * kernel RW areas are mapped with PPP=0b000 + * User area is mapped with PPP=0b010 for read/write + * or PPP=0b011 for read-only (including writeable but clean pages). + */ + if (pteflags & _PAGE_PRIVILEGED) { + /* + * Kernel read only mapped with ppp bits 0b110 + */ + if (!(pteflags & _PAGE_WRITE)) { + if (mmu_has_feature(MMU_FTR_KERNEL_RO)) + rflags |= (HPTE_R_PP0 | 0x2); + else + rflags |= 0x3; + } + } else { + if (pteflags & _PAGE_RWX) + rflags |= 0x2; + if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) + rflags |= 0x1; + } + /* + * We can't allow hardware to update hpte bits. 
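+ * Under pHyp/KVM the updatepp hcall (H_PROTECT) would not propagate
+ * 'C' back to us anyway, see the 'R' and 'C' update notes above.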
Hence always + * set 'R' bit and set 'C' if it is a write fault + */ + rflags |= HPTE_R_R; + + if (pteflags & _PAGE_DIRTY) + rflags |= HPTE_R_C; + /* + * Add in WIG bits + */ + + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) + rflags |= HPTE_R_I; + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) + rflags |= (HPTE_R_I | HPTE_R_G); + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); + else + /* + * Add memory coherence if cache inhibited is not set + */ + rflags |= HPTE_R_M; + + rflags |= pte_to_hpte_pkey_bits(pteflags); + return rflags; +} + +int htab_bolt_mapping(unsigned long vstart, unsigned long vend, + unsigned long pstart, unsigned long prot, + int psize, int ssize) +{ + unsigned long vaddr, paddr; + unsigned int step, shift; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + prot = htab_convert_pte_flags(prot); + + DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", + vstart, vend, pstart, prot, psize, ssize); + + for (vaddr = vstart, paddr = pstart; vaddr < vend; + vaddr += step, paddr += step) { + unsigned long hash, hpteg; + unsigned long vsid = get_kernel_vsid(vaddr, ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); + unsigned long tprot = prot; + + /* + * If we hit a bad address return error. + */ + if (!vsid) + return -1; + /* Make kernel text executable */ + if (overlaps_kernel_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* Make kvm guest trampolines executable */ + if (overlaps_kvm_tmp(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* + * If relocatable, check if it overlaps interrupt vectors that + * are copied down to real 0. For relocatable kernel + * (e.g. kdump case) we copy interrupt vectors down to real + * address 0. Mark that region as executable. This is + * because on p8 system with relocation on exception feature + * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence + * in order to execute the interrupt handlers in virtual + * mode the vector region need to be marked as executable. + */ + if ((PHYSICAL_START > MEMORY_START) && + overlaps_interrupt_vector_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + hash = hpt_hash(vpn, shift, ssize); + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + BUG_ON(!mmu_hash_ops.hpte_insert); + ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, + ssize); + + if (ret < 0) + break; + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled() && + (paddr >> PAGE_SHIFT) < linear_map_hash_count) + linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; +#endif /* CONFIG_DEBUG_PAGEALLOC */ + } + return ret < 0 ? 
ret : 0; +} + +int htab_remove_mapping(unsigned long vstart, unsigned long vend, + int psize, int ssize) +{ + unsigned long vaddr; + unsigned int step, shift; + int rc; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + if (!mmu_hash_ops.hpte_removebolted) + return -ENODEV; + + for (vaddr = vstart; vaddr < vend; vaddr += step) { + rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); + if (rc == -ENOENT) { + ret = -ENOENT; + continue; + } + if (rc < 0) + return rc; + } + + return ret; +} + +static bool disable_1tb_segments = false; + +static int __init parse_disable_1tb_segments(char *p) +{ + disable_1tb_segments = true; + return 0; +} +early_param("disable_1tb_segments", parse_disable_1tb_segments); + +static int __init htab_dt_scan_seg_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); + if (prop == NULL) + return 0; + for (; size >= 4; size -= 4, ++prop) { + if (be32_to_cpu(prop[0]) == 40) { + DBG("1T segment support detected\n"); + + if (disable_1tb_segments) { + DBG("1T segments disabled by command line\n"); + break; + } + + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; + return 1; + } + } + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 0; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x14: + idx = MMU_PAGE_1M; + break; + case 0x18: + idx = MMU_PAGE_16M; + break; + case 0x22: + idx = MMU_PAGE_16G; + break; + } + return idx; +} + +static int __init htab_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + size /= 4; + cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); + while(size > 0) { + unsigned int base_shift = be32_to_cpu(prop[0]); + unsigned int slbenc = be32_to_cpu(prop[1]); + unsigned int lpnum = be32_to_cpu(prop[2]); + struct mmu_psize_def *def; + int idx, base_idx; + + size -= 3; prop += 3; + base_idx = get_idx_from_shift(base_shift); + if (base_idx < 0) { + /* skip the pte encoding also */ + prop += lpnum * 2; size -= lpnum * 2; + continue; + } + def = &mmu_psize_defs[base_idx]; + if (base_idx == MMU_PAGE_16M) + cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; + + def->shift = base_shift; + if (base_shift <= 23) + def->avpnm = 0; + else + def->avpnm = (1 << (base_shift - 23)) - 1; + def->sllp = slbenc; + /* + * We don't know for sure what's up with tlbiel, so + * for now we only set it for 4K and 64K pages + */ + if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) + def->tlbiel = 1; + else + def->tlbiel = 0; + + while (size > 0 && lpnum) { + unsigned int shift = be32_to_cpu(prop[0]); + int penc = be32_to_cpu(prop[1]); + + prop += 2; size -= 2; + lpnum--; + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + if (penc == -1) + pr_err("Invalid penc for base_shift=%d " + 
"shift=%d\n", base_shift, shift); + + def->penc[idx] = penc; + pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," + " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", + base_shift, shift, def->sllp, + def->avpnm, def->tlbiel, def->penc[idx]); + } + } + + return 1; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Scan for 16G memory blocks that have been set aside for huge pages + * and reserve those blocks for 16G huge pages. + */ +static int __init htab_dt_scan_hugepage_blocks(unsigned long node, + const char *uname, int depth, + void *data) { + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be64 *addr_prop; + const __be32 *page_count_prop; + unsigned int expected_pages; + long unsigned int phys_addr; + long unsigned int block_size; + + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* + * This property is the log base 2 of the number of virtual pages that + * will represent this memory block. + */ + page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); + if (page_count_prop == NULL) + return 0; + expected_pages = (1 << be32_to_cpu(page_count_prop[0])); + addr_prop = of_get_flat_dt_prop(node, "reg", NULL); + if (addr_prop == NULL) + return 0; + phys_addr = be64_to_cpu(addr_prop[0]); + block_size = be64_to_cpu(addr_prop[1]); + if (block_size != (16 * GB)) + return 0; + printk(KERN_INFO "Huge page(16GB) memory: " + "addr = 0x%lX size = 0x%lX pages = %d\n", + phys_addr, block_size, expected_pages); + if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { + memblock_reserve(phys_addr, block_size * expected_pages); + pseries_add_gpage(phys_addr, block_size, expected_pages); + } + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static void mmu_psize_set_default_penc(void) +{ + int bpsize, apsize; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) + mmu_psize_defs[bpsize].penc[apsize] = -1; +} + +#ifdef CONFIG_PPC_64K_PAGES + +static bool might_have_hea(void) +{ + /* + * The HEA ethernet adapter requires awareness of the + * GX bus. Without that awareness we can easily assume + * we will never see an HEA ethernet device. + */ +#ifdef CONFIG_IBMEBUS + return !cpu_has_feature(CPU_FTR_ARCH_207S) && + firmware_has_feature(FW_FEATURE_SPLPAR); +#else + return false; +#endif +} + +#endif /* #ifdef CONFIG_PPC_64K_PAGES */ + +static void __init htab_scan_page_sizes(void) +{ + int rc; + + /* se the invalid penc to -1 */ + mmu_psize_set_default_penc(); + + /* Default to 4K pages only */ + memcpy(mmu_psize_defs, mmu_psize_defaults, + sizeof(mmu_psize_defaults)); + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); + if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { + /* + * Nothing in the device-tree, but the CPU supports 16M pages, + * so let's fallback on a known size list for 16M capable CPUs. + */ + memcpy(mmu_psize_defs, mmu_psize_defaults_gp, + sizeof(mmu_psize_defaults_gp)); + } + +#ifdef CONFIG_HUGETLB_PAGE + if (!hugetlb_disabled) { + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); + } +#endif /* CONFIG_HUGETLB_PAGE */ +} + +/* + * Fill in the hpte_page_sizes[] array. + * We go through the mmu_psize_defs[] array looking for all the + * supported base/actual page size combinations. 
Each combination + * has a unique pagesize encoding (penc) value in the low bits of + * the LP field of the HPTE. For actual page sizes less than 1MB, + * some of the upper LP bits are used for RPN bits, meaning that + * we need to fill in several entries in hpte_page_sizes[]. + * + * In diagrammatic form, with r = RPN bits and z = page size bits: + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ... + * + * The zzzz bits are implementation-specific but are chosen so that + * no encoding for a larger page size uses the same value in its + * low-order N bits as the encoding for the 2^(12+N) byte page size + * (if it exists). + */ +static void init_hpte_page_sizes(void) +{ + long int ap, bp; + long int shift, penc; + + for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { + if (!mmu_psize_defs[bp].shift) + continue; /* not a supported page size */ + for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { + penc = mmu_psize_defs[bp].penc[ap]; + if (penc == -1 || !mmu_psize_defs[ap].shift) + continue; + shift = mmu_psize_defs[ap].shift - LP_SHIFT; + if (shift <= 0) + continue; /* should never happen */ + /* + * For page sizes less than 1MB, this loop + * replicates the entry for all possible values + * of the rrrr bits. + */ + while (penc < (1 << LP_BITS)) { + hpte_page_sizes[penc] = (ap << 4) | bp; + penc += 1 << shift; + } + } + } +} + +static void __init htab_init_page_sizes(void) +{ + init_hpte_page_sizes(); + + if (!debug_pagealloc_enabled()) { + /* + * Pick a size for the linear mapping. Currently, we only + * support 16M, 1M and 4K which is the default + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + mmu_linear_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_linear_psize = MMU_PAGE_1M; + } + +#ifdef CONFIG_PPC_64K_PAGES + /* + * Pick a size for the ordinary pages. Default is 4K, we support + * 64K for user mappings and vmalloc if supported by the processor. + * We only use 64k for ioremap if the processor + * (and firmware) support cache-inhibited large pages. + * If not, we use 4k and set mmu_ci_restrictions so that + * hash_page knows to switch processes that use cache-inhibited + * mappings to 4k pages. + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift) { + mmu_virtual_psize = MMU_PAGE_64K; + mmu_vmalloc_psize = MMU_PAGE_64K; + if (mmu_linear_psize == MMU_PAGE_4K) + mmu_linear_psize = MMU_PAGE_64K; + if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { + /* + * When running on pSeries using 64k pages for ioremap + * would stop us accessing the HEA ethernet. So if we + * have the chance of ever seeing one, stay at 4k. 
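+ * (might_have_hea() errs on the side of caution: HEA is only ruled
+ * out on ARCH 2.07S capable CPUs or when not running in shared
+ * processor LPAR mode.)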
+ */ + if (!might_have_hea()) + mmu_io_psize = MMU_PAGE_64K; + } else + mmu_ci_restrictions = 1; + } +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* + * We try to use 16M pages for vmemmap if that is supported + * and we have at least 1G of RAM at boot + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift && + memblock_phys_mem_size() >= 0x40000000) + mmu_vmemmap_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_64K].shift) + mmu_vmemmap_psize = MMU_PAGE_64K; + else + mmu_vmemmap_psize = MMU_PAGE_4K; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + printk(KERN_DEBUG "Page orders: linear mapping = %d, " + "virtual = %d, io = %d" +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ", vmemmap = %d" +#endif + "\n", + mmu_psize_defs[mmu_linear_psize].shift, + mmu_psize_defs[mmu_virtual_psize].shift, + mmu_psize_defs[mmu_io_psize].shift +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ,mmu_psize_defs[mmu_vmemmap_psize].shift +#endif + ); +} + +static int __init htab_dt_scan_pftsize(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); + if (prop != NULL) { + /* pft_size[0] is the NUMA CEC cookie */ + ppc64_pft_size = be32_to_cpu(prop[1]); + return 1; + } + return 0; +} + +unsigned htab_shift_for_mem_size(unsigned long mem_size) +{ + unsigned memshift = __ilog2(mem_size); + unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned pteg_shift; + + /* round mem_size up to next power of 2 */ + if ((1UL << memshift) < mem_size) + memshift += 1; + + /* aim for 2 pages / pteg */ + pteg_shift = memshift - (pshift + 1); + + /* + * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab + * size permitted by the architecture. + */ + return max(pteg_shift + 7, 18U); +} + +static unsigned long __init htab_get_table_size(void) +{ + /* + * If hash size isn't already provided by the platform, we try to + * retrieve it from the device-tree. 
If it's not there neither, we + * calculate it now based on the total RAM size + */ + if (ppc64_pft_size == 0) + of_scan_flat_dt(htab_dt_scan_pftsize, NULL); + if (ppc64_pft_size) + return 1UL << ppc64_pft_size; + + return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size()); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int resize_hpt_for_hotplug(unsigned long new_mem_size) +{ + unsigned target_hpt_shift; + + if (!mmu_hash_ops.resize_hpt) + return 0; + + target_hpt_shift = htab_shift_for_mem_size(new_mem_size); + + /* + * To avoid lots of HPT resizes if memory size is fluctuating + * across a boundary, we deliberately have some hysterisis + * here: we immediately increase the HPT size if the target + * shift exceeds the current shift, but we won't attempt to + * reduce unless the target shift is at least 2 below the + * current shift + */ + if (target_hpt_shift > ppc64_pft_size || + target_hpt_shift < ppc64_pft_size - 1) + return mmu_hash_ops.resize_hpt(target_hpt_shift); + + return 0; +} + +int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) +{ + int rc; + + if (end >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize); + + if (rc < 0) { + int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +int hash__remove_section_mapping(unsigned long start, unsigned long end) +{ + int rc = htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); + WARN_ON(rc < 0); + return rc; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static void __init hash_init_partition_table(phys_addr_t hash_table, + unsigned long htab_size) +{ + mmu_partition_table_init(); + + /* + * PS field (VRMA page size) is not used for LPID 0, hence set to 0. + * For now, UPRT is 0 and we have no segment table. + */ + htab_size = __ilog2(htab_size) - 18; + mmu_partition_table_set_entry(0, hash_table | htab_size, 0); + pr_info("Partition table %p\n", partition_tb); +} + +static void __init htab_initialize(void) +{ + unsigned long table; + unsigned long pteg_count; + unsigned long prot; + unsigned long base = 0, size = 0; + struct memblock_region *reg; + + DBG(" -> htab_initialize()\n"); + + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + mmu_kernel_ssize = MMU_SEGSIZE_1T; + mmu_highuser_ssize = MMU_SEGSIZE_1T; + printk(KERN_INFO "Using 1TB segments\n"); + } + + /* + * Calculate the required size of the htab. We want the number of + * PTEGs to equal one half the number of real pages. + */ + htab_size_bytes = htab_get_table_size(); + pteg_count = htab_size_bytes >> 7; + + htab_hash_mask = pteg_count - 1; + + if (firmware_has_feature(FW_FEATURE_LPAR) || + firmware_has_feature(FW_FEATURE_PS3_LV1)) { + /* Using a hypervisor which owns the htab */ + htab_address = NULL; + _SDR1 = 0; + /* + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall + * to inform the hypervisor that we wish to use the HPT. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + register_process_table(0, 0, 0); +#ifdef CONFIG_FA_DUMP + /* + * If firmware assisted dump is active firmware preserves + * the contents of htab along with entire partition memory. + * Clear the htab if firmware assisted dump is active so + * that we dont end up using old mappings. 
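+ * (after a fadump crash boot the HPT still holds the previous
+ * kernel's translations, so start from a clean slate before
+ * bolting our own entries)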
+ */ + if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); +#endif + } else { + unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; + +#ifdef CONFIG_PPC_CELL + /* + * Cell may require the hash table down low when using the + * Axon IOMMU in order to fit the dynamic region over it, see + * comments in cell/iommu.c + */ + if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { + limit = 0x80000000; + pr_info("Hash table forced below 2G for Axon IOMMU\n"); + } +#endif /* CONFIG_PPC_CELL */ + + table = memblock_phys_alloc_range(htab_size_bytes, + htab_size_bytes, + 0, limit); + if (!table) + panic("ERROR: Failed to allocate %pa bytes below %pa\n", + &htab_size_bytes, &limit); + + DBG("Hash table allocated at %lx, size: %lx\n", table, + htab_size_bytes); + + htab_address = __va(table); + + /* htab absolute addr + encoded htabsize */ + _SDR1 = table + __ilog2(htab_size_bytes) - 18; + + /* Initialize the HPT with no entries */ + memset((void *)table, 0, htab_size_bytes); + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + else + hash_init_partition_table(table, htab_size_bytes); + } + + prot = pgprot_val(PAGE_KERNEL); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled()) { + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = memblock_alloc_try_nid( + linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!linear_map_hash_slots) + panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", + __func__, linear_map_hash_count, &ppc64_rma_size); + } +#endif /* CONFIG_DEBUG_PAGEALLOC */ + + /* create bolted the linear mapping in the hash table */ + for_each_memblock(memory, reg) { + base = (unsigned long)__va(reg->base); + size = reg->size; + + DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", + base, size, prot); + + if ((base + size) >= H_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), + prot, mmu_linear_psize, mmu_kernel_ssize)); + } + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + /* + * If we have a memory_limit and we've allocated TCEs then we need to + * explicitly map the TCE area at the top of RAM. We also cope with the + * case that the TCEs start below memory_limit. + * tce_alloc_start/end are 16MB aligned so the mapping should work + * for either 4K or 16MB pages. + */ + if (tce_alloc_start) { + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); + + if (base + size >= tce_alloc_start) + tce_alloc_start = base + size + 1; + + BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, + __pa(tce_alloc_start), prot, + mmu_linear_psize, mmu_kernel_ssize)); + } + + + DBG(" <- htab_initialize()\n"); +} +#undef KB +#undef MB + +void __init hash__early_init_devtree(void) +{ + /* Initialize segment sizes */ + of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); + + /* Initialize page sizes */ + htab_scan_page_sizes(); +} + +struct hash_mm_context init_hash_mm_context; +void __init hash__early_init_mmu(void) +{ +#ifndef CONFIG_PPC_64K_PAGES + /* + * We have code in __hash_page_4K() and elsewhere, which assumes it can + * do the following: + * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); + * + * Where the slot number is between 0-15, and values of 8-15 indicate + * the secondary bucket. 
For that code to work H_PAGE_F_SECOND and
+ * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
+ * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
+ * with a BUILD_BUG_ON().
+ */
+ BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3)));
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ htab_init_page_sizes();
+
+ /*
+ * Initialize the page table size constants.
+ */
+ __pte_frag_nr = H_PTE_FRAG_NR;
+ __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = H_PMD_FRAG_NR;
+ __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
+
+ __pte_index_size = H_PTE_INDEX_SIZE;
+ __pmd_index_size = H_PMD_INDEX_SIZE;
+ __pud_index_size = H_PUD_INDEX_SIZE;
+ __pgd_index_size = H_PGD_INDEX_SIZE;
+ __pud_cache_index = H_PUD_CACHE_INDEX;
+ __pte_table_size = H_PTE_TABLE_SIZE;
+ __pmd_table_size = H_PMD_TABLE_SIZE;
+ __pud_table_size = H_PUD_TABLE_SIZE;
+ __pgd_table_size = H_PGD_TABLE_SIZE;
+ /*
+ * 4k uses hugepd format, so for hash set them to
+ * zero
+ */
+ __pmd_val_bits = HASH_PMD_VAL_BITS;
+ __pud_val_bits = HASH_PUD_VAL_BITS;
+ __pgd_val_bits = HASH_PGD_VAL_BITS;
+
+ __kernel_virt_start = H_KERN_VIRT_START;
+ __vmalloc_start = H_VMALLOC_START;
+ __vmalloc_end = H_VMALLOC_END;
+ __kernel_io_start = H_KERN_IO_START;
+ __kernel_io_end = H_KERN_IO_END;
+ vmemmap = (struct page *)H_VMEMMAP_START;
+ ioremap_bot = IOREMAP_BASE;
+
+#ifdef CONFIG_PCI
+ pci_io_base = ISA_IO_BASE;
+#endif
+
+ /* Select appropriate backend */
+ if (firmware_has_feature(FW_FEATURE_PS3_LV1))
+ ps3_early_mm_init();
+ else if (firmware_has_feature(FW_FEATURE_LPAR))
+ hpte_init_pseries();
+ else if (IS_ENABLED(CONFIG_PPC_NATIVE))
+ hpte_init_native();
+
+ if (!mmu_hash_ops.hpte_insert)
+ panic("hash__early_init_mmu: No MMU hash ops defined!\n");
+
+ /*
+ * Initialize the MMU Hash table and create the linear mapping
+ * of memory. Has to be done before SLB initialization as this is
+ * currently where the page size encoding is obtained.
+ */ + htab_initialize(); + + init_mm.context.hash_context = &init_hash_mm_context; + init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + + pr_info("Initializing hash mmu with SLB\n"); + /* Initialize SLB management */ + slb_initialize(); + + if (cpu_has_feature(CPU_FTR_ARCH_206) + && cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +#ifdef CONFIG_SMP +void hash__early_init_mmu_secondary(void) +{ + /* Initialize hash table for that CPU */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + mtspr(SPRN_SDR1, _SDR1); + else + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } + /* Initialize SLB */ + slb_initialize(); + + if (cpu_has_feature(CPU_FTR_ARCH_206) + && cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} +#endif /* CONFIG_SMP */ + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) +{ + struct page *page; + + if (!pfn_valid(pte_pfn(pte))) + return pp; + + page = pte_page(pte); + + /* page is dirty */ + if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { + if (trap == 0x400) { + flush_dcache_icache_page(page); + set_bit(PG_arch_1, &page->flags); + } else + pp |= HPTE_R_N; + } + return pp; +} + +#ifdef CONFIG_PPC_MM_SLICES +static unsigned int get_paca_psize(unsigned long addr) +{ + unsigned char *psizes; + unsigned long index, mask_index; + + if (addr < SLICE_LOW_TOP) { + psizes = get_paca()->mm_ctx_low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + } else { + psizes = get_paca()->mm_ctx_high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + } + mask_index = index & 0x1; + return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; +} + +#else +unsigned int get_paca_psize(unsigned long addr) +{ + return get_paca()->mm_ctx_user_psize; +} +#endif + +/* + * Demote a segment to using 4k pages. + * For now this makes the whole process use 4k pages. + */ +#ifdef CONFIG_PPC_64K_PAGES +void demote_segment_4k(struct mm_struct *mm, unsigned long addr) +{ + if (get_slice_psize(mm, addr) == MMU_PAGE_4K) + return; + slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); + copro_flush_all_slbs(mm); + if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { + + copy_mm_to_paca(mm); + slb_flush_and_restore_bolted(); + } +} +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_SUBPAGE_PROT +/* + * This looks up a 2-bit protection code for a 4k subpage of a 64k page. + * Userspace sets the subpage permissions using the subpage_prot system call. + * + * Result is 0: full permissions, _PAGE_RW: read-only, + * _PAGE_RWX: no access. + */ +static int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + u32 spp = 0; + u32 **sbpm, *sbpp; + + if (!spt) + return 0; + + if (ea >= spt->maxaddr) + return 0; + if (ea < 0x100000000UL) { + /* addresses below 4GB use spt->low_prot */ + sbpm = spt->low_prot; + } else { + sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; + if (!sbpm) + return 0; + } + sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!sbpp) + return 0; + spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; + + /* extract 2-bit bitfield for this 4k subpage */ + spp >>= 30 - 2 * ((ea >> 12) & 0xf); + + /* + * 0 -> full premission + * 1 -> Read only + * 2 -> no access. + * We return the flag that need to be cleared. + */ + spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? 
_PAGE_WRITE : 0); + return spp; +} + +#else /* CONFIG_PPC_SUBPAGE_PROT */ +static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + return 0; +} +#endif + +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, int lpsize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", + trap, vsid, ssize, psize, lpsize, pte); +} + +static void check_paca_psize(unsigned long ea, struct mm_struct *mm, + int psize, bool user_region) +{ + if (user_region) { + if (psize != get_paca_psize(ea)) { + copy_mm_to_paca(mm); + slb_flush_and_restore_bolted(); + } + } else if (get_paca()->vmalloc_sllp != + mmu_psize_defs[mmu_vmalloc_psize].sllp) { + get_paca()->vmalloc_sllp = + mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_vmalloc_update(); + } +} + +/* + * Result code is: + * 0 - handled + * 1 - normal page fault + * -1 - critical hash insertion error + * -2 - access not permitted by subpage protection mechanism + */ +int hash_page_mm(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap, + unsigned long flags) +{ + bool is_thp; + enum ctx_state prev_state = exception_enter(); + pgd_t *pgdir; + unsigned long vsid; + pte_t *ptep; + unsigned hugeshift; + int rc, user_region = 0; + int psize, ssize; + + DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", + ea, access, trap); + trace_hash_fault(ea, access, trap); + + /* Get region & vsid */ + switch (get_region_id(ea)) { + case USER_REGION_ID: + user_region = 1; + if (! mm) { + DBG_LOW(" user region with no mm !\n"); + rc = 1; + goto bail; + } + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + break; + case VMALLOC_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + break; + + case IO_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + break; + default: + /* + * Not a valid range + * Send the problem up to do_page_fault() + */ + rc = 1; + goto bail; + } + DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + + /* Bad address. */ + if (!vsid) { + DBG_LOW("Bad address!\n"); + rc = 1; + goto bail; + } + /* Get pgdir */ + pgdir = mm->pgd; + if (pgdir == NULL) { + rc = 1; + goto bail; + } + + /* Check CPU locality */ + if (user_region && mm_is_thread_local(mm)) + flags |= HPTE_LOCAL_UPDATE; + +#ifndef CONFIG_PPC_64K_PAGES + /* + * If we use 4K pages and our psize is not 4K, then we might + * be hitting a special driver mapping, and need to align the + * address before we fetch the PTE. + * + * It could also be a hugepage mapping, in which case this is + * not necessary, but it's not harmful, either. 
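+ *
+ * e.g. for a 16M driver mapping, this masks ea down to the 16M
+ * boundary so the page table walk below finds the PTE that actually
+ * covers the mapping.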
+ */ + if (psize != MMU_PAGE_4K) + ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get PTE and page size from page tables */ + ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); + if (ptep == NULL || !pte_present(*ptep)) { + DBG_LOW(" no PTE !\n"); + rc = 1; + goto bail; + } + + /* Add _PAGE_PRESENT to the required access perm */ + access |= _PAGE_PRESENT; + + /* + * Pre-check access permissions (will be re-checked atomically + * in __hash_page_XX but this pre-check is a fast path + */ + if (!check_pte_access(access, pte_val(*ptep))) { + DBG_LOW(" no access !\n"); + rc = 1; + goto bail; + } + + if (hugeshift) { + if (is_thp) + rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, + trap, flags, ssize, psize); +#ifdef CONFIG_HUGETLB_PAGE + else + rc = __hash_page_huge(ea, access, vsid, ptep, trap, + flags, ssize, hugeshift, psize); +#else + else { + /* + * if we have hugeshift, and is not transhuge with + * hugetlb disabled, something is really wrong. + */ + rc = 1; + WARN_ON(1); + } +#endif + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + + goto bail; + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + /* Do actual hashing */ +#ifdef CONFIG_PPC_64K_PAGES + /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } + + /* + * If this PTE is non-cacheable and we have restrictions on + * using non cacheable large pages, then we switch to 4k + */ + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { + if (user_region) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } else if (ea < VMALLOC_END) { + /* + * some driver did a non-cacheable mapping + * in vmalloc space, so switch vmalloc + * to 4k pages + */ + printk(KERN_ALERT "Reducing vmalloc segment " + "to 4kB pages because of " + "non-cacheable mapping\n"); + psize = mmu_vmalloc_psize = MMU_PAGE_4K; + copro_flush_all_slbs(mm); + } + } + +#endif /* CONFIG_PPC_64K_PAGES */ + + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + +#ifdef CONFIG_PPC_64K_PAGES + if (psize == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + flags, ssize); + else +#endif /* CONFIG_PPC_64K_PAGES */ + { + int spp = subpage_protection(mm, ea); + if (access & spp) + rc = -2; + else + rc = __hash_page_4K(ea, access, vsid, ptep, trap, + flags, ssize, spp); + } + + /* + * Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + psize, pte_val(*ptep)); +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + DBG_LOW(" -> rc=%d\n", rc); + +bail: + exception_exit(prev_state); + return rc; +} +EXPORT_SYMBOL_GPL(hash_page_mm); + +int hash_page(unsigned long ea, unsigned long access, unsigned long trap, + unsigned long dsisr) +{ + unsigned long flags = 0; + struct mm_struct *mm = current->mm; + + if ((get_region_id(ea) == VMALLOC_REGION_ID) || + (get_region_id(ea) == IO_REGION_ID)) + mm = &init_mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + return hash_page_mm(mm, ea, access, trap, flags); 
+} +EXPORT_SYMBOL_GPL(hash_page); + +int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, + unsigned long dsisr) +{ + unsigned long access = _PAGE_PRESENT | _PAGE_READ; + unsigned long flags = 0; + struct mm_struct *mm = current->mm; + unsigned int region_id = get_region_id(ea); + + if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) + mm = &init_mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + if (dsisr & DSISR_ISSTORE) + access |= _PAGE_WRITE; + /* + * We set _PAGE_PRIVILEGED only when + * kernel mode access kernel space. + * + * _PAGE_PRIVILEGED is NOT set + * 1) when kernel mode access user space + * 2) user space access kernel space. + */ + access |= _PAGE_PRIVILEGED; + if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) + access &= ~_PAGE_PRIVILEGED; + + if (trap == 0x400) + access |= _PAGE_EXEC; + + return hash_page_mm(mm, ea, access, trap, flags); +} + +#ifdef CONFIG_PPC_MM_SLICES +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + int psize = get_slice_psize(mm, ea); + + /* We only prefault standard pages for now */ + if (unlikely(psize != mm_ctx_user_psize(&mm->context))) + return false; + + /* + * Don't prefault if subpage protection is enabled for the EA. + */ + if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) + return false; + + return true; +} +#else +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + return true; +} +#endif + +void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) +{ + int hugepage_shift; + unsigned long vsid; + pgd_t *pgdir; + pte_t *ptep; + unsigned long flags; + int rc, ssize, update_flags = 0; + unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); + + BUG_ON(get_region_id(ea) != USER_REGION_ID); + + if (!should_hash_preload(mm, ea)) + return; + + DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," + " trap=%lx\n", mm, mm->pgd, ea, access, trap); + + /* Get Linux PTE if available */ + pgdir = mm->pgd; + if (pgdir == NULL) + return; + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_user_vsid(&mm->context, ea, ssize); + if (!vsid) + return; + /* + * Hash doesn't like irqs. Walking linux page table with irq disabled + * saves us from holding multiple locks. + */ + local_irq_save(flags); + + /* + * THP pages use update_mmu_cache_pmd. We don't do + * hash preload there. Hence can ignore THP here + */ + ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); + if (!ptep) + goto out_exit; + + WARN_ON(hugepage_shift); +#ifdef CONFIG_PPC_64K_PAGES + /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on + * a 64K kernel), then we don't preload, hash_page() will take + * care of it once we actually try to access the page. + * That way we don't have to duplicate all of the logic for segment + * page size demotion here + */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) + goto out_exit; +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Is that local to this CPU ? 
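+ * If so, tag the update as local so the flush side can use the
+ * cheaper tlbiel instruction rather than a broadcast tlbie.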
*/ + if (mm_is_thread_local(mm)) + update_flags |= HPTE_LOCAL_UPDATE; + + /* Hash it in */ +#ifdef CONFIG_PPC_64K_PAGES + if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + update_flags, ssize); + else +#endif /* CONFIG_PPC_64K_PAGES */ + rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, + ssize, subpage_protection(mm, ea)); + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm_ctx_user_psize(&mm->context), + mm_ctx_user_psize(&mm->context), + pte_val(*ptep)); +out_exit: + local_irq_restore(flags); +} + +#ifdef CONFIG_PPC_MEM_KEYS +/* + * Return the protection key associated with the given address and the + * mm_struct. + */ +u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) +{ + pte_t *ptep; + u16 pkey = 0; + unsigned long flags; + + if (!mm || !mm->pgd) + return 0; + + local_irq_save(flags); + ptep = find_linux_pte(mm->pgd, address, NULL, NULL); + if (ptep) + pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); + local_irq_restore(flags); + + return pkey; +} +#endif /* CONFIG_PPC_MEM_KEYS */ + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static inline void tm_flush_hash_page(int local) +{ + /* + * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a + * page back to a block device w/PIO could pick up transactional data + * (bad!) so we force an abort here. Before the sync the page will be + * made read-only, which will flush_hash_page. BIG ISSUE here: if the + * kernel uses a page from userspace without unmapping it first, it may + * see the speculated version. + */ + if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && + MSR_TM_ACTIVE(current->thread.regs->msr)) { + tm_enable(); + tm_abort(TM_CAUSE_TLBI); + } +} +#else +static inline void tm_flush_hash_page(int local) +{ +} +#endif + +/* + * Return the global hash slot, corresponding to the given PTE, which contains + * the HPTE. + */ +unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, + int ssize, real_pte_t rpte, unsigned int subpg_index) +{ + unsigned long hash, gslot, hidx; + + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(rpte, subpg_index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + gslot += hidx & _PTEIDX_GROUP_IX; + return gslot; +} + +/* + * WARNING: This is called from hash_low_64.S, if you change this prototype, + * do not forget to update the assembly call site ! 
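+ * (the C prototype and the asm call site must be kept in sync by
+ * hand; nothing type-checks across that boundary)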
+ */ +void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, + unsigned long flags) +{ + unsigned long index, shift, gslot; + int local = flags & HPTE_LOCAL_UPDATE; + + DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index); + DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot); + /* + * We use same base page size and actual psize, because we don't + * use these functions for hugepage + */ + mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize, + ssize, local); + } pte_iterate_hashed_end(); + + tm_flush_hash_page(local); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void flush_hash_hugepage(unsigned long vsid, unsigned long addr, + pmd_t *pmdp, unsigned int psize, int ssize, + unsigned long flags) +{ + int i, max_hpte_count, valid; + unsigned long s_addr; + unsigned char *hpte_slot_array; + unsigned long hidx, shift, vpn, hash, slot; + int local = flags & HPTE_LOCAL_UPDATE; + + s_addr = addr & HPAGE_PMD_MASK; + hpte_slot_array = get_hpte_slot_array(pmdp); + /* + * IF we try to do a HUGE PTE update after a withdraw is done. + * we will find the below NULL. This happens when we do + * split_huge_page_pmd + */ + if (!hpte_slot_array) + return; + + if (mmu_hash_ops.hugepage_invalidate) { + mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array, + psize, ssize, local); + goto tm_abort; + } + /* + * No bluk hpte removal support, invalidate each entry + */ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE >> shift; + for (i = 0; i < max_hpte_count; i++) { + /* + * 8 bits per each hpte entries + * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] + */ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, local); + } +tm_abort: + tm_flush_hash_page(local); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +void flush_hash_range(unsigned long number, int local) +{ + if (mmu_hash_ops.flush_hash_range) + mmu_hash_ops.flush_hash_range(number, local); + else { + int i; + struct ppc64_tlb_batch *batch = + this_cpu_ptr(&ppc64_tlb_batch); + + for (i = 0; i < number; i++) + flush_hash_page(batch->vpn[i], batch->pte[i], + batch->psize, batch->ssize, local); + } +} + +/* + * low_hash_fault is called when we the low level hash code failed + * to instert a PTE due to an hypervisor error + */ +void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) +{ + enum ctx_state prev_state = exception_enter(); + + if (user_mode(regs)) { +#ifdef CONFIG_PPC_SUBPAGE_PROT + if (rc == -2) + _exception(SIGSEGV, regs, SEGV_ACCERR, address); + else +#endif + _exception(SIGBUS, regs, BUS_ADRERR, address); + } else + bad_page_fault(regs, address, SIGBUS); + + exception_exit(prev_state); +} + +long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int ssize) +{ + unsigned long hpte_group; + long slot; + +repeat: + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + + /* Insert into the hash table, primary slot */ + slot = mmu_hash_ops.hpte_insert(hpte_group, 
vpn, pa, rflags, vflags, + psize, psize, ssize); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, + vflags | HPTE_V_SECONDARY, + psize, psize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = (hash & htab_hash_mask) * + HPTES_PER_GROUP; + + mmu_hash_ops.hpte_remove(hpte_group); + goto repeat; + } + } + + return slot; +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) +{ + unsigned long hash; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); + long ret; + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + + /* Don't create HPTE entries for bad address */ + if (!vsid) + return; + + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, + HPTE_V_BOLTED, + mmu_linear_psize, mmu_kernel_ssize); + + BUG_ON (ret < 0); + spin_lock(&linear_map_hash_lock); + BUG_ON(linear_map_hash_slots[lmi] & 0x80); + linear_map_hash_slots[lmi] = ret | 0x80; + spin_unlock(&linear_map_hash_lock); +} + +static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) +{ + unsigned long hash, hidx, slot; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + spin_lock(&linear_map_hash_lock); + BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); + hidx = linear_map_hash_slots[lmi] & 0x7f; + linear_map_hash_slots[lmi] = 0; + spin_unlock(&linear_map_hash_lock); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, + mmu_linear_psize, + mmu_kernel_ssize, 0); +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = __pa(vaddr) >> PAGE_SHIFT; + if (lmi >= linear_map_hash_count) + continue; + if (enable) + kernel_map_linear_page(vaddr, lmi); + else + kernel_unmap_linear_page(vaddr, lmi); + } + local_irq_restore(flags); +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* + * On virtualized systems the first entry is our RMA region aka VRMA, + * non-virtualized 64-bit hash MMU systems don't have a limitation + * on real mode access. + * + * For guests on platforms before POWER9, we clamp the it limit to 1G + * to avoid some funky things such as RTAS bugs etc... 
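+ * (the min_t() clamp to 0x40000000 below is that 1G limit; firmware
+ * may advertise a larger RMA, but older RTAS was not reliable above
+ * 1G)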
+ */ + if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { + ppc64_rma_size = first_memblock_size; + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) + ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(ppc64_rma_size); + } else { + ppc64_rma_size = ULONG_MAX; + } +} + +#ifdef CONFIG_DEBUG_FS + +static int hpt_order_get(void *data, u64 *val) +{ + *val = ppc64_pft_size; + return 0; +} + +static int hpt_order_set(void *data, u64 val) +{ + if (!mmu_hash_ops.resize_hpt) + return -ENODEV; + + return mmu_hash_ops.resize_hpt(val); +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); + +static int __init hash64_debugfs(void) +{ + if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, + NULL, &fops_hpt_order)) { + pr_err("lpar: unable to create hpt_order debugsfs file\n"); + } + + return 0; +} +machine_device_initcall(pseries, hash64_debugfs); +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c new file mode 100644 index 000000000000..e7a9c4f6bfca --- /dev/null +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -0,0 +1,482 @@ +/* + * IOMMU helpers in MMU context. + * + * Copyright (C) 2015 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(mem_list_mutex); + +#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 +#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) + +struct mm_iommu_table_group_mem_t { + struct list_head next; + struct rcu_head rcu; + unsigned long used; + atomic64_t mapped; + unsigned int pageshift; + u64 ua; /* userspace address */ + u64 entries; /* number of entries in hpas/hpages[] */ + /* + * in mm_iommu_get we temporarily use this to store + * struct page address. + * + * We need to convert ua to hpa in real mode. Make it + * simpler by storing physical address. + */ + union { + struct page **hpages; /* vmalloc'ed */ + phys_addr_t *hpas; + }; +#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) + u64 dev_hpa; /* Device memory base address */ +}; + +static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, + unsigned long npages, bool incr) +{ + long ret = 0, locked, lock_limit; + + if (!npages) + return 0; + + down_write(&mm->mmap_sem); + + if (incr) { + locked = mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + mm->locked_vm += npages; + } else { + if (WARN_ON_ONCE(npages > mm->locked_vm)) + npages = mm->locked_vm; + mm->locked_vm -= npages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", + current ? current->pid : 0, + incr ? 
'+' : '-', + npages << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK)); + up_write(&mm->mmap_sem); + + return ret; +} + +bool mm_iommu_preregistered(struct mm_struct *mm) +{ + return !list_empty(&mm->context.iommu_group_mem_list); +} +EXPORT_SYMBOL_GPL(mm_iommu_preregistered); + +static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + struct mm_iommu_table_group_mem_t *mem; + long i, ret, locked_entries = 0; + unsigned int pageshift; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, + next) { + /* Overlap? */ + if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem->ua + + (mem->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + goto unlock_exit; + } + + } + + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { + ret = mm_iommu_adjust_locked_vm(mm, entries, true); + if (ret) + goto unlock_exit; + + locked_entries = entries; + } + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + goto unlock_exit; + } + + if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) { + mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT)); + mem->dev_hpa = dev_hpa; + goto good_exit; + } + mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA; + + /* + * For a starting point for a maximum page size calculation + * we use @ua and @entries natural alignment to allow IOMMU pages + * smaller than huge pages but still bigger than PAGE_SIZE. + */ + mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)); + mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0]))); + if (!mem->hpas) { + kfree(mem); + ret = -ENOMEM; + goto unlock_exit; + } + + down_read(&mm->mmap_sem); + ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + up_read(&mm->mmap_sem); + if (ret != entries) { + /* free the reference taken */ + for (i = 0; i < ret; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + ret = -EFAULT; + goto unlock_exit; + } + + pageshift = PAGE_SHIFT; + for (i = 0; i < entries; ++i) { + struct page *page = mem->hpages[i]; + + /* + * Allow to use larger than 64k IOMMU pages. Only do that + * if we are backed by hugetlb. + */ + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { + struct page *head = compound_head(page); + + pageshift = compound_order(head) + PAGE_SHIFT; + } + mem->pageshift = min(mem->pageshift, pageshift); + /* + * We don't need struct page reference any more, switch + * to physical address. 
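+ *
+ * The low bits of each hpas[] entry double as flag space: the
+ * real mode dirty tracking path ORs in
+ * MM_IOMMU_TABLE_GROUP_PAGE_DIRTY, and lookups mask it back out
+ * with MM_IOMMU_TABLE_GROUP_PAGE_MASK.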
+ */ + mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; + } + +good_exit: + ret = 0; + atomic64_set(&mem->mapped, 1); + mem->used = 1; + mem->ua = ua; + mem->entries = entries; + *pmem = mem; + + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); + +unlock_exit: + if (locked_entries && ret) + mm_iommu_adjust_locked_vm(mm, locked_entries, false); + + mutex_unlock(&mem_list_mutex); + + return ret; +} + +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA, + pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_new); + +long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_newdev); + +static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) +{ + long i; + struct page *page = NULL; + + if (!mem->hpas) + return; + + for (i = 0; i < mem->entries; ++i) { + if (!mem->hpas[i]) + continue; + + page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); + if (!page) + continue; + + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) + SetPageDirty(page); + + put_page(page); + mem->hpas[i] = 0; + } +} + +static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) +{ + + mm_iommu_unpin(mem); + vfree(mem->hpas); + kfree(mem); +} + +static void mm_iommu_free(struct rcu_head *head) +{ + struct mm_iommu_table_group_mem_t *mem = container_of(head, + struct mm_iommu_table_group_mem_t, rcu); + + mm_iommu_do_free(mem); +} + +static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) +{ + list_del_rcu(&mem->next); + call_rcu(&mem->rcu, mm_iommu_free); +} + +long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) +{ + long ret = 0; + unsigned long entries, dev_hpa; + + mutex_lock(&mem_list_mutex); + + if (mem->used == 0) { + ret = -ENOENT; + goto unlock_exit; + } + + --mem->used; + /* There are still users, exit */ + if (mem->used) + goto unlock_exit; + + /* Are there still mappings? 
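+ * mapped is 1 while the region is unused by hardware; if the
+ * cmpxchg below wins the 1 -> 0 transition, no concurrent
+ * mm_iommu_mapped_inc() can take a new reference and the region
+ * can be torn down safely.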
*/ + if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { + ++mem->used; + ret = -EBUSY; + goto unlock_exit; + } + + /* @mapped became 0 so now mappings are disabled, release the region */ + entries = mem->entries; + dev_hpa = mem->dev_hpa; + mm_iommu_release(mem); + + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + mm_iommu_adjust_locked_vm(mm, entries, false); + +unlock_exit: + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_put); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_lookup); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, + next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} + +struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, + unsigned long ua, unsigned long entries) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if ((mem->ua == ua) && (mem->entries == entries)) { + ret = mem; + ++mem->used; + break; + } + } + + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_get); + +long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned int pageshift, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + u64 *va; + + if (entry >= mem->entries) + return -EFAULT; + + if (pageshift > mem->pageshift) + return -EFAULT; + + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + va = &mem->hpas[entry]; + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); + + return 0; +} +EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); + +long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned int pageshift, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + unsigned long *pa; + + if (entry >= mem->entries) + return -EFAULT; + + if (pageshift > mem->pageshift) + return -EFAULT; + + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); + if (!pa) + return -EFAULT; + + *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); + + return 0; +} + +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) +{ + struct mm_iommu_table_group_mem_t *mem; + long entry; + void *va; + unsigned long *pa; + + mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); + if (!mem) + return; + + if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) + return; + + entry = (ua - mem->ua) >> PAGE_SHIFT; + va = &mem->hpas[entry]; + + pa = (void *) vmalloc_to_phys(va); + if (!pa) + return; + + *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; +} + +bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, + unsigned int pageshift, unsigned long *size) +{ + struct mm_iommu_table_group_mem_t *mem; + unsigned long end; + + list_for_each_entry_rcu(mem, 
&mm->context.iommu_group_mem_list, next) {
+		if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+			continue;
+
+		end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
+		if ((mem->dev_hpa <= hpa) && (hpa < end)) {
+			/*
+			 * Since the IOMMU page size might be bigger than
+			 * PAGE_SIZE, the amount of preregistered memory
+			 * starting from @hpa might be smaller than 1<<pageshift
+			 * and the caller needs to distinguish this situation.
+			 */
+			*size = min(1UL << pageshift, end - hpa);
+			return true;
+		}
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+	if (atomic64_inc_not_zero(&mem->mapped))
+		return 0;
+
+	/* Last mm_iommu_put() has been called, no more mappings allowed */
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+	atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(struct mm_struct *mm)
+{
+	INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+}
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
new file mode 100644
index 000000000000..cb2b08635508
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -0,0 +1,263 @@
+/*
+ * MMU context allocation for 64-bit kernels.
+ *
+ * Copyright (C) 2004 Anton Blanchard, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+static DEFINE_IDA(mmu_context_ida);
+
+static int alloc_context_id(int min_id, int max_id)
+{
+	return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
+}
+
+void hash__reserve_context_id(int id)
+{
+	int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
+
+	WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
+int hash__alloc_context_id(void)
+{
+	unsigned long max;
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		max = MAX_USER_CONTEXT;
+	else
+		max = MAX_USER_CONTEXT_65BIT_VA;
+
+	return alloc_context_id(MIN_USER_CONTEXT, max);
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+
+void slb_setup_new_exec(void);
+
+static int hash__init_new_context(struct mm_struct *mm)
+{
+	int index;
+
+	index = hash__alloc_context_id();
+	if (index < 0)
+		return index;
+
+	mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
+					   GFP_KERNEL);
+	if (!mm->context.hash_context) {
+		ida_free(&mmu_context_ida, index);
+		return -ENOMEM;
+	}
+
+	/*
+	 * The old code would re-promote on fork, we don't do that when using
+	 * slices as it could cause problems promoting slices that have been
+	 * forced down to 4K.
+	 *
+	 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+	 * explicitly against context.id == 0. This ensures that we properly
+	 * initialize context slice details for newly allocated mm's (which will
+	 * have id == 0) and don't alter context slice inherited via fork (which
+	 * will have id != 0).
+	 *
+	 * We should not be calling init_new_context() on init_mm. Hence a
+	 * check against 0 is OK.
+	 */
+	if (mm->context.id == 0) {
+		memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
+		slice_init_new_context_exec(mm);
+	} else {
+		/* This is fork. Copy hash_context details from current->mm */
+		memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+		/* inherit subpage prot details if we have one.
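+		 * A fresh subpage_prot_table is allocated for the child so
+		 * that it does not keep using the parent's table through the
+		 * pointer that the memcpy() above duplicated.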
*/ + if (current->mm->context.hash_context->spt) { + mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), + GFP_KERNEL); + if (!mm->context.hash_context->spt) { + ida_free(&mmu_context_ida, index); + kfree(mm->context.hash_context); + return -ENOMEM; + } + } +#endif + + } + + pkey_mm_init(mm); + return index; +} + +void hash__setup_new_exec(void) +{ + slice_setup_new_exec(); + + slb_setup_new_exec(); +} + +static int radix__init_new_context(struct mm_struct *mm) +{ + unsigned long rts_field; + int index, max_id; + + max_id = (1 << mmu_pid_bits) - 1; + index = alloc_context_id(mmu_base_pid, max_id); + if (index < 0) + return index; + + /* + * set the process table entry, + */ + rts_field = radix__get_tree_size(); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + + /* + * Order the above store with subsequent update of the PID + * register (at which point HW can start loading/caching + * the entry) and the corresponding load by the MMU from + * the L2 cache. + */ + asm volatile("ptesync;isync" : : : "memory"); + + mm->context.npu_context = NULL; + mm->context.hash_context = NULL; + + return index; +} + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + if (radix_enabled()) + index = radix__init_new_context(mm); + else + index = hash__init_new_context(mm); + + if (index < 0) + return index; + + mm->context.id = index; + + mm->context.pte_frag = NULL; + mm->context.pmd_frag = NULL; +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_init(mm); +#endif + atomic_set(&mm->context.active_cpus, 0); + atomic_set(&mm->context.copros, 0); + + return 0; +} + +void __destroy_context(int context_id) +{ + ida_free(&mmu_context_ida, context_id); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +static void destroy_contexts(mm_context_t *ctx) +{ + int index, context_id; + + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } + kfree(ctx->hash_context); +} + +static void pmd_frag_destroy(void *pmd_frag) +{ + int count; + struct page *page; + + page = virt_to_page(pmd_frag); + /* drop all the pending references */ + count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { + pgtable_pmd_page_dtor(page); + __free_page(page); + } +} + +static void destroy_pagetable_cache(struct mm_struct *mm) +{ + void *frag; + + frag = mm->context.pte_frag; + if (frag) + pte_frag_destroy(frag); + + frag = mm->context.pmd_frag; + if (frag) + pmd_frag_destroy(frag); + return; +} + +void destroy_context(struct mm_struct *mm) +{ +#ifdef CONFIG_SPAPR_TCE_IOMMU + WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); +#endif + if (radix_enabled()) + WARN_ON(process_tb[mm->context.id].prtb0 != 0); + else + subpage_prot_free(mm); + destroy_contexts(&mm->context); + mm->context.id = MMU_NO_CONTEXT; +} + +void arch_exit_mmap(struct mm_struct *mm) +{ + destroy_pagetable_cache(mm); + + if (radix_enabled()) { + /* + * Radix doesn't have a valid bit in the process table + * entries. However we know that at least P9 implementation + * will avoid caching an entry with an invalid RTS field, + * and 0 is invalid. So this will do. + * + * This runs before the "fullmm" tlb flush in exit_mmap, + * which does a RIC=2 tlbie to clear the process table + * entry. See the "fullmm" comments in tlb-radix.c. 
+ * + * No barrier required here after the store because + * this process will do the invalidate, which starts with + * ptesync. + */ + process_tb[mm->context.id].prtb0 = 0; + } +} + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + isync(); +} +#endif diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c new file mode 100644 index 000000000000..16bda049187a --- /dev/null +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -0,0 +1,449 @@ +/* + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +unsigned long __pmd_frag_nr; +EXPORT_SYMBOL(__pmd_frag_nr); +unsigned long __pmd_frag_size_shift; +EXPORT_SYMBOL(__pmd_frag_size_shift); + +int (*register_process_table)(unsigned long base, unsigned long page_size, + unsigned long tbl_size); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp)); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + /* + * We can use MMU_PAGE_2M here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pmdp_ptep(pmdp), + pmd_pte(entry), address, MMU_PAGE_2M); + } + return changed; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(pmd_lockptr(mm, pmdp)); + WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} + +static void do_nothing(void *unused) +{ + +} +/* + * Serialize against find_current_mm_pte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_current_mm_pte to finish. 
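+ * The IPI in smp_call_function_many() below provides the serialization:
+ * by the time it returns, no CPU in mm_cpumask() can still be inside a
+ * lockless walk that started before the pmd was changed.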
+ */ +void serialize_against_pte_lookup(struct mm_struct *mm) +{ + smp_mb(); + smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); +} + +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. + */ +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + unsigned long old_pmd; + + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + serialize_against_pte_lookup(vma->vm_mm); + return __pmd(old_pmd); +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pmdv; + + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + return pmd_set_protbits(__pmd(pmdv), pgprot); +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long pmdv; + + pmdv = pmd_val(pmd); + pmdv &= _HPAGE_CHG_MASK; + return pmd_set_protbits(__pmd(pmdv), newprot); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + if (radix_enabled()) + prefetch((void *)addr); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* For use by kexec */ +void mmu_cleanup_all(void) +{ + if (radix_enabled()) + radix__mmu_cleanup_all(); + else if (mmu_hash_ops.hpte_clear_all) + mmu_hash_ops.hpte_clear_all(); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid) +{ + if (radix_enabled()) + return radix__create_section_mapping(start, end, nid); + + return hash__create_section_mapping(start, end, nid); +} + +int __meminit remove_section_mapping(unsigned long start, unsigned long end) +{ + if (radix_enabled()) + return radix__remove_section_mapping(start, end); + + return hash__remove_section_mapping(start, end); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +void __init mmu_partition_table_init(void) +{ + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + unsigned long ptcr; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); + /* Initialize the Partition Table with no entries */ + partition_tb = memblock_alloc(patb_size, patb_size); + if (!partition_tb) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, patb_size, patb_size); + + /* + * update partition table control register, + * 64 K size. + */ + ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); + mtspr(SPRN_PTCR, ptcr); + powernv_set_nmmu_ptcr(ptcr); +} + +void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1) +{ + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + + partition_tb[lpid].patb0 = cpu_to_be64(dw0); + partition_tb[lpid].patb1 = cpu_to_be64(dw1); + + /* + * Global flush of TLBs and partition table caches for this lpid. + * The type of flush (hash or radix) depends on what the previous + * use of this partition ID was, not the new use. 
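+ * If the old entry had PATB_HR set it was radix, so both process- and
+ * partition-scoped caches are flushed; otherwise a single hash-style
+ * flush is issued.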
+ */ + asm volatile("ptesync" : : : "memory"); + if (old & PATB_HR) { + asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); + } else { + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); + } + /* do we need fixup here ?*/ + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} +EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); + +static pmd_t *get_pmd_from_cache(struct mm_struct *mm) +{ + void *pmd_frag, *ret; + + if (PMD_FRAG_NR == 1) + return NULL; + + spin_lock(&mm->page_table_lock); + ret = mm->context.pmd_frag; + if (ret) { + pmd_frag = ret + PMD_FRAG_SIZE; + /* + * If we have taken up all the fragments mark PTE page NULL + */ + if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0) + pmd_frag = NULL; + mm->context.pmd_frag = pmd_frag; + } + spin_unlock(&mm->page_table_lock); + return (pmd_t *)ret; +} + +static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) +{ + void *ret = NULL; + struct page *page; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + page = alloc_page(gfp); + if (!page) + return NULL; + if (!pgtable_pmd_page_ctor(page)) { + __free_pages(page, 0); + return NULL; + } + + atomic_set(&page->pt_frag_refcount, 1); + + ret = page_address(page); + /* + * if we support only one fragment just return the + * allocated page. + */ + if (PMD_FRAG_NR == 1) + return ret; + + spin_lock(&mm->page_table_lock); + /* + * If we find pgtable_page set, we return + * the allocated page with single fragement + * count. + */ + if (likely(!mm->context.pmd_frag)) { + atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + mm->context.pmd_frag = ret + PMD_FRAG_SIZE; + } + spin_unlock(&mm->page_table_lock); + + return (pmd_t *)ret; +} + +pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) +{ + pmd_t *pmd; + + pmd = get_pmd_from_cache(mm); + if (pmd) + return pmd; + + return __alloc_for_pmdcache(mm); +} + +void pmd_fragment_free(unsigned long *pmd) +{ + struct page *page = virt_to_page(pmd); + + BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&page->pt_frag_refcount)) { + pgtable_pmd_page_dtor(page); + __free_page(page); + } +} + +static inline void pgtable_free(void *table, int index) +{ + switch (index) { + case PTE_INDEX: + pte_fragment_free(table, 0); + break; + case PMD_INDEX: + pmd_fragment_free(table); + break; + case PUD_INDEX: + kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); + break; +#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) + /* 16M hugepd directory at pud level */ + case HTLB_16M_INDEX: + BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); + break; + /* 16G hugepd directory at the pgd level */ + case HTLB_16G_INDEX: + BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); + break; +#endif + /* We don't free pgd table via RCU callback */ + default: + BUG(); + } +} + +#ifdef CONFIG_SMP +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) +{ + unsigned long pgf = (unsigned long)table; + + BUG_ON(index > MAX_PGTABLE_INDEX_SIZE); + pgf |= index; + tlb_remove_table(tlb, (void *)pgf); +} + +void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & 
~MAX_PGTABLE_INDEX_SIZE); + unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + return pgtable_free(table, index); +} +#else +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) +{ + return pgtable_free(table, index); +} +#endif + +#ifdef CONFIG_PROC_FS +atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; + +void arch_report_meminfo(struct seq_file *m) +{ + /* + * Hash maps the memory with one size mmu_linear_psize. + * So don't bother to print these on hash + */ + if (!radix_enabled()) + return; + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); + seq_printf(m, "DirectMap64k: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); + seq_printf(m, "DirectMap2M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); + seq_printf(m, "DirectMap1G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); +} +#endif /* CONFIG_PROC_FS */ + +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) +{ + unsigned long pte_val; + + /* + * Clear the _PAGE_PRESENT so that no hardware parallel update is + * possible. Also keep the pte_present true so that we don't take + * wrong fault. + */ + pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); + + return __pte(pte_val); + +} + +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + if (radix_enabled()) + return radix__ptep_modify_prot_commit(vma, addr, + ptep, old_pte, pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); +} + +/* + * For hash translation mode, we use the deposited table to store hash slot + * information and they are stored at PTRS_PER_PMD offset from related pmd + * location. Hence a pmd move requires deposit and withdraw. + * + * For radix translation with split pmd ptl, we store the deposited table in the + * pmd page. Hence if we have different pmd page we need to withdraw during pmd + * move. + * + * With hash we use deposited table always irrespective of anon or not. + * With radix we use deposited table only for anonymous mapping. + */ +int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, + struct spinlock *old_pmd_ptl, + struct vm_area_struct *vma) +{ + if (radix_enabled()) + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); + + return true; +} diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c new file mode 100644 index 000000000000..ae7fca40e5b3 --- /dev/null +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * PowerPC Memory Protection Keys management + * + * Copyright 2017, Ram Pai, IBM Corporation. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+DEFINE_STATIC_KEY_TRUE(pkey_disabled);
+int  pkeys_total;		/* Total pkeys as per device tree */
+u32  initial_allocation_mask;   /* Bits set for the initially allocated keys */
+u32  reserved_allocation_mask;  /* Bits set for reserved keys */
+static bool pkey_execute_disable_supported;
+static bool pkeys_devtree_defined;	/* property exported by device tree */
+static u64 pkey_amr_mask;		/* Bits in AMR not to be touched */
+static u64 pkey_iamr_mask;		/* Bits in IAMR not to be touched */
+static u64 pkey_uamor_mask;		/* Bits in UAMOR not to be touched */
+static int execute_only_key = 2;
+
+#define AMR_BITS_PER_PKEY 2
+#define AMR_RD_BIT 0x1UL
+#define AMR_WR_BIT 0x2UL
+#define IAMR_EX_BIT 0x1UL
+#define PKEY_REG_BITS (sizeof(u64) * 8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
+
+static void scan_pkey_feature(void)
+{
+	u32 vals[2];
+	struct device_node *cpu;
+
+	cpu = of_find_node_by_type(NULL, "cpu");
+	if (!cpu)
+		return;
+
+	if (of_property_read_u32_array(cpu,
+				       "ibm,processor-storage-keys", vals, 2))
+		return;
+
+	/*
+	 * Since any pkey can be used for data or execute, we will just treat
+	 * all keys as equal and track them as one entity.
+	 */
+	pkeys_total = vals[0];
+	pkeys_devtree_defined = true;
+}
+
+static inline bool pkey_mmu_enabled(void)
+{
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		return pkeys_total;
+	else
+		return cpu_has_feature(CPU_FTR_PKEY);
+}
+
+static int pkey_initialize(void)
+{
+	int os_reserved, i;
+
+	/*
+	 * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
+	 * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
+	 * Ensure that the bits are distinct.
+	 */
+	BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
+		     (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+	/*
+	 * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
+	 * in the vmaflag. Make sure that is really the case.
+	 */
+	BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
+		     __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
+		     != (sizeof(u64) * BITS_PER_BYTE));
+
+	/* scan the device tree for pkey feature */
+	scan_pkey_feature();
+
+	/*
+	 * Let's assume 32 pkeys on P8 bare metal, if it's not defined by the
+	 * device tree. We make this exception since skiboot forgot to expose
+	 * this property on power8.
+	 */
+	if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
+	    cpu_has_feature(CPU_FTRS_POWER8))
+		pkeys_total = 32;
+
+	/*
+	 * Adjust the upper limit, based on the number of bits supported by
+	 * arch-neutral code.
+	 */
+	pkeys_total = min_t(int, pkeys_total,
+			    ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + 1));
+
+	if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
+		static_branch_enable(&pkey_disabled);
+	else
+		static_branch_disable(&pkey_disabled);
+
+	if (static_branch_likely(&pkey_disabled))
+		return 0;
+
+	/*
+	 * The device tree cannot be relied on to indicate execute_disable
+	 * support. Instead we use a PVR check.
+	 */
+	if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
+		pkey_execute_disable_supported = false;
+	else
+		pkey_execute_disable_supported = true;
+
+#ifdef CONFIG_PPC_4K_PAGES
+	/*
+	 * The OS can manage only 8 pkeys due to its inability to represent them
+	 * in the Linux 4K PTE.
+	 */
+	os_reserved = pkeys_total - 8;
+#else
+	os_reserved = 0;
+#endif
+	/* Bits are in LE format.
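+	 * The allocation masks track one key per bit counted from bit 0,
+	 * while the AMR/IAMR/UAMOR masks below are built with pkeyshift(),
+	 * which counts key 0 from the most significant end of the register.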
*/ + reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); + + /* register mask is in BE format */ + pkey_amr_mask = ~0x0ul; + pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); + + pkey_iamr_mask = ~0x0ul; + pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); + pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + + pkey_uamor_mask = ~0x0ul; + pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); + pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); + + /* mark the rest of the keys as reserved and hence unavailable */ + for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { + reserved_allocation_mask |= (0x1 << i); + pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); + } + initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); + + if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { + /* + * Insufficient number of keys to support + * execute only key. Mark it unavailable. + * Any AMR, UAMOR, IAMR bit set for + * this key is irrelevant since this key + * can never be allocated. + */ + execute_only_key = -1; + } + + return 0; +} + +arch_initcall(pkey_initialize); + +void pkey_mm_init(struct mm_struct *mm) +{ + if (static_branch_likely(&pkey_disabled)) + return; + mm_pkey_allocation_map(mm) = initial_allocation_mask; + mm->context.execute_only_pkey = execute_only_key; +} + +static inline u64 read_amr(void) +{ + return mfspr(SPRN_AMR); +} + +static inline void write_amr(u64 value) +{ + mtspr(SPRN_AMR, value); +} + +static inline u64 read_iamr(void) +{ + if (!likely(pkey_execute_disable_supported)) + return 0x0UL; + + return mfspr(SPRN_IAMR); +} + +static inline void write_iamr(u64 value) +{ + if (!likely(pkey_execute_disable_supported)) + return; + + mtspr(SPRN_IAMR, value); +} + +static inline u64 read_uamor(void) +{ + return mfspr(SPRN_UAMOR); +} + +static inline void write_uamor(u64 value) +{ + mtspr(SPRN_UAMOR, value); +} + +static bool is_pkey_enabled(int pkey) +{ + u64 uamor = read_uamor(); + u64 pkey_bits = 0x3ul << pkeyshift(pkey); + u64 uamor_pkey_bits = (uamor & pkey_bits); + + /* + * Both the bits in UAMOR corresponding to the key should be set or + * reset. + */ + WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits)); + return !!(uamor_pkey_bits); +} + +static inline void init_amr(int pkey, u8 init_bits) +{ + u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); + u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); + + write_amr(old_amr | new_amr_bits); +} + +static inline void init_iamr(int pkey, u8 init_bits) +{ + u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); + u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); + + write_iamr(old_iamr | new_iamr_bits); +} + +/* + * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that + * specified in @init_val. 
+ */ +int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + u64 new_amr_bits = 0x0ul; + u64 new_iamr_bits = 0x0ul; + + if (!is_pkey_enabled(pkey)) + return -EINVAL; + + if (init_val & PKEY_DISABLE_EXECUTE) { + if (!pkey_execute_disable_supported) + return -EINVAL; + new_iamr_bits |= IAMR_EX_BIT; + } + init_iamr(pkey, new_iamr_bits); + + /* Set the bits we need in AMR: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT; + else if (init_val & PKEY_DISABLE_WRITE) + new_amr_bits |= AMR_WR_BIT; + + init_amr(pkey, new_amr_bits); + return 0; +} + +void thread_pkey_regs_save(struct thread_struct *thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + /* + * TODO: Skip saving registers if @thread hasn't used any keys yet. + */ + thread->amr = read_amr(); + thread->iamr = read_iamr(); + thread->uamor = read_uamor(); +} + +void thread_pkey_regs_restore(struct thread_struct *new_thread, + struct thread_struct *old_thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + if (old_thread->amr != new_thread->amr) + write_amr(new_thread->amr); + if (old_thread->iamr != new_thread->iamr) + write_iamr(new_thread->iamr); + if (old_thread->uamor != new_thread->uamor) + write_uamor(new_thread->uamor); +} + +void thread_pkey_regs_init(struct thread_struct *thread) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + thread->amr = pkey_amr_mask; + thread->iamr = pkey_iamr_mask; + thread->uamor = pkey_uamor_mask; + + write_uamor(pkey_uamor_mask); + write_amr(pkey_amr_mask); + write_iamr(pkey_iamr_mask); +} + +static inline bool pkey_allows_readwrite(int pkey) +{ + int pkey_shift = pkeyshift(pkey); + + if (!is_pkey_enabled(pkey)) + return true; + + return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift)); +} + +int __execute_only_pkey(struct mm_struct *mm) +{ + return mm->context.execute_only_pkey; +} + +static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) + return false; + + return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey); +} + +/* + * This should only be called for *plain* mprotect calls. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, + int pkey) +{ + /* + * If the currently associated pkey is execute-only, but the requested + * protection is not execute-only, move it back to the default pkey. + */ + if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC)) + return 0; + + /* + * The requested protection is execute-only. Hence let's use an + * execute-only pkey. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } + + /* Nothing to override. 
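+	 * Keep whatever key is already associated with the VMA.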
*/
+	return vma_pkey(vma);
+}
+
+static bool pkey_access_permitted(int pkey, bool write, bool execute)
+{
+	int pkey_shift;
+	u64 amr;
+
+	if (!is_pkey_enabled(pkey))
+		return true;
+
+	pkey_shift = pkeyshift(pkey);
+	if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
+		return true;
+
+	amr = read_amr(); /* Delay reading amr until absolutely needed */
+	return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
+		(write && !(amr & (AMR_WR_BIT << pkey_shift))));
+}
+
+bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
+{
+	if (static_branch_likely(&pkey_disabled))
+		return true;
+
+	return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
+}
+
+/*
+ * We only want to enforce protection keys on the current thread because we
+ * effectively have no access to AMR/IAMR for other threads or any way to tell
+ * which AMR/IAMR in a threaded process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current mm, or if we are
+ * in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+	if (!current->mm)
+		return true;
+
+	/* if it is not our ->mm, it has to be foreign */
+	if (current->mm != vma->vm_mm)
+		return true;
+
+	return false;
+}
+
+bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
+			       bool execute, bool foreign)
+{
+	if (static_branch_likely(&pkey_disabled))
+		return true;
+	/*
+	 * Do not enforce our key-permissions on a foreign vma.
+	 */
+	if (foreign || vma_is_foreign(vma))
+		return true;
+
+	return pkey_access_permitted(vma_pkey(vma), write, execute);
}
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	if (static_branch_likely(&pkey_disabled))
+		return;
+
+	/* Duplicate the oldmm pkey state in mm: */
+	mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+	mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
new file mode 100644
index 000000000000..cab06331c0c0
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	int psize;
+	struct hstate *hstate = hstate_file(vma->vm_file);
+
+	psize = hstate_get_psize(hstate);
+	radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	int psize;
+	struct hstate *hstate = hstate_file(vma->vm_file);
+
+	psize = hstate_get_psize(hstate);
+	radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start,
+				    unsigned long end)
+{
+	int psize;
+	struct hstate *hstate = hstate_file(vma->vm_file);
+
+	psize = hstate_get_psize(hstate);
+	radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+}
+
+/*
+ * A variant of hugetlb_get_unmapped_area doing topdown search
+ * FIXME!! should we do as x86 does or as the non-hugetlb area does?
+ * ie, use topdown or not based on the mmap_is_legacy check?
+ */
+unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+				 unsigned long len, unsigned long pgoff,
+				 unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct hstate *h = hstate_file(file);
+	int fixed = (flags & MAP_FIXED);
+	unsigned long high_limit;
+	struct vm_unmapped_area_info info;
+
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
+
+	if (len & ~huge_page_mask(h))
+		return -EINVAL;
+	if (len > high_limit)
+		return -ENOMEM;
+
+	if (fixed) {
+		if (addr > high_limit - len)
+			return -ENOMEM;
+		if (prepare_hugepage_range(file, addr, len))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (addr) {
+		addr = ALIGN(addr, huge_page_size(h));
+		vma = find_vma(mm, addr);
+		if (high_limit - len >= addr && addr >= mmap_min_addr &&
+		    (!vma || addr + len <= vm_start_gap(vma)))
+			return addr;
+	}
+	/*
+	 * We are always doing a topdown search here. Slice code
+	 * does that too.
+	 */
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	info.length = len;
+	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+	info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
+	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+	info.align_offset = 0;
+
+	return vm_unmapped_area(&info);
+}
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+					 unsigned long addr, pte_t *ptep,
+					 pte_t old_pte, pte_t pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	/*
+	 * To avoid NMMU hang while relaxing access we need to flush the tlb before
+	 * we set the new value.
+	 */
+	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+	    (atomic_read(&mm->context.copros) > 0))
+		radix__flush_hugetlb_page(vma, addr);
+
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
new file mode 100644
index 000000000000..c9bcf428dd2b
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -0,0 +1,1124 @@
+/*
+ * Page table handling routines for radix page table.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "radix-mmu: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+unsigned int mmu_pid_bits;
+unsigned int mmu_base_pid;
+
+static int native_register_process_table(unsigned long base, unsigned long pg_sz,
+					 unsigned long table_size)
+{
+	unsigned long patb0, patb1;
+
+	patb0 = be64_to_cpu(partition_tb[0].patb0);
+	patb1 = base | table_size | PATB_GR;
+
+	mmu_partition_table_set_entry(0, patb0, patb1);
+
+	return 0;
+}
+
+static __ref void *early_alloc_pgtable(unsigned long size, int nid,
+			unsigned long region_start, unsigned long region_end)
+{
+	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
+	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
+	void *ptr;
+
+	if (region_start)
+		min_addr = region_start;
+	if (region_end)
+		max_addr = region_end;
+
+	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
+
+	if (!ptr)
+		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
+		      __func__, size, size, nid, &min_addr, &max_addr);
+
+	return ptr;
+}
+
+static int early_map_kernel_page(unsigned long ea, unsigned long pa,
+			  pgprot_t flags,
+			  unsigned int map_page_size,
+			  int nid,
+			  unsigned long region_start, unsigned long region_end)
+{
+	unsigned long pfn = pa >> PAGE_SHIFT;
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	pgdp = pgd_offset_k(ea);
+	if (pgd_none(*pgdp)) {
+		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
+					   region_start, region_end);
+		pgd_populate(&init_mm, pgdp, pudp);
+	}
+	pudp = pud_offset(pgdp, ea);
+	if (map_page_size == PUD_SIZE) {
+		ptep = (pte_t *)pudp;
+		goto set_the_pte;
+	}
+	if (pud_none(*pudp)) {
+		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
+					   region_start, region_end);
+		pud_populate(&init_mm, pudp, pmdp);
+	}
+	pmdp = pmd_offset(pudp, ea);
+	if (map_page_size == PMD_SIZE) {
+		ptep = pmdp_ptep(pmdp);
+		goto set_the_pte;
+	}
+	if (!pmd_present(*pmdp)) {
+		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
+					   region_start, region_end);
+		pmd_populate_kernel(&init_mm, pmdp, ptep);
+	}
+	ptep = pte_offset_kernel(pmdp, ea);
+
+set_the_pte:
+	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+	smp_wmb();
+	return 0;
+}
+
+/*
+ * nid, region_start, and region_end are hints to try to place the page
+ * table memory in the same node or region.
+ */
+static int __map_kernel_page(unsigned long ea, unsigned long pa,
+			  pgprot_t flags,
+			  unsigned int map_page_size,
+			  int nid,
+			  unsigned long region_start, unsigned long region_end)
+{
+	unsigned long pfn = pa >> PAGE_SHIFT;
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+	/*
+	 * Make sure task size is correct as per the max addr
+	 */
+	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
+
+#ifdef CONFIG_PPC_64K_PAGES
+	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
+#endif
+
+	if (unlikely(!slab_is_available()))
+		return early_map_kernel_page(ea, pa, flags, map_page_size,
+						nid, region_start, region_end);
+
+	/*
+	 * Should make page table allocation functions be able to take a
+	 * node, so we can place kernel page tables on the right nodes after
+	 * boot.
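+	 * Until then, the late path below simply uses pud_alloc(),
+	 * pmd_alloc() and pte_alloc_kernel(), which take no node hint.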
+ */ + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); + smp_wmb(); + return 0; +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long clear) +{ + unsigned long idx; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + start = ALIGN_DOWN(start, PAGE_SIZE); + end = PAGE_ALIGN(end); // aligns up + + pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", + start, end, clear); + + for (idx = start; idx < end; idx += PAGE_SIZE) { + pgdp = pgd_offset_k(idx); + pudp = pud_alloc(&init_mm, pgdp, idx); + if (!pudp) + continue; + if (pud_huge(*pudp)) { + ptep = (pte_t *)pudp; + goto update_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, idx); + if (!pmdp) + continue; + if (pmd_huge(*pmdp)) { + ptep = pmdp_ptep(pmdp); + goto update_the_pte; + } + ptep = pte_alloc_kernel(pmdp, idx); + if (!ptep) + continue; +update_the_pte: + radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); + } + + radix__flush_tlb_kernel_range(start, end); +} + +void radix__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__init_begin; + + radix__change_memory_range(start, end, _PAGE_WRITE); +} + +void radix__mark_initmem_nx(void) +{ + unsigned long start = (unsigned long)__init_begin; + unsigned long end = (unsigned long)__init_end; + + radix__change_memory_range(start, end, _PAGE_EXEC); +} +#endif /* CONFIG_STRICT_KERNEL_RWX */ + +static inline void __meminit +print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) +{ + char buf[10]; + + if (end <= start) + return; + + string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); + + pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, + exec ? 
" (exec)" : ""); +} + +static unsigned long next_boundary(unsigned long addr, unsigned long end) +{ +#ifdef CONFIG_STRICT_KERNEL_RWX + if (addr < __pa_symbol(__init_begin)) + return __pa_symbol(__init_begin); +#endif + return end; +} + +static int __meminit create_physical_mapping(unsigned long start, + unsigned long end, + int nid) +{ + unsigned long vaddr, addr, mapping_size = 0; + bool prev_exec, exec = false; + pgprot_t prot; + int psize; + + start = _ALIGN_UP(start, PAGE_SIZE); + for (addr = start; addr < end; addr += mapping_size) { + unsigned long gap, previous_size; + int rc; + + gap = next_boundary(addr, end) - addr; + previous_size = mapping_size; + prev_exec = exec; + + if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && + mmu_psize_defs[MMU_PAGE_1G].shift) { + mapping_size = PUD_SIZE; + psize = MMU_PAGE_1G; + } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && + mmu_psize_defs[MMU_PAGE_2M].shift) { + mapping_size = PMD_SIZE; + psize = MMU_PAGE_2M; + } else { + mapping_size = PAGE_SIZE; + psize = mmu_virtual_psize; + } + + vaddr = (unsigned long)__va(addr); + + if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { + prot = PAGE_KERNEL_X; + exec = true; + } else { + prot = PAGE_KERNEL; + exec = false; + } + + if (mapping_size != previous_size || exec != prev_exec) { + print_mapping(start, addr, previous_size, prev_exec); + start = addr; + } + + rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); + if (rc) + return rc; + + update_page_count(psize, 1); + } + + print_mapping(start, addr, mapping_size, exec); + return 0; +} + +void __init radix_init_pgtable(void) +{ + unsigned long rts_field; + struct memblock_region *reg; + + /* We don't support slb for radix */ + mmu_slb_size = 0; + /* + * Create the linear mapping, using standard page size for now + */ + for_each_memblock(memory, reg) { + /* + * The memblock allocator is up at this point, so the + * page tables will be allocated within the range. No + * need or a node (which we don't have yet). + */ + + if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + pr_warn("Outside the supported range\n"); + continue; + } + + WARN_ON(create_physical_mapping(reg->base, + reg->base + reg->size, + -1)); + } + + /* Find out how many PID bits are supported */ + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (!mmu_pid_bits) + mmu_pid_bits = 20; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* + * When KVM is possible, we only use the top half of the + * PID space to avoid collisions between host and guest PIDs + * which can cause problems due to prefetch when exiting the + * guest with AIL=3 + */ + mmu_base_pid = 1 << (mmu_pid_bits - 1); +#else + mmu_base_pid = 1; +#endif + } else { + /* The guest uses the bottom half of the PID space */ + if (!mmu_pid_bits) + mmu_pid_bits = 19; + mmu_base_pid = 1; + } + + /* + * Allocate Partition table and process table for the + * host. + */ + BUG_ON(PRTB_SIZE_SHIFT > 36); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); + /* + * Fill in the process table. + */ + rts_field = radix__get_tree_size(); + process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); + /* + * Fill in the partition table. We are suppose to use effective address + * of process table here. But our linear mapping also enable us to use + * physical address here. 
+ */ + register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); + pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); + + /* + * The init_mm context is given the first available (non-zero) PID, + * which is the "guard PID" and contains no page table. PIDR should + * never be set to zero because that duplicates the kernel address + * space at the 0x0... offset (quadrant 0)! + * + * An arbitrary PID that may later be allocated by the PID allocator + * for userspace processes must not be used either, because that + * would cause stale user mappings for that PID on CPUs outside of + * the TLB invalidation scheme (because it won't be in mm_cpumask). + * + * So permanently carve out one PID for the purpose of a guard PID. + */ + init_mm.context.id = mmu_base_pid; + mmu_base_pid++; +} + +static void __init radix_init_partition_table(void) +{ + unsigned long rts_field, dw0; + + mmu_partition_table_init(); + rts_field = radix__get_tree_size(); + dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; + mmu_partition_table_set_entry(0, dw0, 0); + + pr_info("Initializing Radix MMU\n"); + pr_info("Partition table %p\n", partition_tb); +} + +void __init radix_init_native(void) +{ + register_process_table = native_register_process_table; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x15: + idx = MMU_PAGE_2M; + break; + case 0x1e: + idx = MMU_PAGE_1G; + break; + } + return idx; +} + +static int __init radix_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + int shift, idx; + unsigned int ap; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + /* Find MMU PID size */ + prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); + if (prop && size == 4) + mmu_pid_bits = be32_to_cpup(prop); + + /* Grab page size encodings */ + prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + for (; size >= 4; size -= 4, ++prop) { + + struct mmu_psize_def *def; + + /* top 3 bit is AP encoding */ + shift = be32_to_cpu(prop[0]) & ~(0xe << 28); + ap = be32_to_cpu(prop[0]) >> 29; + pr_info("Page size shift = %d AP=0x%x\n", shift, ap); + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + def = &mmu_psize_defs[idx]; + def->shift = shift; + def->ap = ap; + } + + /* needed ? 
*/ + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 1; +} + +void __init radix__early_init_devtree(void) +{ + int rc; + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + /* + * let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; +found: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + return; +} + +static void radix_init_amor(void) +{ + /* + * In HV mode, we init AMOR (Authority Mask Override Register) so that + * the hypervisor and guest can setup IAMR (Instruction Authority Mask + * Register), enable key 0 and set it to 1. + * + * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) + */ + mtspr(SPRN_AMOR, (3ul << 62)); +} + +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + /* + * Radix always uses key0 of the IAMR to determine if an access is + * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction + * fetch. + */ + mtspr(SPRN_IAMR, (1ul << 62)); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; + } + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; + +#ifdef CONFIG_PPC_64K_PAGES + /* PAGE_SIZE mappings */ + mmu_virtual_psize = MMU_PAGE_64K; +#else + mmu_virtual_psize = MMU_PAGE_4K; +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* vmemmap mapping */ + mmu_vmemmap_psize = mmu_virtual_psize; +#endif + /* + * initialize page table size + */ + __pte_index_size = RADIX_PTE_INDEX_SIZE; + __pmd_index_size = RADIX_PMD_INDEX_SIZE; + __pud_index_size = RADIX_PUD_INDEX_SIZE; + __pgd_index_size = RADIX_PGD_INDEX_SIZE; + __pud_cache_index = RADIX_PUD_INDEX_SIZE; + __pte_table_size = RADIX_PTE_TABLE_SIZE; + __pmd_table_size = RADIX_PMD_TABLE_SIZE; + __pud_table_size = RADIX_PUD_TABLE_SIZE; + __pgd_table_size = RADIX_PGD_TABLE_SIZE; + + __pmd_val_bits = RADIX_PMD_VAL_BITS; + __pud_val_bits = RADIX_PUD_VAL_BITS; + __pgd_val_bits = RADIX_PGD_VAL_BITS; + + __kernel_virt_start = RADIX_KERN_VIRT_START; + __vmalloc_start = RADIX_VMALLOC_START; + __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; + __kernel_io_end = RADIX_KERN_IO_END; + vmemmap = (struct page *)RADIX_VMEMMAP_START; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + __pte_frag_nr = RADIX_PTE_FRAG_NR; + __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; + __pmd_frag_nr = RADIX_PMD_FRAG_NR; + __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + radix_init_native(); + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + radix_init_partition_table(); + radix_init_amor(); + } else { + radix_init_pseries(); + } + + 
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + radix_init_pgtable(); + /* Switch to the guard PID before turning on MMU */ + radix__switch_mmu_context(NULL, &init_mm); + if (cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +void radix__early_init_mmu_secondary(void) +{ + unsigned long lpcr; + /* + * update partition table control register and UPRT + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + radix_init_amor(); + } + + radix__switch_mmu_context(NULL, &init_mm); + if (cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_all(); +} + +void radix__mmu_cleanup_all(void) +{ + unsigned long lpcr; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); + mtspr(SPRN_PTCR, 0); + powernv_set_nmmu_ptcr(0); + radix__flush_tlb_all(); + } +} + +void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* + * We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* + * Radix mode is not limited by RMA / VRMA addressing. + */ + ppc64_rma_size = ULONG_MAX; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, pte_start); + pmd_clear(pmd); +} + +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, pmd_start); + pud_clear(pud); +} + +struct change_mapping_params { + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long aligned_start; + unsigned long aligned_end; +}; + +static int __meminit stop_machine_change_mapping(void *data) +{ + struct change_mapping_params *params = + (struct change_mapping_params *)data; + + if (!data) + return -1; + + spin_unlock(&init_mm.page_table_lock); + pte_clear(&init_mm, params->aligned_start, params->pte); + create_physical_mapping(params->aligned_start, params->start, -1); + create_physical_mapping(params->end, params->aligned_end, -1); + spin_lock(&init_mm.page_table_lock); + return 0; +} + +static void remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pte_t *pte; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { + /* + * The vmemmap_free() and remove_section_mapping() + * codepaths call us with aligned addresses. 
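+			 * An unaligned range here therefore indicates a bug
+			 * in the caller: warn and skip the entry rather than
+			 * trying to tear down a partial page.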
+			 */
+			WARN_ONCE(1, "%s: unaligned range\n", __func__);
+			continue;
+		}
+
+		pte_clear(&init_mm, addr, pte);
+	}
+}
+
+/*
+ * Helper to clear the pte and potentially split the mapping.
+ */
+static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
+				unsigned long size, pte_t *pte)
+{
+	unsigned long mask = ~(size - 1);
+	unsigned long aligned_start = addr & mask;
+	unsigned long aligned_end = addr + size;
+	struct change_mapping_params params;
+	bool split_region = false;
+
+	if ((end - addr) < size) {
+		/*
+		 * We are about to clear the PTE without having flushed
+		 * the mapping, so we need to remap and flush. If the
+		 * effects are visible outside the processor, or if we
+		 * are running in code close to the mapping we cleared,
+		 * we are in trouble.
+		 */
+		if (overlaps_kernel_text(aligned_start, addr) ||
+			overlaps_kernel_text(end, aligned_end)) {
+			/*
+			 * Hack, just return, don't pte_clear
+			 */
+			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
+				 "text, not splitting\n", addr, end);
+			return;
+		}
+		split_region = true;
+	}
+
+	if (split_region) {
+		params.pte = pte;
+		params.start = addr;
+		params.end = end;
+		params.aligned_start = addr & ~(size - 1);
+		params.aligned_end = min_t(unsigned long, aligned_end,
+				(unsigned long)__va(memblock_end_of_DRAM()));
+		stop_machine(stop_machine_change_mapping, &params, NULL);
+		return;
+	}
+
+	pte_clear(&init_mm, addr, pte);
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+			     unsigned long end)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmd;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_huge(*pmd)) {
+			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
+			continue;
+		}
+
+		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+		remove_pte_table(pte_base, addr, next);
+		free_pte_table(pte_base, pmd);
+	}
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+			     unsigned long end)
+{
+	unsigned long next;
+	pmd_t *pmd_base;
+	pud_t *pud;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_huge(*pud)) {
+			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
+			continue;
+		}
+
+		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+		remove_pmd_table(pmd_base, addr, next);
+		free_pmd_table(pmd_base, pud);
+	}
+}
+
+static void __meminit remove_pagetable(unsigned long start, unsigned long end)
+{
+	unsigned long addr, next;
+	pud_t *pud_base;
+	pgd_t *pgd;
+
+	spin_lock(&init_mm.page_table_lock);
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+
+		pgd = pgd_offset_k(addr);
+		if (!pgd_present(*pgd))
+			continue;
+
+		if (pgd_huge(*pgd)) {
+			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
+			continue;
+		}
+
+		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+		remove_pud_table(pud_base, addr, next);
+	}
+
+	spin_unlock(&init_mm.page_table_lock);
+	radix__flush_tlb_kernel_range(start, end);
+}
+
+int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+	if (end >= RADIX_VMALLOC_START) {
+		pr_warn("Outside the supported range\n");
+		return -1;
+	}
+
+	return create_physical_mapping(start, end, nid);
+}
+
+int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
+{
+	remove_pagetable(start, end);
+	return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
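+/*
+ * The vmemmap helpers below reuse __map_kernel_page(). A sketch of the
+ * call path when the sparse vmemmap is populated (function names as used
+ * in this file):
+ *
+ *   radix__vmemmap_create_mapping(start, page_size, phys)
+ *     -> __map_kernel_page_nid(start, phys, prot, page_size, nid)
+ *        -> __map_kernel_page(..., nid, 0, 0)
+ *
+ * with nid derived from the physical address, so that the backing memory
+ * is placed on the node that owns the struct pages being mapped.
+ */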
+static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
+				 pgprot_t flags, unsigned int map_page_size,
+				 int nid)
+{
+	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
+}
+
+int __meminit radix__vmemmap_create_mapping(unsigned long start,
+				      unsigned long page_size,
+				      unsigned long phys)
+{
+	/* Create a PTE encoding */
+	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
+	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
+	int ret;
+
+	if ((start + page_size) >= RADIX_VMEMMAP_END) {
+		pr_warn("Outside the supported range\n");
+		return -1;
+	}
+
+	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+	BUG_ON(ret);
+
+	return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
+{
+	remove_pagetable(start, start + page_size);
+}
+#endif
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+				  pmd_t *pmdp, unsigned long clr,
+				  unsigned long set)
+{
+	unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+#endif
+
+	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
+	trace_hugepage_update(addr, old, clr, set);
+
+	return old;
+}
+
+pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+			pmd_t *pmdp)
+
+{
+	pmd_t pmd;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
+	VM_BUG_ON(pmd_devmap(*pmdp));
+	/*
+	 * khugepaged calls this for a normal pmd
+	 */
+	pmd = *pmdp;
+	pmd_clear(pmdp);
+
+	/* FIXME!! Verify whether we need this kick below */
+	serialize_against_pte_lookup(vma->vm_mm);
+
+	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
+
+	return pmd;
+}
+
+/*
+ * For us pgtable_t is pte_t *. In order to save the deposited
+ * page table, we consider the allocated page table as a list
+ * head. On withdraw we need to make sure we zero out the used
+ * list_head memory area.
+ */
+void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				 pgtable_t pgtable)
+{
+	struct list_head *lh = (struct list_head *) pgtable;
+
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+	/* FIFO */
+	if (!pmd_huge_pte(mm, pmdp))
+		INIT_LIST_HEAD(lh);
+	else
+		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+	pmd_huge_pte(mm, pmdp) = pgtable;
+}
+
+pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pte_t *ptep;
+	pgtable_t pgtable;
+	struct list_head *lh;
+
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+	/* FIFO */
+	pgtable = pmd_huge_pte(mm, pmdp);
+	lh = (struct list_head *) pgtable;
+	if (list_empty(lh))
+		pmd_huge_pte(mm, pmdp) = NULL;
+	else {
+		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+		list_del(lh);
+	}
+	ptep = (pte_t *) pgtable;
+	*ptep = __pte(0);
+	ptep++;
+	*ptep = __pte(0);
+	return pgtable;
+}
+
+pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+				     unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t old_pmd;
+	unsigned long old;
+
+	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+	old_pmd = __pmd(old);
+	/*
+	 * Serialize against find_current_mm_pte which does lock-less
+	 * lookup in page tables with local interrupts disabled. For huge pages
+	 * it casts pmd_t to pte_t. Since the format of pte_t is different from
+	 * pmd_t we want to prevent transit from pmd pointing to page table
+	 * to pmd pointing to huge page (and back) while interrupts are disabled.
+	 * We clear pmd to possibly replace it with page table pointer in
+	 * different code paths. So make sure we wait for the parallel
+	 * find_current_mm_pte to finish.
+	 */
+	serialize_against_pte_lookup(mm);
+	return old_pmd;
+}
+
+int radix__has_transparent_hugepage(void)
+{
+	/* For radix 2M at PMD level means thp */
+	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
+		return 1;
+	return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
+				  pte_t entry, unsigned long address, int psize)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+					      _PAGE_RW | _PAGE_EXEC);
+
+	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
+	/*
+	 * To avoid NMMU hang while relaxing access, we need to mark
+	 * the pte invalid in between.
+	 */
+	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
+		unsigned long old_pte, new_pte;
+
+		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
+		/*
+		 * new value of pte
+		 */
+		new_pte = old_pte | set;
+		radix__flush_tlb_page_psize(mm, address, psize);
+		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
+	} else {
+		__radix_pte_update(ptep, 0, set);
+		/*
+		 * Book3S does not require a TLB flush when relaxing access
+		 * restrictions when the address space is not attached to a
+		 * NMMU, because the core MMU will reload the pte after taking
+		 * an access fault, which is defined by the architecture.
+		 */
+	}
+	/* See ptesync comment in radix__set_pte_at */
+}
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+				    unsigned long addr, pte_t *ptep,
+				    pte_t old_pte, pte_t pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	/*
+	 * To avoid NMMU hang while relaxing access we need to flush the tlb before
+	 * we set the new value. We need to do this only for radix, because hash
+	 * translation does flush when updating the linux pte.
+	 */
+	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+	    (atomic_read(&mm->context.copros) > 0))
+		radix__flush_tlb_page(vma, addr);
+
+	set_pte_at(mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
new file mode 100644
index 000000000000..6a23b9ebd2a1
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -0,0 +1,1101 @@
+/*
+ * TLB flush routines for radix kernels.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define RIC_FLUSH_TLB 0 +#define RIC_FLUSH_PWC 1 +#define RIC_FLUSH_ALL 2 + +/* + * tlbiel instruction for radix, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) + : "memory"); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and the entire Page Walk Cache + * and partition table entries. Then flush the remaining sets of the + * TLB. + */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + + /* Do the same for process scoped entries. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + + asm volatile("ptesync": : :"memory"); +} + +void radix__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is); + else + WARN(1, "%s called on pre-POWER9 CPU\n", __func__); + + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static inline void __tlbiel_pid(unsigned long pid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rb |= set << PPC_BITLSHIFT(51); + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 1, rb, rs, ric, prs, r); +} + +static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + +static inline void __tlbiel_lpid(unsigned long lpid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rb |= set << PPC_BITLSHIFT(51); + rs = 0; /* LPID comes from LPIDR */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 1, rb, rs, ric, prs, r); +} + +static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = lpid; + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); +} + +static inline void __tlbiel_lpid_guest(unsigned long lpid, int 
set,
+				unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = PPC_BIT(52); /* IS = 2 */
+	rb |= set << PPC_BITLSHIFT(51);
+	rs = 0;  /* LPID comes from LPIDR */
+	prs = 1; /* process scoped */
+	r = 1;   /* radix format */
+
+	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
+			       unsigned long ap, unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = va & ~(PPC_BITMASK(52, 63));
+	rb |= ap << PPC_BITLSHIFT(58);
+	rs = pid << PPC_BITLSHIFT(31);
+	prs = 1; /* process scoped */
+	r = 1;   /* radix format */
+
+	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
+			      unsigned long ap, unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = va & ~(PPC_BITMASK(52, 63));
+	rb |= ap << PPC_BITLSHIFT(58);
+	rs = pid << PPC_BITLSHIFT(31);
+	prs = 1; /* process scoped */
+	r = 1;   /* radix format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+			      unsigned long ap, unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = va & ~(PPC_BITMASK(52, 63));
+	rb |= ap << PPC_BITLSHIFT(58);
+	rs = lpid;
+	prs = 0; /* partition scoped */
+	r = 1;   /* radix format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie(void)
+{
+	unsigned long pid = 0;
+	unsigned long va = ((1UL << 52) - 1);
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+	unsigned long va = ((1UL << 52) - 1);
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+	}
+}
+
+/*
+ * We use 128 sets in radix mode and 256 sets in hpt mode.
+ */
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+{
+	int set;
+
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+	 * also flush the entire Page Walk Cache.
+	 */
+	__tlbiel_pid(pid, 0, ric);
+
+	/* For PWC, only one flush is needed */
+	if (ric == RIC_FLUSH_PWC) {
+		asm volatile("ptesync": : :"memory");
+		return;
+	}
+
+	/* For the remaining sets, just flush the TLB */
+	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+		__tlbiel_pid(pid, set, RIC_FLUSH_TLB);
+
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
+{
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Work around the fact that the "ric" argument to __tlbie_pid
+	 * must be a compile-time constant to match the "i" constraint
+	 * in the asm statement.
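+	 *
+	 * [Editor's note, not part of the original patch.]  An "i"
+	 * operand must be an immediate known at compile time, so a
+	 * single call with a runtime "ric" would not assemble; the
+	 * switch below dispatches to three call sites whose "ric" is
+	 * constant, each expanding to something like:
+	 *
+	 *	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+	 *		     : : "r"(rb), "i"(1), "i"(1),
+	 *			 "i"(RIC_FLUSH_TLB), "r"(rs) : "memory");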
+	 */
+	switch (ric) {
+	case RIC_FLUSH_TLB:
+		__tlbie_pid(pid, RIC_FLUSH_TLB);
+		break;
+	case RIC_FLUSH_PWC:
+		__tlbie_pid(pid, RIC_FLUSH_PWC);
+		break;
+	case RIC_FLUSH_ALL:
+	default:
+		__tlbie_pid(pid, RIC_FLUSH_ALL);
+	}
+	fixup_tlbie();
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+{
+	int set;
+
+	VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+	 * also flush the entire Page Walk Cache.
+	 */
+	__tlbiel_lpid(lpid, 0, ric);
+
+	/* For PWC, only one flush is needed */
+	if (ric == RIC_FLUSH_PWC) {
+		asm volatile("ptesync": : :"memory");
+		return;
+	}
+
+	/* For the remaining sets, just flush the TLB */
+	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+		__tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
+
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Work around the fact that the "ric" argument to __tlbie_lpid
+	 * must be a compile-time constant to match the "i" constraint
+	 * in the asm statement.
+	 */
+	switch (ric) {
+	case RIC_FLUSH_TLB:
+		__tlbie_lpid(lpid, RIC_FLUSH_TLB);
+		break;
+	case RIC_FLUSH_PWC:
+		__tlbie_lpid(lpid, RIC_FLUSH_PWC);
+		break;
+	case RIC_FLUSH_ALL:
+	default:
+		__tlbie_lpid(lpid, RIC_FLUSH_ALL);
+	}
+	fixup_tlbie_lpid(lpid);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+	int set;
+
+	VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+	 * also flush the entire Page Walk Cache.
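+	 *
+	 * [Editor's illustration, not part of the original patch;
+	 * POWER9_TLB_SETS_RADIX is assumed to be 128 here.]  For
+	 * RIC_FLUSH_ALL the sequence below amounts to one RIC=2 flush
+	 * of set 0 followed by 127 TLB-only (RIC=0) flushes:
+	 *
+	 *	__tlbiel_lpid_guest(lpid, 0, RIC_FLUSH_ALL);
+	 *	for (set = 1; set < 128; set++)
+	 *		__tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);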
+ */ + __tlbiel_lpid_guest(lpid, 0, ric); + + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + + /* For the remaining sets, just flush the TLB */ + for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) + __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); + + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); +} + + +static inline void __tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbiel_va(va, pid, ap, ric); + asm volatile("ptesync": : :"memory"); +} + +static inline void _tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_va_range(start, end, pid, page_size, psize); + asm volatile("ptesync": : :"memory"); +} + +static inline void __tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); +} + +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, ric); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, + unsigned long psize, unsigned long ric) +{ + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + __tlbie_lpid_va(va, lpid, ap, ric); + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); + __tlbie_va_range(start, end, pid, page_size, psize); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ +void radix__local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_TLB); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_mm); + +#ifndef CONFIG_SMP +void radix__local_flush_all_mm(struct mm_struct *mm) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_ALL); + 
preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_all_mm); +#endif /* CONFIG_SMP */ + +void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + preempt_enable(); +} + +void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + /* need the return fix for nohash.c */ + if (is_vm_hugetlb_page(vma)) + return radix__local_flush_hugetlb_page(vma, vmaddr); +#endif + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); +} +EXPORT_SYMBOL(radix__local_flush_tlb_page); + +static bool mm_is_singlethreaded(struct mm_struct *mm) +{ + if (atomic_read(&mm->context.copros) > 0) + return false; + if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) + return true; + return false; +} + +static bool mm_needs_flush_escalation(struct mm_struct *mm) +{ + /* + * P9 nest MMU has issues with the page walk cache + * caching PTEs and not flushing them properly when + * RIC = 0 for a PID/LPID invalidate + */ + if (atomic_read(&mm->context.copros) > 0) + return true; + return false; +} + +#ifdef CONFIG_SMP +static void do_exit_flush_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + unsigned long pid = mm->context.id; + + if (current->mm == mm) + return; /* Local CPU */ + + if (current->active_mm == mm) { + /* + * Must be a kernel thread because sender is single-threaded. + */ + BUG_ON(current->mm); + mmgrab(&init_mm); + switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; + mmdrop(mm); + } + _tlbiel_pid(pid, RIC_FLUSH_ALL); +} + +static void exit_flush_lazy_tlbs(struct mm_struct *mm) +{ + /* + * Would be nice if this was async so it could be run in + * parallel with our local flush, but generic code does not + * give a good API for it. Could extend the generic code or + * make a special powerpc IPI for flushing TLBs. + * For now it's not too performance critical. + */ + smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, + (void *)mm, 1); + mm_reset_thread_local(mm); +} + +void radix__flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + /* + * Order loads of mm_cpumask vs previous stores to clear ptes before + * the invalidate. 
See barrier in switch_mm_irqs_off + */ + smp_mb(); + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } + + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { +local: + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } + preempt_enable(); +} +EXPORT_SYMBOL(radix__flush_tlb_mm); + +static void __flush_all_mm(struct mm_struct *mm, bool fullmm) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (!fullmm) { + exit_flush_lazy_tlbs(mm); + goto local; + } + } + _tlbie_pid(pid, RIC_FLUSH_ALL); + } else { +local: + _tlbiel_pid(pid, RIC_FLUSH_ALL); + } + preempt_enable(); +} +void radix__flush_all_mm(struct mm_struct *mm) +{ + __flush_all_mm(mm, false); +} +EXPORT_SYMBOL(radix__flush_all_mm); + +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + tlb->need_flush_all = 1; +} +EXPORT_SYMBOL(radix__flush_tlb_pwc); + +void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, + int psize) +{ + unsigned long pid; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + exit_flush_lazy_tlbs(mm); + goto local; + } + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } else { +local: + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + } + preempt_enable(); +} + +void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_page(vma, vmaddr); +#endif + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); +} +EXPORT_SYMBOL(radix__flush_tlb_page); + +#else /* CONFIG_SMP */ +#define radix__flush_all_mm radix__local_flush_all_mm +#endif /* CONFIG_SMP */ + +void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + _tlbie_pid(0, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL(radix__flush_tlb_kernel_range); + +#define TLB_FLUSH_ALL -1UL + +/* + * Number of pages above which we invalidate the entire PID rather than + * flush individual pages, for local and global flushes respectively. + * + * tlbie goes out to the interconnect and individual ops are more costly. + * It also does not iterate over sets like the local tlbiel variant when + * invalidating a full PID, so it has a far lower threshold to change from + * individual page flushes to full-pid flushes. 
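+ *
+ * [Editor's example, not part of the original patch.]  With the
+ * defaults below (33 global; POWER9_TLB_SETS_RADIX * 2 == 256 local on
+ * POWER9), flushing a 40-page range goes page-by-page when the mm is
+ * local to this CPU, but escalates to a full-PID tlbie otherwise:
+ *
+ *	nr_pages = 40;
+ *	full = (nr_pages > tlb_single_page_flush_ceiling);	   // 40 > 33
+ *	full_local = (nr_pages > tlb_local_single_page_flush_ceiling); // false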
+ */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; + +static inline void __radix__flush_tlb_range(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool flush_all_sizes) + +{ + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (end != TLB_FLUSH_ALL) { + exit_flush_lazy_tlbs(mm); + goto is_local; + } + } + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } else { +is_local: + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } + + if (full) { + if (local) { + _tlbiel_pid(pid, RIC_FLUSH_TLB); + } else { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } + } else { + bool hflush = flush_all_sizes; + bool gflush = flush_all_sizes; + unsigned long hstart, hend; + unsigned long gstart, gend; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + hflush = true; + + if (hflush) { + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + if (hstart == hend) + hflush = false; + } + + if (gflush) { + gstart = (start + PUD_SIZE - 1) & PUD_MASK; + gend = end & PUD_MASK; + if (gstart == gend) + gflush = false; + } + + asm volatile("ptesync": : :"memory"); + if (local) { + __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbiel_va_range(hstart, hend, pid, + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbiel_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbie_va_range(hstart, hend, pid, + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbie_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); + fixup_tlbie(); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + } + preempt_enable(); +} + +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_tlb_range(vma, start, end); +#endif + + __radix__flush_tlb_range(vma->vm_mm, start, end, false); +} +EXPORT_SYMBOL(radix__flush_tlb_range); + +static int radix_get_mmu_psize(int page_size) +{ + int psize; + + if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift)) + psize = mmu_virtual_psize; + else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift)) + psize = MMU_PAGE_2M; + else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift)) + psize = MMU_PAGE_1G; + else + return -1; + return psize; +} + +/* + * Flush partition scoped LPID address translation for all CPUs. + */ +void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size) +{ + int psize = radix_get_mmu_psize(page_size); + + _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page); + +/* + * Flush partition scoped PWC from LPID for all CPUs. 
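+ *
+ * [Editor's note, not part of the original patch; the call site shown
+ * is an assumption for illustration.]  These LPID-scoped flushes are
+ * exported for KVM HV, which owns the partition-scoped tree; e.g.
+ * after unmapping a 64K guest page it might issue:
+ *
+ *	radix__flush_tlb_lpid_page(kvm->arch.lpid, gpa, 0x10000);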
+ */
+void radix__flush_pwc_lpid(unsigned int lpid)
+{
+	_tlbie_lpid(lpid, RIC_FLUSH_PWC);
+}
+EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__flush_tlb_lpid(unsigned int lpid)
+{
+	_tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__local_flush_tlb_lpid(unsigned int lpid)
+{
+	_tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+
+/*
+ * Flush process scoped translations from LPID (=LPIDR).
+ * The important difference is that the guest normally manages its own
+ * translations, but some cases, e.g. vCPU migration, require KVM to
+ * flush on the guest's behalf.
+ */
+void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+	_tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
+
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize);
+
+void radix__tlb_flush(struct mmu_gather *tlb)
+{
+	int psize = 0;
+	struct mm_struct *mm = tlb->mm;
+	int page_size = tlb->page_size;
+	unsigned long start = tlb->start;
+	unsigned long end = tlb->end;
+
+	/*
+	 * if page size is not something we understand, do a full mm flush
+	 *
+	 * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+	 * that flushes the process table entry cache upon process teardown.
+	 * See the comment for radix in arch_exit_mmap().
+	 */
+	if (tlb->fullmm) {
+		__flush_all_mm(mm, true);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+	} else if (mm_tlb_flush_nested(mm)) {
+		/*
+		 * If there is a concurrent invalidation that is clearing ptes,
+		 * then it's possible this invalidation will miss one of those
+		 * cleared ptes and miss flushing the TLB. If this invalidate
+		 * returns before the other one flushes TLBs, that can result
+		 * in it returning while there are still valid TLBs inside the
+		 * range to be invalidated.
+		 *
+		 * See mm/memory.c:tlb_finish_mmu() for more details.
+		 *
+		 * The solution to this is to ensure the entire range is always
+		 * flushed here. The problem for powerpc is that the flushes
+		 * are page size specific, so this "forced flush" would not
+		 * do the right thing if there is a mix of page sizes in
+		 * the range to be invalidated. So use __flush_tlb_range
+		 * which invalidates all possible page sizes in the range.
+		 *
+		 * A PWC flush is probably not required because the core code
+		 * shouldn't free page tables in this path, but accounting
+		 * for the possibility makes us a bit more robust.
+		 *
+		 * need_flush_all is an uncommon case because page table
+		 * teardown should be done with exclusive locks held (but
+		 * after the locks are dropped another invalidate could come
+		 * in); it could be optimized further if necessary.
+ */ + if (!tlb->need_flush_all) + __radix__flush_tlb_range(mm, start, end, true); + else + radix__flush_all_mm(mm); +#endif + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { + if (!tlb->need_flush_all) + radix__flush_tlb_mm(mm); + else + radix__flush_all_mm(mm); + } else { + if (!tlb->need_flush_all) + radix__flush_tlb_range_psize(mm, start, end, psize); + else + radix__flush_tlb_pwc_range_psize(mm, start, end, psize); + } + tlb->need_flush_all = 0; +} + +static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, + unsigned long start, unsigned long end, + int psize, bool also_pwc) +{ + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + smp_mb(); /* see radix__flush_tlb_mm */ + if (!mm_is_thread_local(mm)) { + if (unlikely(mm_is_singlethreaded(mm))) { + if (end != TLB_FLUSH_ALL) { + exit_flush_lazy_tlbs(mm); + goto is_local; + } + } + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } else { +is_local: + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } + + if (full) { + if (local) { + _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } + } else { + if (local) + _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); + else + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); + } + preempt_enable(); +} + +void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + return __radix__flush_tlb_range_psize(mm, start, end, psize, false); +} + +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + __radix__flush_tlb_range_psize(mm, start, end, psize, true); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) +{ + unsigned long pid, end; + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + /* 4k page size, just blow the world */ + if (PAGE_SIZE == 0x1000) { + radix__flush_all_mm(mm); + return; + } + + end = addr + HPAGE_PMD_SIZE; + + /* Otherwise first do the PWC, then iterate the pages. 
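+	 *
+	 * [Editor's note, not part of the original patch.]  With a 64K
+	 * base page and a 2M PMD huge page this amounts to one PWC
+	 * flush plus 32 single-page invalidations (2M / 64K); the
+	 * trailing "true" below is what requests the PWC flush:
+	 *
+	 *	_tlbiel_va_range(addr, addr + HPAGE_PMD_SIZE, pid,
+	 *			 PAGE_SIZE, mmu_virtual_psize, true);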
 */
+	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
+	if (!mm_is_thread_local(mm)) {
+		if (unlikely(mm_is_singlethreaded(mm))) {
+			exit_flush_lazy_tlbs(mm);
+			goto local;
+		}
+		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+	} else {
+local:
+		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+	}
+
+	preempt_enable();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
+}
+EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+
+void radix__flush_tlb_all(void)
+{
+	unsigned long rb,prs,r,rs;
+	unsigned long ric = RIC_FLUSH_ALL;
+
+	rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
+	prs = 0; /* partition scoped */
+	r = 1;   /* radix format */
+	rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
+
+	asm volatile("ptesync": : :"memory");
+	/*
+	 * now flush guest entries by passing PRS = 1 and LPID != 0
+	 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
+	/*
+	 * now flush host entries by passing PRS = 0 and LPID == 0
+	 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+{
+	unsigned long pid = mm->context.id;
+
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
+
+	/*
+	 * If this context hasn't run on that CPU before and KVM is
+	 * around, there's a slim chance that the guest on another
+	 * CPU just brought in obsolete translation into the TLB of
+	 * this CPU due to a bad prefetch using the guest PID on
+	 * the way into the hypervisor.
+	 *
+	 * We work around this here. If KVM is possible, we check if
+	 * any sibling thread is in KVM. If it is, the window may exist
+	 * and thus we flush that PID from the core.
+	 *
+	 * A potential future improvement would be to mark which PIDs
+	 * have never been used on the system and avoid it if the PID
+	 * is new and the process has no other cpumask bit set.
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
+		int cpu = smp_processor_id();
+		int sib = cpu_first_thread_sibling(cpu);
+		bool flush = false;
+
+		for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
+			if (sib == cpu)
+				continue;
+			if (!cpu_possible(sib))
+				continue;
+			if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
+				flush = true;
+		}
+		if (flush)
+			_tlbiel_pid(pid, RIC_FLUSH_ALL);
+	}
+}
+EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
new file mode 100644
index 000000000000..c22742218bd3
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -0,0 +1,833 @@
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson , IBM
+ * Based on earlier code written by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard , IBM
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +enum slb_index { + LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ + KSTACK_INDEX = 1, /* Kernel stack map */ +}; + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); + +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) + +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, + enum slb_index index) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; +} + +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, + unsigned long flags) +{ + return (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); +} + +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, + unsigned long flags) +{ + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); +} + +static void assert_slb_presence(bool present, unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return; + + /* + * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware + * ignores all other bits from 0-27, so just clear them all. + */ + ea &= ~((1UL << 28) - 1); + asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); + + WARN_ON(present == (tmp == 0)); +#endif +} + +static inline void slb_shadow_update(unsigned long ea, int ssize, + unsigned long flags, + enum slb_index index) +{ + struct slb_shadow *p = get_slb_shadow(); + + /* + * Clear the ESID first so the entry is not valid while we are + * updating it. No write barriers are needed here, provided + * we only update the current CPU's SLB shadow buffer. + */ + WRITE_ONCE(p->save_area[index].esid, 0); + WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags))); + WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index))); +} + +static inline void slb_shadow_clear(enum slb_index index) +{ + WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); +} + +static inline void create_shadowed_slbe(unsigned long ea, int ssize, + unsigned long flags, + enum slb_index index) +{ + /* + * Updating the shadow buffer before writing the SLB ensures + * we don't get a stale entry here if we get preempted by PHYP + * between these two statements. + */ + slb_shadow_update(ea, ssize, flags, index); + + assert_slb_presence(false, ea); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data(ea, ssize, flags)), + "r" (mk_esid_data(ea, ssize, index)) + : "memory" ); +} + +/* + * Insert bolted entries into SLB (which may not be empty, so don't clear + * slb_cache_ptr). + */ +void __slb_restore_bolted_realmode(void) +{ + struct slb_shadow *p = get_slb_shadow(); + enum slb_index index; + + /* No isync needed because realmode. */ + for (index = 0; index < SLB_NUM_BOLTED; index++) { + asm volatile("slbmte %0,%1" : + : "r" (be64_to_cpu(p->save_area[index].vsid)), + "r" (be64_to_cpu(p->save_area[index].esid))); + } + + assert_slb_presence(true, local_paca->kstack); +} + +/* + * Insert the bolted entries into an empty SLB. 
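+ *
+ * [Editor's illustration, not part of the original patch.]  A bolted
+ * entry is just the (vsid, esid) pair produced by the helpers above;
+ * slb_initialize() builds the kernel linear-map slot roughly like:
+ *
+ *	lflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+ *	vsid_data = mk_vsid_data(PAGE_OFFSET, mmu_kernel_ssize, lflags);
+ *	esid_data = mk_esid_data(PAGE_OFFSET, mmu_kernel_ssize, LINEAR_INDEX);
+ *	asm volatile("slbmte %0,%1" : : "r" (vsid_data), "r" (esid_data));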
+ */
+void slb_restore_bolted_realmode(void)
+{
+	__slb_restore_bolted_realmode();
+	get_paca()->slb_cache_ptr = 0;
+
+	get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
+{
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
+/*
+ * This flushes non-bolted entries, it can be run in virtual mode. Must
+ * be called with interrupts disabled.
+ */
+void slb_flush_and_restore_bolted(void)
+{
+	struct slb_shadow *p = get_slb_shadow();
+
+	BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
+
+	WARN_ON(!irqs_disabled());
+
+	/*
+	 * We can't take a PMU exception in the following code, so hard
+	 * disable interrupts.
+	 */
+	hard_irq_disable();
+
+	asm volatile("isync\n"
+		     "slbia\n"
+		     "slbmte  %0, %1\n"
+		     "isync\n"
+		     :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
+			"r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
+		     : "memory");
+	assert_slb_presence(true, get_paca()->kstack);
+
+	get_paca()->slb_cache_ptr = 0;
+
+	get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+	int i;
+	unsigned long e, v;
+
+	/* Save slb_cache_ptr value. */
+	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+	if (!slb_ptr)
+		return;
+
+	for (i = 0; i < mmu_slb_size; i++) {
+		asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
+		asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
+		slb_ptr->esid = e;
+		slb_ptr->vsid = v;
+		slb_ptr++;
+	}
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+	int i, n;
+	unsigned long e, v;
+	unsigned long llp;
+
+	if (!slb_ptr)
+		return;
+
+	pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+	pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
+
+	for (i = 0; i < mmu_slb_size; i++) {
+		e = slb_ptr->esid;
+		v = slb_ptr->vsid;
+		slb_ptr++;
+
+		if (!e && !v)
+			continue;
+
+		pr_err("%02d %016lx %016lx\n", i, e, v);
+
+		if (!(e & SLB_ESID_V)) {
+			pr_err("\n");
+			continue;
+		}
+		llp = v & SLB_VSID_LLP;
+		if (v & SLB_VSID_B_1T) {
+			pr_err("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+			       GET_ESID_1T(e),
+			       (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
+		} else {
+			pr_err(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+			       GET_ESID(e),
+			       (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
+		}
+	}
+	pr_err("----------------------------------\n");
+
+	/* Dump slb cache entries as well. */
+	pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+	pr_err("Valid SLB cache entries:\n");
+	n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+	for (i = 0; i < n; i++)
+		pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+	pr_err("Rest of SLB cache entries:\n");
+	for (i = n; i < SLB_CACHE_ENTRIES; i++)
+		pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+}
+
+void slb_vmalloc_update(void)
+{
+	/*
+	 * vmalloc is not bolted, so we just have to flush the
+	 * non-bolted entries.
+	 */
+	slb_flush_and_restore_bolted();
+}
+
+static bool preload_hit(struct thread_info *ti, unsigned long esid)
+{
+	unsigned char i;
+
+	for (i = 0; i < ti->slb_preload_nr; i++) {
+		unsigned char idx;
+
+		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+		if (esid == ti->slb_preload_esid[idx])
+			return true;
+	}
+	return false;
+}
+
+static bool preload_add(struct thread_info *ti, unsigned long ea)
+{
+	unsigned char idx;
+	unsigned long esid;
+
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+		/* EAs are stored >> 28 so 256MB segments don't need clearing */
+		if (ea & ESID_MASK_1T)
+			ea &= ESID_MASK_1T;
+	}
+
+	esid = ea >> SID_SHIFT;
+
+	if (preload_hit(ti, esid))
+		return false;
+
+	idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
+	ti->slb_preload_esid[idx] = esid;
+	if (ti->slb_preload_nr == SLB_PRELOAD_NR)
+		ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+	else
+		ti->slb_preload_nr++;
+
+	return true;
+}
+
+static void preload_age(struct thread_info *ti)
+{
+	if (!ti->slb_preload_nr)
+		return;
+	ti->slb_preload_nr--;
+	ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+}
+
+void slb_setup_new_exec(void)
+{
+	struct thread_info *ti = current_thread_info();
+	struct mm_struct *mm = current->mm;
+	unsigned long exec = 0x10000000;
+
+	WARN_ON(irqs_disabled());
+
+	/*
+	 * The preload cache can only be used to determine whether an SLB
+	 * entry exists if it does not start to overflow.
+	 */
+	if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
+		return;
+
+	hard_irq_disable();
+
+	/*
+	 * We have no good place to clear the slb preload cache on exec,
+	 * flush_thread is about the earliest arch hook but that happens
+	 * after we switch to the mm and have already preloaded the SLBEs.
+	 *
+	 * For the most part it's probably okay to use entries from the
+	 * previous exec; they will age out if unused. It may turn out to
+	 * be an advantage to clear the cache before switching to it,
+	 * however.
+	 */
+
+	/*
+	 * Preload some userspace segments into the SLB.
+	 * Almost all 32-bit and 64-bit PowerPC executables are linked at
+	 * 0x10000000 so it makes sense to preload this segment.
+	 */
+	if (!is_kernel_addr(exec)) {
+		if (preload_add(ti, exec))
+			slb_allocate_user(mm, exec);
+	}
+
+	/* Libraries and mmaps. */
+	if (!is_kernel_addr(mm->mmap_base)) {
+		if (preload_add(ti, mm->mmap_base))
+			slb_allocate_user(mm, mm->mmap_base);
+	}
+
+	/* see switch_slb */
+	asm volatile("isync" : : : "memory");
+
+	local_irq_enable();
+}
+
+void preload_new_slb_context(unsigned long start, unsigned long sp)
+{
+	struct thread_info *ti = current_thread_info();
+	struct mm_struct *mm = current->mm;
+	unsigned long heap = mm->start_brk;
+
+	WARN_ON(irqs_disabled());
+
+	/* see above */
+	if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
+		return;
+
+	hard_irq_disable();
+
+	/* Userspace entry address. */
+	if (!is_kernel_addr(start)) {
+		if (preload_add(ti, start))
+			slb_allocate_user(mm, start);
+	}
+
+	/* Top of stack, grows down. */
+	if (!is_kernel_addr(sp)) {
+		if (preload_add(ti, sp))
+			slb_allocate_user(mm, sp);
+	}
+
+	/* Bottom of heap, grows up. */
+	if (heap && !is_kernel_addr(heap)) {
+		if (preload_add(ti, heap))
+			slb_allocate_user(mm, heap);
+	}
+
+	/* see switch_slb */
+	asm volatile("isync" : : : "memory");
+
+	local_irq_enable();
+}
+
+
+/*
+ * Flush all user entries from the segment table of the current processor.
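+ *
+ * [Editor's note, not part of the original patch.]  The preload cache
+ * replayed near the bottom of switch_slb() is a small ring buffer of
+ * ESIDs; a slot is recovered exactly the way preload_hit() walks it:
+ *
+ *	idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ *	ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
+ *	slb_allocate_user(mm, ea);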
 */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct thread_info *ti = task_thread_info(tsk);
+	unsigned char i;
+
+	/*
+	 * We need interrupts hard-disabled here, not just soft-disabled,
+	 * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possibly cause an SLB miss,
+	 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+	 */
+	hard_irq_disable();
+	asm volatile("isync" : : : "memory");
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/*
+		 * SLBIA IH=3 invalidates all Class=1 SLBEs and their
+		 * associated lookaside structures, which matches what
+		 * switch_slb wants. So ARCH_300 does not use the slb
+		 * cache.
+		 */
+		asm volatile(PPC_SLBIA(3));
+	} else {
+		unsigned long offset = get_paca()->slb_cache_ptr;
+
+		if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
+		    offset <= SLB_CACHE_ENTRIES) {
+			unsigned long slbie_data = 0;
+
+			for (i = 0; i < offset; i++) {
+				unsigned long ea;
+
+				ea = (unsigned long)
+					get_paca()->slb_cache[i] << SID_SHIFT;
+				/*
+				 * Could assert_slb_presence(true) here, but
+				 * hypervisor or machine check could have come
+				 * in and removed the entry at this point.
+				 */
+
+				slbie_data = ea;
+				slbie_data |= user_segment_size(slbie_data)
+						<< SLBIE_SSIZE_SHIFT;
+				slbie_data |= SLBIE_C; /* user slbs have C=1 */
+				asm volatile("slbie %0" : : "r" (slbie_data));
+			}
+
+			/* Workaround POWER5 < DD2.1 issue */
+			if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
+				asm volatile("slbie %0" : : "r" (slbie_data));
+
+		} else {
+			struct slb_shadow *p = get_slb_shadow();
+			unsigned long ksp_esid_data =
+				be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+			unsigned long ksp_vsid_data =
+				be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+			asm volatile(PPC_SLBIA(1) "\n"
+				     "slbmte	%0,%1\n"
+				     "isync"
+				     :: "r"(ksp_vsid_data),
+					"r"(ksp_esid_data));
+
+			get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+		}
+
+		get_paca()->slb_cache_ptr = 0;
+	}
+	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+	copy_mm_to_paca(mm);
+
+	/*
+	 * We gradually age out SLBs after a number of context switches to
+	 * reduce reload overhead of unused entries (like we do with FP/VEC
+	 * reload). Each time we wrap 256 switches, take an entry out of the
+	 * SLB preload cache.
+	 */
+	tsk->thread.load_slb++;
+	if (!tsk->thread.load_slb) {
+		unsigned long pc = KSTK_EIP(tsk);
+
+		preload_age(ti);
+		preload_add(ti, pc);
+	}
+
+	for (i = 0; i < ti->slb_preload_nr; i++) {
+		unsigned char idx;
+		unsigned long ea;
+
+		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
+
+		slb_allocate_user(mm, ea);
+	}
+
+	/*
+	 * Synchronize slbmte preloads with possible subsequent user memory
+	 * address accesses by the kernel (user mode won't happen until
+	 * rfid, which is safe).
+	 */
+	asm volatile("isync" : : : "memory");
+}
+
+void slb_set_size(u16 size)
+{
+	mmu_slb_size = size;
+}
+
+void slb_initialize(void)
+{
+	unsigned long linear_llp, vmalloc_llp, io_llp;
+	unsigned long lflags;
+	static int slb_encoding_inited;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	unsigned long vmemmap_llp;
+#endif
+
+	/* Prepare our SLB miss handler based on our page size */
+	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	io_llp = mmu_psize_defs[mmu_io_psize].sllp;
+	vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
+	get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+	if (!slb_encoding_inited) {
+		slb_encoding_inited = 1;
+		pr_devel("SLB: linear  LLP = %04lx\n", linear_llp);
+		pr_devel("SLB: io      LLP = %04lx\n", io_llp);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+		pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
+	}
+
+	get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
+	get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+	get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+	lflags = SLB_VSID_KERNEL | linear_llp;
+
+	/* Invalidate the entire SLB (even entry 0) & all the ERATS */
+	asm volatile("isync":::"memory");
+	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
+	asm volatile("isync; slbia; isync":::"memory");
+	create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
+
+	/*
+	 * For the boot cpu, we're running on the stack in init_thread_union,
+	 * which is in the first segment of the linear mapping, and also
+	 * get_paca()->kstack hasn't been initialized yet.
+	 * For secondary cpus, we need to bolt the kernel stack entry now.
+	 */
+	slb_shadow_clear(KSTACK_INDEX);
+	if (raw_smp_processor_id() != boot_cpuid &&
+	    (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+		create_shadowed_slbe(get_paca()->kstack,
+				     mmu_kernel_ssize, lflags, KSTACK_INDEX);
+
+	asm volatile("isync":::"memory");
+}
+
+static void slb_cache_update(unsigned long esid_data)
+{
+	int slb_cache_index;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return; /* ISAv3.0B and later does not use slb_cache */
+
+	/*
+	 * Now update slb cache entries
+	 */
+	slb_cache_index = local_paca->slb_cache_ptr;
+	if (slb_cache_index < SLB_CACHE_ENTRIES) {
+		/*
+		 * We have space in slb cache for optimized switch_slb().
+		 * Top 36 bits from esid_data as per ISA
+		 */
+		local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+		local_paca->slb_cache_ptr++;
+	} else {
+		/*
+		 * Our cache is full and the current cache content no longer
+		 * strictly indicates the active SLB contents. Bump the ptr
+		 * so that switch_slb() will ignore the cache.
+		 */
+		local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
+	}
+}
+
+static enum slb_index alloc_slb_index(bool kernel)
+{
+	enum slb_index index;
+
+	/*
+	 * The allocation bitmaps can become out of sync with the SLB
+	 * when the _switch code does slbie when bolting a new stack
+	 * segment and it must not be anywhere else in the SLB. This leaves
+	 * a kernel allocated entry that is unused in the SLB. With very
+	 * large systems or small segment sizes, the bitmaps could slowly
+	 * fill with these entries. They will eventually be cleared out
+	 * by the round robin allocator in that case, so it's probably not
+	 * worth accounting for.
+	 */
+
+	/*
+	 * SLBs beyond 32 entries are allocated with stab_rr only.
+	 * POWER7/8/9 have 32 SLB entries; this could be expanded if a
+	 * future CPU has more.
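+	 *
+	 * [Editor's example, not part of the original patch.]  While the
+	 * bitmap has a free slot, allocation is find-first-zero; with
+	 * SLB_NUM_BOLTED == 2 the first dynamic allocation sees
+	 * slb_used_bitmap == 0x3 and picks index 2:
+	 *
+	 *	index = ffz(local_paca->slb_used_bitmap);	// -> 2
+	 *	local_paca->slb_used_bitmap |= 1U << index;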
+	 */
+	if (local_paca->slb_used_bitmap != U32_MAX) {
+		index = ffz(local_paca->slb_used_bitmap);
+		local_paca->slb_used_bitmap |= 1U << index;
+		if (kernel)
+			local_paca->slb_kern_bitmap |= 1U << index;
+	} else {
+		/* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
+		index = local_paca->stab_rr;
+		if (index < (mmu_slb_size - 1))
+			index++;
+		else
+			index = SLB_NUM_BOLTED;
+		local_paca->stab_rr = index;
+		if (index < 32) {
+			if (kernel)
+				local_paca->slb_kern_bitmap |= 1U << index;
+			else
+				local_paca->slb_kern_bitmap &= ~(1U << index);
+		}
+	}
+	BUG_ON(index < SLB_NUM_BOLTED);
+
+	return index;
+}
+
+static long slb_insert_entry(unsigned long ea, unsigned long context,
+				unsigned long flags, int ssize, bool kernel)
+{
+	unsigned long vsid;
+	unsigned long vsid_data, esid_data;
+	enum slb_index index;
+
+	vsid = get_vsid(context, ea, ssize);
+	if (!vsid)
+		return -EFAULT;
+
+	/*
+	 * There must not be a kernel SLB fault in alloc_slb_index or before
+	 * slbmte here or the allocation bitmaps could get out of whack with
+	 * the SLB.
+	 *
+	 * User SLB faults or preloads take this path which might get inlined
+	 * into the caller, so add compiler barriers here to ensure unsafe
+	 * memory accesses do not come between.
+	 */
+	barrier();
+
+	index = alloc_slb_index(kernel);
+
+	vsid_data = __mk_vsid_data(vsid, ssize, flags);
+	esid_data = mk_esid_data(ea, ssize, index);
+
+	/*
+	 * No need for an isync before or after this slbmte. The exception
+	 * we enter with and the rfid we exit with are context synchronizing.
+	 * User preloads should add isync afterwards in case the kernel
+	 * accesses user memory before it returns to userspace with rfid.
+	 */
+	assert_slb_presence(false, ea);
+	asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
+
+	barrier();
+
+	if (!kernel)
+		slb_cache_update(esid_data);
+
+	return 0;
+}
+
+static long slb_allocate_kernel(unsigned long ea, unsigned long id)
+{
+	unsigned long context;
+	unsigned long flags;
+	int ssize;
+
+	if (id == LINEAR_MAP_REGION_ID) {
+
+		/* We only support up to MAX_PHYSMEM_BITS */
+		if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	} else if (id == VMEMMAP_REGION_ID) {
+
+		if (ea >= H_VMEMMAP_END)
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+	} else if (id == VMALLOC_REGION_ID) {
+
+		if (ea >= H_VMALLOC_END)
+			return -EFAULT;
+
+		flags = local_paca->vmalloc_sllp;
+
+	} else if (id == IO_REGION_ID) {
+
+		if (ea >= H_KERN_IO_END)
+			return -EFAULT;
+
+		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
+
+	} else {
+		return -EFAULT;
+	}
+
+	ssize = MMU_SEGSIZE_1T;
+	if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		ssize = MMU_SEGSIZE_256M;
+
+	context = get_kernel_context(ea);
+
+	return slb_insert_entry(ea, context, flags, ssize, true);
+}
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
+{
+	unsigned long context;
+	unsigned long flags;
+	int bpsize;
+	int ssize;
+
+	/*
+	 * Consider this a bad access if we take an SLB miss
+	 * on an address above the addr limit.
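+	 *
+	 * [Editor's note, not part of the original patch.]  The -EFAULT
+	 * returned here propagates through do_slb_fault() and is turned
+	 * into a signal by do_bad_slb_fault() for user mode:
+	 *
+	 *	_exception(SIGSEGV, regs, SEGV_BNDERR, ea);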
+ */ + if (ea >= mm_ctx_slb_addr_limit(&mm->context)) + return -EFAULT; + + context = get_user_context(&mm->context, ea); + if (!context) + return -EFAULT; + + if (unlikely(ea >= H_PGTABLE_RANGE)) { + WARN_ON(1); + return -EFAULT; + } + + ssize = user_segment_size(ea); + + bpsize = get_slice_psize(mm, ea); + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; + + return slb_insert_entry(ea, context, flags, ssize, false); +} + +long do_slb_fault(struct pt_regs *regs, unsigned long ea) +{ + unsigned long id = get_region_id(ea); + + /* IRQs are not reconciled here, so can't check irqs_disabled */ + VM_WARN_ON(mfmsr() & MSR_EE); + + if (unlikely(!(regs->msr & MSR_RI))) + return -EINVAL; + + /* + * SLB kernel faults must be very careful not to touch anything + * that is not bolted. E.g., PACA and global variables are okay, + * mm->context stuff is not. + * + * SLB user faults can access all of kernel memory, but must be + * careful not to touch things like IRQ state because it is not + * "reconciled" here. The difficulty is that we must use + * fast_exception_return to return from kernel SLB faults without + * looking at possible non-bolted memory. We could test user vs + * kernel faults in the interrupt handler asm and do a full fault, + * reconcile, ret_from_except for user faults which would make them + * first class kernel code. But for performance it's probably nicer + * if they go via fast_exception_return too. + */ + if (id >= LINEAR_MAP_REGION_ID) { + long err; +#ifdef CONFIG_DEBUG_VM + /* Catch recursive kernel SLB faults. */ + BUG_ON(local_paca->in_kernel_slb_handler); + local_paca->in_kernel_slb_handler = 1; +#endif + err = slb_allocate_kernel(ea, id); +#ifdef CONFIG_DEBUG_VM + local_paca->in_kernel_slb_handler = 0; +#endif + return err; + } else { + struct mm_struct *mm = current->mm; + long err; + + if (unlikely(!mm)) + return -EFAULT; + + err = slb_allocate_user(mm, ea); + if (!err) + preload_add(current_thread_info(), ea); + + return err; + } +} + +void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) +{ + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, ea); + else + bad_page_fault(regs, ea, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c new file mode 100644 index 000000000000..473dd430e306 --- /dev/null +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -0,0 +1,289 @@ +/* + * Copyright 2007-2008 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Free all pages allocated for subpage protection maps and pointers. + * Also makes sure that the subpage_prot_table structure is + * reinitialized for the next user. 
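+ *
+ * [Editor's illustration, not part of the original patch.]  The map is
+ * a three-level table keyed on the user address; a lookup mirrors the
+ * walk freed below (and in subpage_prot_clear()):
+ *
+ *	spm = (addr < 0x100000000UL) ? spt->low_prot
+ *				     : spt->protptrs[addr >> SBP_L3_SHIFT];
+ *	spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+ *	spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);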
+ */ +void subpage_prot_free(struct mm_struct *mm) +{ + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + unsigned long i, j, addr; + u32 **p; + + if (!spt) + return; + + for (i = 0; i < 4; ++i) { + if (spt->low_prot[i]) { + free_page((unsigned long)spt->low_prot[i]); + spt->low_prot[i] = NULL; + } + } + addr = 0; + for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) { + p = spt->protptrs[i]; + if (!p) + continue; + spt->protptrs[i] = NULL; + for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr; + ++j, addr += PAGE_SIZE) + if (p[j]) + free_page((unsigned long)p[j]); + free_page((unsigned long)p); + } + spt->maxaddr = 0; + kfree(spt); +} + +static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, + int npages) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return; + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return; + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; npages > 0; --npages) { + pte_update(mm, addr, pte, 0, 0, 0); + addr += PAGE_SIZE; + ++pte; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +} + +/* + * Clear the subpage protection map for an address range, allowing + * all accesses that are allowed by the pte permissions. + */ +static void subpage_prot_clear(unsigned long addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + struct subpage_prot_table *spt; + u32 **spm, *spp; + unsigned long i; + size_t nw; + unsigned long next, limit; + + down_write(&mm->mmap_sem); + + spt = mm_ctx_subpage_prot(&mm->context); + if (!spt) + goto err_out; + + limit = addr + len; + if (limit > spt->maxaddr) + limit = spt->maxaddr; + for (; addr < limit; addr = next) { + next = pmd_addr_end(addr, limit); + if (addr < 0x100000000UL) { + spm = spt->low_prot; + } else { + spm = spt->protptrs[addr >> SBP_L3_SHIFT]; + if (!spm) + continue; + } + spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!spp) + continue; + spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); + + i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + nw = PTRS_PER_PTE - i; + if (addr + (nw << PAGE_SHIFT) > next) + nw = (next - addr) >> PAGE_SHIFT; + + memset(spp, 0, nw * sizeof(u32)); + + /* now flush any existing HPTEs for the range */ + hpte_flush_range(mm, addr, nw); + } + +err_out: + up_write(&mm->mmap_sem); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct mm_walk subpage_proto_walk = { + .mm = mm, + .pmd_entry = subpage_walk_pmd_entry, + }; + + /* + * We don't try too hard, we just mark all the vma in that range + * VM_NOHUGEPAGE and split them. 
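subpage_prot_clear() above walks the range one PMD-sized chunk at a time and clamps the number of 64k pages ('nw') to the current chunk. A runnable sketch of just that chunking arithmetic, with illustrative PAGE_SHIFT/PMD_SHIFT/PTRS_PER_PTE values rather than the kernel's configured ones:

#include <stdio.h>

#define PAGE_SHIFT	16	/* 64k pages, for illustration */
#define PMD_SHIFT	(PAGE_SHIFT + 8)
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PMD_MASK	(~(PMD_SIZE - 1))
#define PTRS_PER_PTE	(1UL << 8)

/* Same semantics as the kernel's pmd_addr_end(): next PMD boundary,
 * clamped to 'end'. */
static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long next = (addr + PMD_SIZE) & PMD_MASK;
	return (next - 1 < end - 1) ? next : end;
}

int main(void)
{
	unsigned long addr = 0x12340000UL;
	unsigned long limit = addr + 40 * (1UL << PAGE_SHIFT);
	unsigned long next;

	for (; addr < limit; addr = next) {
		next = pmd_addr_end(addr, limit);
		/* Pages covered in this chunk: the 'nw' computation in
		 * subpage_prot_clear(). */
		unsigned long i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
		unsigned long nw = PTRS_PER_PTE - i;
		if (addr + (nw << PAGE_SHIFT) > next)
			nw = (next - addr) >> PAGE_SHIFT;
		printf("chunk at %#lx: %lu pages\n", addr, nw);
	}
	return 0;
}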
+ */
+ vma = find_vma(mm, addr);
+ /*
+ * If the whole range lies in an unmapped area, just return.
+ */
+ if (vma && ((addr + len) <= vma->vm_start))
+ return;
+
+ while (vma) {
+ if (vma->vm_start >= (addr + len))
+ break;
+ vma->vm_flags |= VM_NOHUGEPAGE;
+ walk_page_vma(vma, &subpage_proto_walk);
+ vma = vma->vm_next;
+ }
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ return;
+}
+#endif
+
+/*
+ * Copy in a subpage protection map for an address range.
+ * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
+ * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
+ * 2 or 3 to prevent all accesses.
+ * Note that the normal page protections also apply; the subpage
+ * protection mechanism is an additional constraint, so putting 0
+ * in a 2-bit field won't allow writes to a page that is otherwise
+ * write-protected.
+ */
+SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
+ unsigned long, len, u32 __user *, map)
+{
+ struct mm_struct *mm = current->mm;
+ struct subpage_prot_table *spt;
+ u32 **spm, *spp;
+ unsigned long i;
+ size_t nw;
+ unsigned long next, limit;
+ int err;
+
+ if (radix_enabled())
+ return -ENOENT;
+
+ /* Check parameters */
+ if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
+ addr >= mm->task_size || len >= mm->task_size ||
+ addr + len > mm->task_size)
+ return -EINVAL;
+
+ if (is_hugepage_only_range(mm, addr, len))
+ return -EINVAL;
+
+ if (!map) {
+ /* Clear out the protection map for the address range */
+ subpage_prot_clear(addr, len);
+ return 0;
+ }
+
+ if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
+ return -EFAULT;
+
+ down_write(&mm->mmap_sem);
+
+ spt = mm_ctx_subpage_prot(&mm->context);
+ if (!spt) {
+ /*
+ * Allocate subpage prot table if not already done.
+ * Do this with mmap_sem held + */ + spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); + if (!spt) { + err = -ENOMEM; + goto out; + } + mm->context.hash_context->spt = spt; + } + + subpage_mark_vma_nohuge(mm, addr, len); + for (limit = addr + len; addr < limit; addr = next) { + next = pmd_addr_end(addr, limit); + err = -ENOMEM; + if (addr < 0x100000000UL) { + spm = spt->low_prot; + } else { + spm = spt->protptrs[addr >> SBP_L3_SHIFT]; + if (!spm) { + spm = (u32 **)get_zeroed_page(GFP_KERNEL); + if (!spm) + goto out; + spt->protptrs[addr >> SBP_L3_SHIFT] = spm; + } + } + spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1); + spp = *spm; + if (!spp) { + spp = (u32 *)get_zeroed_page(GFP_KERNEL); + if (!spp) + goto out; + *spm = spp; + } + spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); + + local_irq_disable(); + demote_segment_4k(mm, addr); + local_irq_enable(); + + i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + nw = PTRS_PER_PTE - i; + if (addr + (nw << PAGE_SHIFT) > next) + nw = (next - addr) >> PAGE_SHIFT; + + up_write(&mm->mmap_sem); + if (__copy_from_user(spp, map, nw * sizeof(u32))) + return -EFAULT; + map += nw; + down_write(&mm->mmap_sem); + + /* now flush any existing HPTEs for the range */ + hpte_flush_range(mm, addr, nw); + } + if (limit > spt->maxaddr) + spt->maxaddr = limit; + err = 0; + out: + up_write(&mm->mmap_sem); + return err; +} diff --git a/arch/powerpc/mm/book3s64/vphn.c b/arch/powerpc/mm/book3s64/vphn.c new file mode 100644 index 000000000000..0ee7734afb50 --- /dev/null +++ b/arch/powerpc/mm/book3s64/vphn.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "vphn.h" + +/* + * The associativity domain numbers are returned from the hypervisor as a + * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the + * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 + * bytes. + * + * --- 16-bit fields --> + * _________________________ + * | 0 | 1 | 2 | 3 | be_packed[0] + * ------+-----+-----+------ + * _________________________ + * | 4 | 5 | 6 | 7 | be_packed[1] + * ------------------------- + * ... + * _________________________ + * | 20 | 21 | 22 | 23 | be_packed[5] + * ------------------------- + * + * Convert to the sequence they would appear in the ibm,associativity property. 
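A toy, byte-order-free version of the unpacking described above: a 16-bit field with the MSB set carries 15 bits of data, an MSB-clear field is the upper half of a 32-bit value completed by the next field, and 0xffff terminates the list. Field values below are made up, and the plpar_hcall9() endianness fixup is omitted.

#include <stdint.h>
#include <stdio.h>

#define FIELD_UNUSED	0xffff	/* list terminator */
#define FIELD_MSB	0x8000	/* set: data in the low 15 bits */

int main(void)
{
	const uint16_t stream[] = { 0x8002, 0x0001, 0x2345, 0x8007, 0xffff };
	uint32_t doms[8];
	int n = 0, pending32 = 0;
	uint16_t last = 0;

	for (unsigned int i = 0; i < sizeof(stream) / sizeof(stream[0]); i++) {
		uint16_t f = stream[i];

		if (pending32) {
			/* 15 bits of 'last' concatenated with this field */
			doms[n++] = (uint32_t)last << 16 | f;
			pending32 = 0;
		} else if (f == FIELD_UNUSED) {
			break;			/* end of the list */
		} else if (f & FIELD_MSB) {
			doms[n++] = f & ~FIELD_MSB;	/* 16-bit field */
		} else {
			last = f;		/* first half of a 32-bit field */
			pending32 = 1;
		}
	}
	for (int i = 0; i < n; i++)
		printf("domain[%d] = 0x%x\n", i, (unsigned)doms[i]);
	return 0;
}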
+ */ +int vphn_unpack_associativity(const long *packed, __be32 *unpacked) +{ + __be64 be_packed[VPHN_REGISTER_COUNT]; + int i, nr_assoc_doms = 0; + const __be16 *field = (const __be16 *) be_packed; + u16 last = 0; + bool is_32bit = false; + +#define VPHN_FIELD_UNUSED (0xffff) +#define VPHN_FIELD_MSB (0x8000) +#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) + + /* Let's fix the values returned by plpar_hcall9() */ + for (i = 0; i < VPHN_REGISTER_COUNT; i++) + be_packed[i] = cpu_to_be64(packed[i]); + + for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { + u16 new = be16_to_cpup(field++); + + if (is_32bit) { + /* + * Let's concatenate the 16 bits of this field to the + * 15 lower bits of the previous field + */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(last << 16 | new); + is_32bit = false; + } else if (new == VPHN_FIELD_UNUSED) + /* This is the list terminator */ + break; + else if (new & VPHN_FIELD_MSB) { + /* Data is in the lower 15 bits of this field */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(new & VPHN_FIELD_MASK); + } else { + /* + * Data is in the lower 15 bits of this field + * concatenated with the next 16 bit field + */ + last = new; + is_32bit = true; + } + } + + /* The first cell contains the length of the property */ + unpacked[0] = cpu_to_be32(nr_assoc_doms); + + return nr_assoc_doms; +} diff --git a/arch/powerpc/mm/book3s64/vphn.h b/arch/powerpc/mm/book3s64/vphn.h new file mode 100644 index 000000000000..f0b93c2dd578 --- /dev/null +++ b/arch/powerpc/mm/book3s64/vphn.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ARCH_POWERPC_MM_VPHN_H_ +#define _ARCH_POWERPC_MM_VPHN_H_ + +/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */ +#define VPHN_REGISTER_COUNT 6 + +/* + * 6 64-bit registers unpacked into up to 24 be32 associativity values. To + * form the complete property we have to add the length in the first cell. + */ +#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) + +extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked); + +#endif diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c deleted file mode 100644 index 6fa6765a10eb..000000000000 --- a/arch/powerpc/mm/hash64_4k.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * - */ - -#include -#include -#include - -int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, int subpg_prot) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned long rflags, pa; - unsigned long old_pte, new_pte; - unsigned long vpn, hash, slot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. Since this is 4K insert of 64K page size - * also add H_PAGE_COMBO - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* - * PP bits. _PAGE_USER is already PP bit 0x2, so we only - * need to add in 0x1 if it's a read-only user page - */ - rflags = htab_convert_pte_flags(new_pte); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - /* - * There MIGHT be an HPTE for this pte - */ - unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize, - rpte, 0); - - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K, - MMU_PAGE_4K, ssize, flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - hash = hpt_hash(vpn, shift, ssize); - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - MMU_PAGE_4K, - MMU_PAGE_4K, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_4K, MMU_PAGE_4K, old_pte); - return -1; - } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); - } - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c deleted file mode 100644 index 3afa253d7f52..000000000000 --- a/arch/powerpc/mm/hash64_64k.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -#include -#include -#include - -/* - * Return true, if the entry has a slot value which - * the software considers as invalid. - */ -static inline bool hpte_soft_invalid(unsigned long hidx) -{ - return ((hidx & 0xfUL) == 0xfUL); -} - -/* - * index from 0 - 15 - */ -bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) -{ - return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index))); -} - -int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, int subpg_prot) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned int subpg_index; - unsigned long rflags, pa; - unsigned long old_pte, new_pte, subpg_pte; - unsigned long vpn, hash, slot, gslot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. Since this is 4K insert of 64K page size - * also add H_PAGE_COMBO - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* - * Handle the subpage protection bits - */ - subpg_pte = new_pte & ~subpg_prot; - rflags = htab_convert_pte_flags(subpg_pte); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { - - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } - - subpg_index = (ea & (PAGE_SIZE - 1)) >> shift; - vpn = hpt_vpn(ea, vsid, ssize); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - /* - *None of the sub 4k page is hashed - */ - if (!(old_pte & H_PAGE_HASHPTE)) - goto htab_insert_hpte; - /* - * Check if the pte was already inserted into the hash table - * as a 64k HW page, and invalidate the 64k HPTE if so. - */ - if (!(old_pte & H_PAGE_COMBO)) { - flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); - /* - * clear the old slot details from the old and new pte. 
- * On hash insert failure we use old pte value and we don't - * want slot information there if we have a insert failure. - */ - old_pte &= ~H_PAGE_HASHPTE; - new_pte &= ~H_PAGE_HASHPTE; - goto htab_insert_hpte; - } - /* - * Check for sub page valid and update - */ - if (__rpte_sub_valid(rpte, subpg_index)) { - int ret; - - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, - subpg_index); - ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize, flags); - - /* - * If we failed because typically the HPTE wasn't really here - * we try an insertion. - */ - if (ret == -1) - goto htab_insert_hpte; - - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; - } - -htab_insert_hpte: - - /* - * Initialize all hidx entries to invalid value, the first time - * the PTE is about to allocate a 4K HPTE. - */ - if (!(old_pte & H_PAGE_COMBO)) - rpte.hidx = INVALID_RPTE_HIDX; - - /* - * handle H_PAGE_4K_PFN case - */ - if (old_pte & H_PAGE_4K_PFN) { - /* - * All the sub 4k page have the same - * physical address. - */ - pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT; - } else { - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - pa += (subpg_index << shift); - } - hash = hpt_hash(vpn, shift, ssize); -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_4K, MMU_PAGE_4K, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - bool soft_invalid; - - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, HPTE_V_SECONDARY, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize); - - soft_invalid = hpte_soft_invalid(slot); - if (unlikely(soft_invalid)) { - /* - * We got a valid slot from a hardware point of view. - * but we cannot use it, because we use this special - * value; as defined by hpte_soft_invalid(), to track - * invalid slots. We cannot use it. So invalidate it. - */ - gslot = slot & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn, - MMU_PAGE_4K, MMU_PAGE_4K, - ssize, 0); - } - - if (unlikely(slot == -1 || soft_invalid)) { - /* - * For soft invalid slot, let's ensure that we release a - * slot from the primary, with the hope that we will - * acquire that slot next time we try. This will ensure - * that we do not get the same soft-invalid slot. - */ - if (soft_invalid || (mftb() & 0x1)) - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_4K, MMU_PAGE_4K, old_pte); - return -1; - } - - new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); - new_pte |= H_PAGE_HASHPTE; - - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} - -int __hash_page_64K(unsigned long ea, unsigned long access, - unsigned long vsid, pte_t *ptep, unsigned long trap, - unsigned long flags, int ssize) -{ - real_pte_t rpte; - unsigned long hpte_group; - unsigned long rflags, pa; - unsigned long old_pte, new_pte; - unsigned long vpn, hash, slot; - unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift; - - /* - * atomically mark the linux large page PTE busy and dirty - */ - do { - pte_t pte = READ_ONCE(*ptep); - - old_pte = pte_val(pte); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - /* - * Check if PTE has the cache-inhibit bit set - * If so, bail out and refault as a 4k page - */ - if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && - unlikely(pte_ci(pte))) - return 0; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access. - */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - rflags = htab_convert_pte_flags(new_pte); - rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE); - - if (cpu_has_feature(CPU_FTR_NOEXECUTE) && - !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - unsigned long gslot; - - /* - * There MIGHT be an HPTE for this pte - */ - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K, - MMU_PAGE_64K, ssize, - flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - hash = hpt_hash(vpn, shift, ssize); - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - MMU_PAGE_64K, MMU_PAGE_64K, - ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - MMU_PAGE_64K, - MMU_PAGE_64K, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - mmu_hash_ops.hpte_remove(hpte_group); - /* - * FIXME!! Should be try the group from which we removed ? - */ - goto repeat; - } - } - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - MMU_PAGE_64K, MMU_PAGE_64K, old_pte); - return -1; - } - - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); - } - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c deleted file mode 100644 index aaa28fd918fe..000000000000 --- a/arch/powerpc/mm/hash_native_64.c +++ /dev/null @@ -1,884 +0,0 @@ -/* - * native hashtable management. - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#undef DEBUG_LOW - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef DEBUG_LOW -#define DBG_LOW(fmt...) udbg_printf(fmt) -#else -#define DBG_LOW(fmt...) -#endif - -#ifdef __BIG_ENDIAN__ -#define HPTE_LOCK_BIT 3 -#else -#define HPTE_LOCK_BIT (56+3) -#endif - -DEFINE_RAW_SPINLOCK(native_tlbie_lock); - -static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) -{ - unsigned long rb; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - - asm volatile("tlbiel %0" : : "r" (rb)); -} - -/* - * tlbiel instruction for hash, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) -{ - unsigned long rb; - unsigned long rs; - unsigned int r = 0; /* hash format */ - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) - : "memory"); -} - - -static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa206(set, is); - - asm volatile("ptesync": : :"memory"); -} - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and any caching of partition table - * entries. Then flush the remaining sets of the TLB. Hash mode uses - * partition scoped TLB translations. - */ - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - for (set = 1; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); - - /* - * Now invalidate the process table cache. - * - * From ISA v3.0B p. 1078: - * The following forms are invalid. - * * PRS=1, R=0, and RIC!=2 (The only process-scoped - * HPT caching is of the Process Table.) 
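For the rb operand built in tlbiel_hash_set_isa206()/isa300() above, a small sketch of the field packing. It assumes the kernel's IBM big-endian bit-numbering helper, PPC_BITLSHIFT(be) == 63 - be, where bit 0 is the most significant bit of the 64-bit register:

#include <stdint.h>
#include <stdio.h>

#define PPC_BITLSHIFT(be)	(63 - (be))

/* Place the set index at IBM bit 51 and the IS field at IBM bit 53,
 * as the tlbiel_hash_set_*() helpers do. */
static uint64_t tlbiel_rb(unsigned int set, unsigned int is)
{
	return ((uint64_t)set << PPC_BITLSHIFT(51)) |
	       ((uint64_t)is << PPC_BITLSHIFT(53));
}

int main(void)
{
	/* e.g. flush set 5 with IS = 3 (all entries): rb = 0x5c00 */
	printf("rb = 0x%016llx\n", (unsigned long long)tlbiel_rb(5, 3));
	return 0;
}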
- */ - tlbiel_hash_set_isa300(0, is, 0, 2, 1); - - asm volatile("ptesync": : :"memory"); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -void hash__tlbiel_all(unsigned int action) -{ - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) - tlbiel_all_isa206(POWER8_TLB_SETS, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) - tlbiel_all_isa206(POWER7_TLB_SETS, is); - else - WARN(1, "%s called on pre-POWER7 CPU\n", __func__); -} - -static inline unsigned long ___tlbie(unsigned long vpn, int psize, - int apsize, int ssize) -{ - unsigned long va; - unsigned int penc; - unsigned long sllp; - - /* - * We need 14 to 65 bits of va for a tlibe of 4K page - * With vpn we ignore the lower VPN_SHIFT bits already. - * And top two bits are already ignored because we can - * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT - * of 12. - */ - va = vpn << VPN_SHIFT; - /* - * clear top 16 bits of 64bit va, non SLS segment - * Older versions of the architecture (2.02 and earler) require the - * masking of the top 16 bits. - */ - if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) - va &= ~(0xffffULL << 48); - - switch (psize) { - case MMU_PAGE_4K: - /* clear out bits after (52) [0....52.....63] */ - va &= ~((1ul << (64 - 52)) - 1); - va |= ssize << 8; - sllp = get_sllp_encoding(apsize); - va |= sllp << 5; - asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) - : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - default: - /* We need 14 to 14 + i bits of va */ - penc = mmu_psize_defs[psize].penc[apsize]; - va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); - va |= penc << 12; - va |= ssize << 8; - /* - * AVAL bits: - * We don't need all the bits, but rest of the bits - * must be ignored by the processor. - * vpn cover upto 65 bits of va. (0...65) and we need - * 58..64 bits of va. - */ - va |= (vpn & 0xfe); /* AVAL */ - va |= 1; /* L */ - asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) - : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - } - return va; -} - -static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize) -{ - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - /* Need the extra ptesync to ensure we don't reorder tlbie*/ - asm volatile("ptesync": : :"memory"); - ___tlbie(vpn, psize, apsize, ssize); - } -} - -static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) -{ - unsigned long rb; - - rb = ___tlbie(vpn, psize, apsize, ssize); - trace_tlbie(0, 0, rb, 0, 0, 0, 0); -} - -static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) -{ - unsigned long va; - unsigned int penc; - unsigned long sllp; - - /* VPN_SHIFT can be atmost 12 */ - va = vpn << VPN_SHIFT; - /* - * clear top 16 bits of 64 bit va, non SLS segment - * Older versions of the architecture (2.02 and earler) require the - * masking of the top 16 bits. 
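A stripped-down sketch of the MMU_PAGE_4K branch of the VA encoding in ___tlbie() above: drop the in-page offset, then merge the segment size at bit 8 and the SLLP page-size encoding at bit 5. The MMU_FTR_TLBIE_CROP_VA top-16-bit masking and the vpn-to-va conversion are omitted here.

#include <stdint.h>
#include <stdio.h>

static uint64_t tlbie_va_4k(uint64_t va, unsigned int ssize, unsigned int sllp)
{
	va &= ~((1UL << 12) - 1);	/* clear bits below 2^12 */
	va |= (uint64_t)ssize << 8;	/* segment size: 0 = 256M, 1 = 1T */
	va |= (uint64_t)sllp << 5;	/* SLB L||LP encoding of the page size */
	return va;
}

int main(void)
{
	printf("va = 0x%016llx\n",
	       (unsigned long long)tlbie_va_4k(0x12345678UL, 1, 5));
	return 0;
}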
- */ - if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA)) - va &= ~(0xffffULL << 48); - - switch (psize) { - case MMU_PAGE_4K: - /* clear out bits after(52) [0....52.....63] */ - va &= ~((1ul << (64 - 52)) - 1); - va |= ssize << 8; - sllp = get_sllp_encoding(apsize); - va |= sllp << 5; - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) - : : "r" (va), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - default: - /* We need 14 to 14 + i bits of va */ - penc = mmu_psize_defs[psize].penc[apsize]; - va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); - va |= penc << 12; - va |= ssize << 8; - /* - * AVAL bits: - * We don't need all the bits, but rest of the bits - * must be ignored by the processor. - * vpn cover upto 65 bits of va. (0...65) and we need - * 58..64 bits of va. - */ - va |= (vpn & 0xfe); - va |= 1; /* L */ - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) - : : "r" (va), "i" (CPU_FTR_ARCH_206) - : "memory"); - break; - } - trace_tlbie(0, 1, va, 0, 0, 0, 0); - -} - -static inline void tlbie(unsigned long vpn, int psize, int apsize, - int ssize, int local) -{ - unsigned int use_local; - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - - use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); - - if (use_local) - use_local = mmu_psize_defs[psize].tlbiel; - if (lock_tlbie && !use_local) - raw_spin_lock(&native_tlbie_lock); - asm volatile("ptesync": : :"memory"); - if (use_local) { - __tlbiel(vpn, psize, apsize, ssize); - asm volatile("ptesync": : :"memory"); - } else { - __tlbie(vpn, psize, apsize, ssize); - fixup_tlbie(vpn, psize, apsize, ssize); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); - } - if (lock_tlbie && !use_local) - raw_spin_unlock(&native_tlbie_lock); -} - -static inline void native_lock_hpte(struct hash_pte *hptep) -{ - unsigned long *word = (unsigned long *)&hptep->v; - - while (1) { - if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) - break; - spin_begin(); - while(test_bit(HPTE_LOCK_BIT, word)) - spin_cpu_relax(); - spin_end(); - } -} - -static inline void native_unlock_hpte(struct hash_pte *hptep) -{ - unsigned long *word = (unsigned long *)&hptep->v; - - clear_bit_unlock(HPTE_LOCK_BIT, word); -} - -static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, - unsigned long pa, unsigned long rflags, - unsigned long vflags, int psize, int apsize, int ssize) -{ - struct hash_pte *hptep = htab_address + hpte_group; - unsigned long hpte_v, hpte_r; - int i; - - if (!(vflags & HPTE_V_BOLTED)) { - DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," - " rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, vpn, pa, rflags, vflags, psize); - } - - for (i = 0; i < HPTES_PER_GROUP; i++) { - if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { - /* retry with lock held */ - native_lock_hpte(hptep); - if (! 
(be64_to_cpu(hptep->v) & HPTE_V_VALID)) - break; - native_unlock_hpte(hptep); - } - - hptep++; - } - - if (i == HPTES_PER_GROUP) - return -1; - - hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; - - if (!(vflags & HPTE_V_BOLTED)) { - DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", - i, hpte_v, hpte_r); - } - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); - hpte_v = hpte_old_to_new_v(hpte_v); - } - - hptep->r = cpu_to_be64(hpte_r); - /* Guarantee the second dword is visible before the valid bit */ - eieio(); - /* - * Now set the first dword including the valid bit - * NOTE: this also unlocks the hpte - */ - hptep->v = cpu_to_be64(hpte_v); - - __asm__ __volatile__ ("ptesync" : : : "memory"); - - return i | (!!(vflags & HPTE_V_SECONDARY) << 3); -} - -static long native_hpte_remove(unsigned long hpte_group) -{ - struct hash_pte *hptep; - int i; - int slot_offset; - unsigned long hpte_v; - - DBG_LOW(" remove(group=%lx)\n", hpte_group); - - /* pick a random entry to start at */ - slot_offset = mftb() & 0x7; - - for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + hpte_group + slot_offset; - hpte_v = be64_to_cpu(hptep->v); - - if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { - /* retry with lock held */ - native_lock_hpte(hptep); - hpte_v = be64_to_cpu(hptep->v); - if ((hpte_v & HPTE_V_VALID) - && !(hpte_v & HPTE_V_BOLTED)) - break; - native_unlock_hpte(hptep); - } - - slot_offset++; - slot_offset &= 0x7; - } - - if (i == HPTES_PER_GROUP) - return -1; - - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - - return i; -} - -static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long vpn, int bpsize, - int apsize, int ssize, unsigned long flags) -{ - struct hash_pte *hptep = htab_address + slot; - unsigned long hpte_v, want_v; - int ret = 0, local = 0; - - want_v = hpte_encode_avpn(vpn, bpsize, ssize); - - DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", - vpn, want_v & HPTE_V_AVPN, slot, newpp); - - hpte_v = hpte_get_old_v(hptep); - /* - * We need to invalidate the TLB always because hpte_remove doesn't do - * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less - * random entry from it. When we do that we don't invalidate the TLB - * (hpte_remove) because we assume the old translation is still - * technically "valid". 
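The eviction scan in native_hpte_remove() above, reduced to a freestanding sketch: start at a pseudo-random offset (the kernel seeds it from the timebase) and walk the 8-entry group circularly for a valid, non-bolted victim. The flag bits are illustrative and the lock-and-recheck step is omitted.

#include <stdint.h>

#define HPTES_PER_GROUP	8
#define V_VALID		(1UL << 0)	/* illustrative flag bits */
#define V_BOLTED	(1UL << 1)

static int pick_victim(const uint64_t group[HPTES_PER_GROUP], unsigned int seed)
{
	unsigned int offset = seed & 0x7;	/* random starting slot */

	for (int i = 0; i < HPTES_PER_GROUP; i++) {
		uint64_t v = group[offset];

		if ((v & V_VALID) && !(v & V_BOLTED))
			return offset;		/* evict this one */
		offset = (offset + 1) & 0x7;	/* wrap around the group */
	}
	return -1;	/* whole group is bolted or invalid */
}

int main(void)
{
	uint64_t group[HPTES_PER_GROUP] = {
		V_VALID | V_BOLTED, V_VALID, V_VALID, 0,
		V_VALID, V_VALID | V_BOLTED, V_VALID, V_VALID,
	};
	return pick_victim(group, 0x6) != 6;	/* picks slot 6 */
}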
- */ - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { - DBG_LOW(" -> miss\n"); - ret = -1; - } else { - native_lock_hpte(hptep); - /* recheck with locks held */ - hpte_v = hpte_get_old_v(hptep); - if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || - !(hpte_v & HPTE_V_VALID))) { - ret = -1; - } else { - DBG_LOW(" -> hit\n"); - /* Update the HPTE */ - hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PPP | HPTE_R_N)) | - (newpp & (HPTE_R_PPP | HPTE_R_N | - HPTE_R_C))); - } - native_unlock_hpte(hptep); - } - - if (flags & HPTE_LOCAL_UPDATE) - local = 1; - /* - * Ensure it is out of the tlb too if it is not a nohpte fault - */ - if (!(flags & HPTE_NOHPTE_UPDATE)) - tlbie(vpn, bpsize, apsize, ssize, local); - - return ret; -} - -static long native_hpte_find(unsigned long vpn, int psize, int ssize) -{ - struct hash_pte *hptep; - unsigned long hash; - unsigned long i; - long slot; - unsigned long want_v, hpte_v; - - hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); - - /* Bolted mappings are only ever in the primary group */ - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - for (i = 0; i < HPTES_PER_GROUP; i++) { - - hptep = htab_address + slot; - hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) - /* HPTE matches */ - return slot; - ++slot; - } - - return -1; -} - -/* - * Update the page protection bits. Intended to be used to create - * guard pages for kernel data structures on pages which are bolted - * in the HPT. Assumes pages being operated on will not be stolen. - * - * No need to lock here because we should be the only user. - */ -static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, - int psize, int ssize) -{ - unsigned long vpn; - unsigned long vsid; - long slot; - struct hash_pte *hptep; - - vsid = get_kernel_vsid(ea, ssize); - vpn = hpt_vpn(ea, vsid, ssize); - - slot = native_hpte_find(vpn, psize, ssize); - if (slot == -1) - panic("could not find page to bolt\n"); - hptep = htab_address + slot; - - /* Update the HPTE */ - hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PPP | HPTE_R_N)) | - (newpp & (HPTE_R_PPP | HPTE_R_N))); - /* - * Ensure it is out of the tlb too. Bolted entries base and - * actual page size will be same. - */ - tlbie(vpn, psize, psize, ssize, 0); -} - -/* - * Remove a bolted kernel entry. Memory hotplug uses this. - * - * No need to lock here because we should be the only user. 
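The read-modify-write done by native_hpte_updateboltedpp() above, in isolation: clear the protection and no-execute bits, merge in the new ones, and leave every other RPN/attribute bit untouched. The mask values below are illustrative stand-ins for HPTE_R_PPP and HPTE_R_N.

#include <stdint.h>

#define R_PPP	0x8000000000000003UL	/* pp0 plus the low pp bits */
#define R_N	0x0000000000000004UL	/* no-execute */

static uint64_t update_prot(uint64_t hpte_r, uint64_t newpp)
{
	return (hpte_r & ~(R_PPP | R_N)) | (newpp & (R_PPP | R_N));
}

int main(void)
{
	uint64_t r = update_prot(0xdeadb003UL, 0x2);
	return r != 0xdeadb002UL;	/* PP bits swapped, rest intact */
}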
- */ -static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) -{ - unsigned long vpn; - unsigned long vsid; - long slot; - struct hash_pte *hptep; - - vsid = get_kernel_vsid(ea, ssize); - vpn = hpt_vpn(ea, vsid, ssize); - - slot = native_hpte_find(vpn, psize, ssize); - if (slot == -1) - return -ENOENT; - - hptep = htab_address + slot; - - VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); - - /* Invalidate the hpte */ - hptep->v = 0; - - /* Invalidate the TLB */ - tlbie(vpn, psize, psize, ssize, 0); - return 0; -} - - -static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, - int bpsize, int apsize, int ssize, int local) -{ - struct hash_pte *hptep = htab_address + slot; - unsigned long hpte_v; - unsigned long want_v; - unsigned long flags; - - local_irq_save(flags); - - DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); - - want_v = hpte_encode_avpn(vpn, bpsize, ssize); - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - native_lock_hpte(hptep); - /* recheck with locks held */ - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - else - native_unlock_hpte(hptep); - } - /* - * We need to invalidate the TLB always because hpte_remove doesn't do - * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less - * random entry from it. When we do that we don't invalidate the TLB - * (hpte_remove) because we assume the old translation is still - * technically "valid". - */ - tlbie(vpn, bpsize, apsize, ssize, local); - - local_irq_restore(flags); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void native_hugepage_invalidate(unsigned long vsid, - unsigned long addr, - unsigned char *hpte_slot_array, - int psize, int ssize, int local) -{ - int i; - struct hash_pte *hptep; - int actual_psize = MMU_PAGE_16M; - unsigned int max_hpte_count, valid; - unsigned long flags, s_addr = addr; - unsigned long hpte_v, want_v, shift; - unsigned long hidx, vpn = 0, hash, slot; - - shift = mmu_psize_defs[psize].shift; - max_hpte_count = 1U << (PMD_SHIFT - shift); - - local_irq_save(flags); - for (i = 0; i < max_hpte_count; i++) { - valid = hpte_valid(hpte_slot_array, i); - if (!valid) - continue; - hidx = hpte_hash_index(hpte_slot_array, i); - - /* get the vpn */ - addr = s_addr + (i * (1ul << shift)); - vpn = hpt_vpn(addr, vsid, ssize); - hash = hpt_hash(vpn, shift, ssize); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - - hptep = htab_address + slot; - want_v = hpte_encode_avpn(vpn, psize, ssize); - hpte_v = hpte_get_old_v(hptep); - - /* Even if we miss, we need to invalidate the TLB */ - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* recheck with locks held */ - native_lock_hpte(hptep); - hpte_v = hpte_get_old_v(hptep); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. 
NOTE: this also unlocks it - */ - - hptep->v = 0; - } else - native_unlock_hpte(hptep); - } - /* - * We need to do tlb invalidate for all the address, tlbie - * instruction compares entry_VA in tlb with the VA specified - * here - */ - tlbie(vpn, psize, actual_psize, ssize, local); - } - local_irq_restore(flags); -} -#else -static void native_hugepage_invalidate(unsigned long vsid, - unsigned long addr, - unsigned char *hpte_slot_array, - int psize, int ssize, int local) -{ - WARN(1, "%s called without THP support\n", __func__); -} -#endif - -static void hpte_decode(struct hash_pte *hpte, unsigned long slot, - int *psize, int *apsize, int *ssize, unsigned long *vpn) -{ - unsigned long avpn, pteg, vpi; - unsigned long hpte_v = be64_to_cpu(hpte->v); - unsigned long hpte_r = be64_to_cpu(hpte->r); - unsigned long vsid, seg_off; - int size, a_size, shift; - /* Look at the 8 bit LP value */ - unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); - hpte_r = hpte_new_to_old_r(hpte_r); - } - if (!(hpte_v & HPTE_V_LARGE)) { - size = MMU_PAGE_4K; - a_size = MMU_PAGE_4K; - } else { - size = hpte_page_sizes[lp] & 0xf; - a_size = hpte_page_sizes[lp] >> 4; - } - /* This works for all page sizes, and for 256M and 1T segments */ - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; - shift = mmu_psize_defs[size].shift; - - avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); - pteg = slot / HPTES_PER_GROUP; - if (hpte_v & HPTE_V_SECONDARY) - pteg = ~pteg; - - switch (*ssize) { - case MMU_SEGSIZE_256M: - /* We only have 28 - 23 bits of seg_off in avpn */ - seg_off = (avpn & 0x1f) << 23; - vsid = avpn >> 5; - /* We can find more bits from the pteg value */ - if (shift < 23) { - vpi = (vsid ^ pteg) & htab_hash_mask; - seg_off |= vpi << shift; - } - *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; - break; - case MMU_SEGSIZE_1T: - /* We only have 40 - 23 bits of seg_off in avpn */ - seg_off = (avpn & 0x1ffff) << 23; - vsid = avpn >> 17; - if (shift < 23) { - vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; - seg_off |= vpi << shift; - } - *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; - break; - default: - *vpn = size = 0; - } - *psize = size; - *apsize = a_size; -} - -/* - * clear all mappings on kexec. All cpus are in real mode (or they will - * be when they isi), and we are the only one left. We rely on our kernel - * mapping being 0xC0's and the hardware ignoring those two real bits. - * - * This must be called with interrupts disabled. - * - * Taking the native_tlbie_lock is unsafe here due to the possibility of - * lockdep being on. On pre POWER5 hardware, not taking the lock could - * cause deadlock. POWER5 and newer not taking the lock is fine. This only - * gets called during boot before secondary CPUs have come up and during - * crashdump and all bets are off anyway. - * - * TODO: add batching support when enabled. remember, no dynamic memory here, - * although there is the control page available... - */ -static void native_hpte_clear(void) -{ - unsigned long vpn = 0; - unsigned long slot, slots; - struct hash_pte *hptep = htab_address; - unsigned long hpte_v; - unsigned long pteg_count; - int psize, apsize, ssize; - - pteg_count = htab_hash_mask + 1; - - slots = pteg_count * HPTES_PER_GROUP; - - for (slot = 0; slot < slots; slot++, hptep++) { - /* - * we could lock the pte here, but we are the only cpu - * running, right? 
and for crash dump, we probably - * don't want to wait for a maybe bad cpu. - */ - hpte_v = be64_to_cpu(hptep->v); - - /* - * Call __tlbie() here rather than tlbie() since we can't take the - * native_tlbie_lock. - */ - if (hpte_v & HPTE_V_VALID) { - hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); - hptep->v = 0; - ___tlbie(vpn, psize, apsize, ssize); - } - } - - asm volatile("eieio; tlbsync; ptesync":::"memory"); -} - -/* - * Batched hash table flush, we batch the tlbie's to avoid taking/releasing - * the lock all the time - */ -static void native_flush_hash_range(unsigned long number, int local) -{ - unsigned long vpn = 0; - unsigned long hash, index, hidx, shift, slot; - struct hash_pte *hptep; - unsigned long hpte_v; - unsigned long want_v; - unsigned long flags; - real_pte_t pte; - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); - unsigned long psize = batch->psize; - int ssize = batch->ssize; - int i; - unsigned int use_local; - - use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && - mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use(); - - local_irq_save(flags); - - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - hptep = htab_address + slot; - want_v = hpte_encode_avpn(vpn, psize, ssize); - hpte_v = hpte_get_old_v(hptep); - - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - continue; - /* lock and try again */ - native_lock_hpte(hptep); - hpte_v = hpte_get_old_v(hptep); - - if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) - native_unlock_hpte(hptep); - else - hptep->v = 0; - - } pte_iterate_hashed_end(); - } - - if (use_local) { - asm volatile("ptesync":::"memory"); - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, - vpn, index, shift) { - __tlbiel(vpn, psize, psize, ssize); - } pte_iterate_hashed_end(); - } - asm volatile("ptesync":::"memory"); - } else { - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - - if (lock_tlbie) - raw_spin_lock(&native_tlbie_lock); - - asm volatile("ptesync":::"memory"); - for (i = 0; i < number; i++) { - vpn = batch->vpn[i]; - pte = batch->pte[i]; - - pte_iterate_hashed_subpages(pte, psize, - vpn, index, shift) { - __tlbie(vpn, psize, psize, ssize); - } pte_iterate_hashed_end(); - } - /* - * Just do one more with the last used values. 
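The shape of native_flush_hash_range() above, with hypothetical stubs standing in for ptesync/tlbie/tlbsync, to show what the batching amortizes: the synchronizing barriers (and, when needed, the global tlbie lock) are paid once per batch rather than once per page.

#include <stddef.h>

static void sync_before(void) { }		/* stands in for ptesync */
static void inval_one(unsigned long vpn) { (void)vpn; } /* stands in for tlbie */
static void sync_after(void) { }		/* eieio; tlbsync; ptesync */

static void flush_batch(const unsigned long *vpns, size_t n)
{
	sync_before();			/* one barrier for the whole batch */
	for (size_t i = 0; i < n; i++)
		inval_one(vpns[i]);	/* per-entry invalidate, no barriers */
	sync_after();			/* one closing barrier sequence */
}

int main(void)
{
	unsigned long vpns[3] = { 0x100, 0x101, 0x102 };
	flush_batch(vpns, 3);
	return 0;
}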
- */ - fixup_tlbie(vpn, psize, psize, ssize); - asm volatile("eieio; tlbsync; ptesync":::"memory"); - - if (lock_tlbie) - raw_spin_unlock(&native_tlbie_lock); - } - - local_irq_restore(flags); -} - -void __init hpte_init_native(void) -{ - mmu_hash_ops.hpte_invalidate = native_hpte_invalidate; - mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; - mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; - mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; - mmu_hash_ops.hpte_insert = native_hpte_insert; - mmu_hash_ops.hpte_remove = native_hpte_remove; - mmu_hash_ops.hpte_clear_all = native_hpte_clear; - mmu_hash_ops.flush_hash_range = native_flush_hash_range; - mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate; -} diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c deleted file mode 100644 index 6eb89643ce58..000000000000 --- a/arch/powerpc/mm/hash_utils_64.c +++ /dev/null @@ -1,1930 +0,0 @@ -/* - * PowerPC64 port by Mike Corrigan and Dave Engebretsen - * {mikejc|engebret}@us.ibm.com - * - * Copyright (c) 2000 Mike Corrigan - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard , IBM - * - * Module name: htab.c - * - * Description: - * PowerPC Hashed Page Table functions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#undef DEBUG -#undef DEBUG_LOW - -#define pr_fmt(fmt) "hash-mmu: " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif - -#ifdef DEBUG_LOW -#define DBG_LOW(fmt...) udbg_printf(fmt) -#else -#define DBG_LOW(fmt...) -#endif - -#define KB (1024) -#define MB (1024*KB) -#define GB (1024L*MB) - -/* - * Note: pte --> Linux PTE - * HPTE --> PowerPC Hashed Page Table Entry - * - * Execution context: - * htab_initialize is called with the MMU off (of course), but - * the kernel has been copied down to zero so it can directly - * reference global data. At this point it is very difficult - * to print debug info. 
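The mmu_hash_ops registration in hpte_init_native() above is a plain ops-table pattern: each backend (bare metal, hypervisor hcalls, and so on) fills in the same function-pointer table and the rest of the MM code calls through it. A cut-down sketch with a hypothetical two-entry table:

#include <stdio.h>

struct hash_ops {
	long (*insert)(unsigned long group, unsigned long vpn);
	void (*invalidate)(unsigned long slot);
};

static long native_insert(unsigned long group, unsigned long vpn)
{
	printf("insert vpn %#lx into group %#lx\n", vpn, group);
	return 0;
}

static void native_invalidate(unsigned long slot)
{
	printf("invalidate slot %#lx\n", slot);
}

static struct hash_ops hash_ops;

static void hpte_init(void)		/* analogue of hpte_init_native() */
{
	hash_ops.insert = native_insert;
	hash_ops.invalidate = native_invalidate;
}

int main(void)
{
	hpte_init();
	hash_ops.insert(0x10, 0xdeadb);
	hash_ops.invalidate(0x12);
	return 0;
}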
- * - */ - -static unsigned long _SDR1; -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -EXPORT_SYMBOL_GPL(mmu_psize_defs); - -u8 hpte_page_sizes[1 << LP_BITS]; -EXPORT_SYMBOL_GPL(hpte_page_sizes); - -struct hash_pte *htab_address; -unsigned long htab_size_bytes; -unsigned long htab_hash_mask; -EXPORT_SYMBOL_GPL(htab_hash_mask); -int mmu_linear_psize = MMU_PAGE_4K; -EXPORT_SYMBOL_GPL(mmu_linear_psize); -int mmu_virtual_psize = MMU_PAGE_4K; -int mmu_vmalloc_psize = MMU_PAGE_4K; -#ifdef CONFIG_SPARSEMEM_VMEMMAP -int mmu_vmemmap_psize = MMU_PAGE_4K; -#endif -int mmu_io_psize = MMU_PAGE_4K; -int mmu_kernel_ssize = MMU_SEGSIZE_256M; -EXPORT_SYMBOL_GPL(mmu_kernel_ssize); -int mmu_highuser_ssize = MMU_SEGSIZE_256M; -u16 mmu_slb_size = 64; -EXPORT_SYMBOL_GPL(mmu_slb_size); -#ifdef CONFIG_PPC_64K_PAGES -int mmu_ci_restrictions; -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC -static u8 *linear_map_hash_slots; -static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ -struct mmu_hash_ops mmu_hash_ops; -EXPORT_SYMBOL(mmu_hash_ops); - -/* There are definitions of page sizes arrays to be used when none - * is provided by the firmware. - */ - -/* - * Fallback (4k pages only) - */ -static struct mmu_psize_def mmu_psize_defaults[] = { - [MMU_PAGE_4K] = { - .shift = 12, - .sllp = 0, - .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, - .avpnm = 0, - .tlbiel = 0, - }, -}; - -/* POWER4, GPUL, POWER5 - * - * Support for 16Mb large pages - */ -static struct mmu_psize_def mmu_psize_defaults_gp[] = { - [MMU_PAGE_4K] = { - .shift = 12, - .sllp = 0, - .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, - .avpnm = 0, - .tlbiel = 1, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .sllp = SLB_VSID_L, - .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, - [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, - .avpnm = 0x1UL, - .tlbiel = 0, - }, -}; - -/* - * 'R' and 'C' update notes: - * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* - * create writeable HPTEs without C set, because the hcall H_PROTECT - * that we use in that case will not update C - * - The above is however not a problem, because we also don't do that - * fancy "no flush" variant of eviction and we use H_REMOVE which will - * do the right thing and thus we don't have the race I described earlier - * - * - Under bare metal, we do have the race, so we need R and C set - * - We make sure R is always set and never lost - * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping - */ -unsigned long htab_convert_pte_flags(unsigned long pteflags) -{ - unsigned long rflags = 0; - - /* _PAGE_EXEC -> NOEXEC */ - if ((pteflags & _PAGE_EXEC) == 0) - rflags |= HPTE_R_N; - /* - * PPP bits: - * Linux uses slb key 0 for kernel and 1 for user. - * kernel RW areas are mapped with PPP=0b000 - * User area is mapped with PPP=0b010 for read/write - * or PPP=0b011 for read-only (including writeable but clean pages). - */ - if (pteflags & _PAGE_PRIVILEGED) { - /* - * Kernel read only mapped with ppp bits 0b110 - */ - if (!(pteflags & _PAGE_WRITE)) { - if (mmu_has_feature(MMU_FTR_KERNEL_RO)) - rflags |= (HPTE_R_PP0 | 0x2); - else - rflags |= 0x3; - } - } else { - if (pteflags & _PAGE_RWX) - rflags |= 0x2; - if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) - rflags |= 0x1; - } - /* - * We can't allow hardware to update hpte bits. 
Hence always - * set 'R' bit and set 'C' if it is a write fault - */ - rflags |= HPTE_R_R; - - if (pteflags & _PAGE_DIRTY) - rflags |= HPTE_R_C; - /* - * Add in WIG bits - */ - - if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) - rflags |= HPTE_R_I; - else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) - rflags |= (HPTE_R_I | HPTE_R_G); - else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) - rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); - else - /* - * Add memory coherence if cache inhibited is not set - */ - rflags |= HPTE_R_M; - - rflags |= pte_to_hpte_pkey_bits(pteflags); - return rflags; -} - -int htab_bolt_mapping(unsigned long vstart, unsigned long vend, - unsigned long pstart, unsigned long prot, - int psize, int ssize) -{ - unsigned long vaddr, paddr; - unsigned int step, shift; - int ret = 0; - - shift = mmu_psize_defs[psize].shift; - step = 1 << shift; - - prot = htab_convert_pte_flags(prot); - - DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", - vstart, vend, pstart, prot, psize, ssize); - - for (vaddr = vstart, paddr = pstart; vaddr < vend; - vaddr += step, paddr += step) { - unsigned long hash, hpteg; - unsigned long vsid = get_kernel_vsid(vaddr, ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); - unsigned long tprot = prot; - - /* - * If we hit a bad address return error. - */ - if (!vsid) - return -1; - /* Make kernel text executable */ - if (overlaps_kernel_text(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - /* Make kvm guest trampolines executable */ - if (overlaps_kvm_tmp(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - /* - * If relocatable, check if it overlaps interrupt vectors that - * are copied down to real 0. For relocatable kernel - * (e.g. kdump case) we copy interrupt vectors down to real - * address 0. Mark that region as executable. This is - * because on p8 system with relocation on exception feature - * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence - * in order to execute the interrupt handlers in virtual - * mode the vector region need to be marked as executable. - */ - if ((PHYSICAL_START > MEMORY_START) && - overlaps_interrupt_vector_text(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - - hash = hpt_hash(vpn, shift, ssize); - hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - - BUG_ON(!mmu_hash_ops.hpte_insert); - ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, - HPTE_V_BOLTED, psize, psize, - ssize); - - if (ret < 0) - break; - -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - (paddr >> PAGE_SHIFT) < linear_map_hash_count) - linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; -#endif /* CONFIG_DEBUG_PAGEALLOC */ - } - return ret < 0 ? 
ret : 0; -} - -int htab_remove_mapping(unsigned long vstart, unsigned long vend, - int psize, int ssize) -{ - unsigned long vaddr; - unsigned int step, shift; - int rc; - int ret = 0; - - shift = mmu_psize_defs[psize].shift; - step = 1 << shift; - - if (!mmu_hash_ops.hpte_removebolted) - return -ENODEV; - - for (vaddr = vstart; vaddr < vend; vaddr += step) { - rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); - if (rc == -ENOENT) { - ret = -ENOENT; - continue; - } - if (rc < 0) - return rc; - } - - return ret; -} - -static bool disable_1tb_segments = false; - -static int __init parse_disable_1tb_segments(char *p) -{ - disable_1tb_segments = true; - return 0; -} -early_param("disable_1tb_segments", parse_disable_1tb_segments); - -static int __init htab_dt_scan_seg_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - int size = 0; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); - if (prop == NULL) - return 0; - for (; size >= 4; size -= 4, ++prop) { - if (be32_to_cpu(prop[0]) == 40) { - DBG("1T segment support detected\n"); - - if (disable_1tb_segments) { - DBG("1T segments disabled by command line\n"); - break; - } - - cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; - return 1; - } - } - cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; - return 0; -} - -static int __init get_idx_from_shift(unsigned int shift) -{ - int idx = -1; - - switch (shift) { - case 0xc: - idx = MMU_PAGE_4K; - break; - case 0x10: - idx = MMU_PAGE_64K; - break; - case 0x14: - idx = MMU_PAGE_1M; - break; - case 0x18: - idx = MMU_PAGE_16M; - break; - case 0x22: - idx = MMU_PAGE_16G; - break; - } - return idx; -} - -static int __init htab_dt_scan_page_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - int size = 0; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); - if (!prop) - return 0; - - pr_info("Page sizes from device-tree:\n"); - size /= 4; - cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); - while(size > 0) { - unsigned int base_shift = be32_to_cpu(prop[0]); - unsigned int slbenc = be32_to_cpu(prop[1]); - unsigned int lpnum = be32_to_cpu(prop[2]); - struct mmu_psize_def *def; - int idx, base_idx; - - size -= 3; prop += 3; - base_idx = get_idx_from_shift(base_shift); - if (base_idx < 0) { - /* skip the pte encoding also */ - prop += lpnum * 2; size -= lpnum * 2; - continue; - } - def = &mmu_psize_defs[base_idx]; - if (base_idx == MMU_PAGE_16M) - cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; - - def->shift = base_shift; - if (base_shift <= 23) - def->avpnm = 0; - else - def->avpnm = (1 << (base_shift - 23)) - 1; - def->sllp = slbenc; - /* - * We don't know for sure what's up with tlbiel, so - * for now we only set it for 4K and 64K pages - */ - if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) - def->tlbiel = 1; - else - def->tlbiel = 0; - - while (size > 0 && lpnum) { - unsigned int shift = be32_to_cpu(prop[0]); - int penc = be32_to_cpu(prop[1]); - - prop += 2; size -= 2; - lpnum--; - - idx = get_idx_from_shift(shift); - if (idx < 0) - continue; - - if (penc == -1) - pr_err("Invalid penc for base_shift=%d " - 
"shift=%d\n", base_shift, shift); - - def->penc[idx] = penc; - pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," - " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", - base_shift, shift, def->sllp, - def->avpnm, def->tlbiel, def->penc[idx]); - } - } - - return 1; -} - -#ifdef CONFIG_HUGETLB_PAGE -/* Scan for 16G memory blocks that have been set aside for huge pages - * and reserve those blocks for 16G huge pages. - */ -static int __init htab_dt_scan_hugepage_blocks(unsigned long node, - const char *uname, int depth, - void *data) { - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be64 *addr_prop; - const __be32 *page_count_prop; - unsigned int expected_pages; - long unsigned int phys_addr; - long unsigned int block_size; - - /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) - return 0; - - /* This property is the log base 2 of the number of virtual pages that - * will represent this memory block. */ - page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); - if (page_count_prop == NULL) - return 0; - expected_pages = (1 << be32_to_cpu(page_count_prop[0])); - addr_prop = of_get_flat_dt_prop(node, "reg", NULL); - if (addr_prop == NULL) - return 0; - phys_addr = be64_to_cpu(addr_prop[0]); - block_size = be64_to_cpu(addr_prop[1]); - if (block_size != (16 * GB)) - return 0; - printk(KERN_INFO "Huge page(16GB) memory: " - "addr = 0x%lX size = 0x%lX pages = %d\n", - phys_addr, block_size, expected_pages); - if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { - memblock_reserve(phys_addr, block_size * expected_pages); - pseries_add_gpage(phys_addr, block_size, expected_pages); - } - return 0; -} -#endif /* CONFIG_HUGETLB_PAGE */ - -static void mmu_psize_set_default_penc(void) -{ - int bpsize, apsize; - for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) - for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) - mmu_psize_defs[bpsize].penc[apsize] = -1; -} - -#ifdef CONFIG_PPC_64K_PAGES - -static bool might_have_hea(void) -{ - /* - * The HEA ethernet adapter requires awareness of the - * GX bus. Without that awareness we can easily assume - * we will never see an HEA ethernet device. - */ -#ifdef CONFIG_IBMEBUS - return !cpu_has_feature(CPU_FTR_ARCH_207S) && - firmware_has_feature(FW_FEATURE_SPLPAR); -#else - return false; -#endif -} - -#endif /* #ifdef CONFIG_PPC_64K_PAGES */ - -static void __init htab_scan_page_sizes(void) -{ - int rc; - - /* se the invalid penc to -1 */ - mmu_psize_set_default_penc(); - - /* Default to 4K pages only */ - memcpy(mmu_psize_defs, mmu_psize_defaults, - sizeof(mmu_psize_defaults)); - - /* - * Try to find the available page sizes in the device-tree - */ - rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); - if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) { - /* - * Nothing in the device-tree, but the CPU supports 16M pages, - * so let's fallback on a known size list for 16M capable CPUs. - */ - memcpy(mmu_psize_defs, mmu_psize_defaults_gp, - sizeof(mmu_psize_defaults_gp)); - } - -#ifdef CONFIG_HUGETLB_PAGE - if (!hugetlb_disabled) { - /* Reserve 16G huge page memory sections for huge pages */ - of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); - } -#endif /* CONFIG_HUGETLB_PAGE */ -} - -/* - * Fill in the hpte_page_sizes[] array. - * We go through the mmu_psize_defs[] array looking for all the - * supported base/actual page size combinations. Each combination - * has a unique pagesize encoding (penc) value in the low bits of - * the LP field of the HPTE. 
For actual page sizes less than 1MB, - * some of the upper LP bits are used for RPN bits, meaning that - * we need to fill in several entries in hpte_page_sizes[]. - * - * In diagrammatic form, with r = RPN bits and z = page size bits: - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ... - * - * The zzzz bits are implementation-specific but are chosen so that - * no encoding for a larger page size uses the same value in its - * low-order N bits as the encoding for the 2^(12+N) byte page size - * (if it exists). - */ -static void init_hpte_page_sizes(void) -{ - long int ap, bp; - long int shift, penc; - - for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { - if (!mmu_psize_defs[bp].shift) - continue; /* not a supported page size */ - for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { - penc = mmu_psize_defs[bp].penc[ap]; - if (penc == -1 || !mmu_psize_defs[ap].shift) - continue; - shift = mmu_psize_defs[ap].shift - LP_SHIFT; - if (shift <= 0) - continue; /* should never happen */ - /* - * For page sizes less than 1MB, this loop - * replicates the entry for all possible values - * of the rrrr bits. - */ - while (penc < (1 << LP_BITS)) { - hpte_page_sizes[penc] = (ap << 4) | bp; - penc += 1 << shift; - } - } - } -} - -static void __init htab_init_page_sizes(void) -{ - init_hpte_page_sizes(); - - if (!debug_pagealloc_enabled()) { - /* - * Pick a size for the linear mapping. Currently, we only - * support 16M, 1M and 4K which is the default - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - mmu_linear_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - mmu_linear_psize = MMU_PAGE_1M; - } - -#ifdef CONFIG_PPC_64K_PAGES - /* - * Pick a size for the ordinary pages. Default is 4K, we support - * 64K for user mappings and vmalloc if supported by the processor. - * We only use 64k for ioremap if the processor - * (and firmware) support cache-inhibited large pages. - * If not, we use 4k and set mmu_ci_restrictions so that - * hash_page knows to switch processes that use cache-inhibited - * mappings to 4k pages. - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift) { - mmu_virtual_psize = MMU_PAGE_64K; - mmu_vmalloc_psize = MMU_PAGE_64K; - if (mmu_linear_psize == MMU_PAGE_4K) - mmu_linear_psize = MMU_PAGE_64K; - if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { - /* - * When running on pSeries using 64k pages for ioremap - * would stop us accessing the HEA ethernet. So if we - * have the chance of ever seeing one, stay at 4k. 
- */ - if (!might_have_hea()) - mmu_io_psize = MMU_PAGE_64K; - } else - mmu_ci_restrictions = 1; - } -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* We try to use 16M pages for vmemmap if that is supported - * and we have at least 1G of RAM at boot - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift && - memblock_phys_mem_size() >= 0x40000000) - mmu_vmemmap_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_64K].shift) - mmu_vmemmap_psize = MMU_PAGE_64K; - else - mmu_vmemmap_psize = MMU_PAGE_4K; -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - printk(KERN_DEBUG "Page orders: linear mapping = %d, " - "virtual = %d, io = %d" -#ifdef CONFIG_SPARSEMEM_VMEMMAP - ", vmemmap = %d" -#endif - "\n", - mmu_psize_defs[mmu_linear_psize].shift, - mmu_psize_defs[mmu_virtual_psize].shift, - mmu_psize_defs[mmu_io_psize].shift -#ifdef CONFIG_SPARSEMEM_VMEMMAP - ,mmu_psize_defs[mmu_vmemmap_psize].shift -#endif - ); -} - -static int __init htab_dt_scan_pftsize(unsigned long node, - const char *uname, int depth, - void *data) -{ - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const __be32 *prop; - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); - if (prop != NULL) { - /* pft_size[0] is the NUMA CEC cookie */ - ppc64_pft_size = be32_to_cpu(prop[1]); - return 1; - } - return 0; -} - -unsigned htab_shift_for_mem_size(unsigned long mem_size) -{ - unsigned memshift = __ilog2(mem_size); - unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift; - unsigned pteg_shift; - - /* round mem_size up to next power of 2 */ - if ((1UL << memshift) < mem_size) - memshift += 1; - - /* aim for 2 pages / pteg */ - pteg_shift = memshift - (pshift + 1); - - /* - * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab - * size permitted by the architecture. - */ - return max(pteg_shift + 7, 18U); -} - -static unsigned long __init htab_get_table_size(void) -{ - /* If hash size isn't already provided by the platform, we try to - * retrieve it from the device-tree. 
If it's not there either, we
- * calculate it now based on the total RAM size
- */
- if (ppc64_pft_size == 0)
- of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
- if (ppc64_pft_size)
- return 1UL << ppc64_pft_size;
-
- return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int resize_hpt_for_hotplug(unsigned long new_mem_size)
-{
- unsigned target_hpt_shift;
-
- if (!mmu_hash_ops.resize_hpt)
- return 0;
-
- target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
-
- /*
- * To avoid lots of HPT resizes if memory size is fluctuating
- * across a boundary, we deliberately have some hysteresis
- * here: we immediately increase the HPT size if the target
- * shift exceeds the current shift, but we won't attempt to
- * reduce unless the target shift is at least 2 below the
- * current shift
- */
- if (target_hpt_shift > ppc64_pft_size ||
- target_hpt_shift < ppc64_pft_size - 1)
- return mmu_hash_ops.resize_hpt(target_hpt_shift);
-
- return 0;
-}
-
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
- int rc;
-
- if (end >= H_VMALLOC_START) {
- pr_warn("Outside the supported range\n");
- return -1;
- }
-
- rc = htab_bolt_mapping(start, end, __pa(start),
- pgprot_val(PAGE_KERNEL), mmu_linear_psize,
- mmu_kernel_ssize);
-
- if (rc < 0) {
- int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
- mmu_kernel_ssize);
- BUG_ON(rc2 && (rc2 != -ENOENT));
- }
- return rc;
-}
-
-int hash__remove_section_mapping(unsigned long start, unsigned long end)
-{
- int rc = htab_remove_mapping(start, end, mmu_linear_psize,
- mmu_kernel_ssize);
- WARN_ON(rc < 0);
- return rc;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-static void __init hash_init_partition_table(phys_addr_t hash_table,
- unsigned long htab_size)
-{
- mmu_partition_table_init();
-
- /*
- * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
- * For now, UPRT is 0 and we have no segment table.
- */
- htab_size = __ilog2(htab_size) - 18;
- mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
- pr_info("Partition table %p\n", partition_tb);
-}
-
-static void __init htab_initialize(void)
-{
- unsigned long table;
- unsigned long pteg_count;
- unsigned long prot;
- unsigned long base = 0, size = 0;
- struct memblock_region *reg;
-
- DBG(" -> htab_initialize()\n");
-
- if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
- mmu_kernel_ssize = MMU_SEGSIZE_1T;
- mmu_highuser_ssize = MMU_SEGSIZE_1T;
- printk(KERN_INFO "Using 1TB segments\n");
- }
-
- /*
- * Calculate the required size of the htab. We want the number of
- * PTEGs to equal one half the number of real pages.
- */
- htab_size_bytes = htab_get_table_size();
- pteg_count = htab_size_bytes >> 7;
-
- htab_hash_mask = pteg_count - 1;
-
- if (firmware_has_feature(FW_FEATURE_LPAR) ||
- firmware_has_feature(FW_FEATURE_PS3_LV1)) {
- /* Using a hypervisor which owns the htab */
- htab_address = NULL;
- _SDR1 = 0;
- /*
- * On POWER9, we need to do an H_REGISTER_PROC_TBL hcall
- * to inform the hypervisor that we wish to use the HPT.
- */
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- register_process_table(0, 0, 0);
-#ifdef CONFIG_FA_DUMP
- /*
- * If firmware assisted dump is active, firmware preserves
- * the contents of htab along with entire partition memory.
- * Clear the htab if firmware assisted dump is active so
- * that we don't end up using old mappings.
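
For a feel for the sizing rule above (two pages per 128-byte PTEG, with an architectural floor of 2^18 bytes), here is a minimal user-space sketch of the same arithmetic. It is illustrative only: __builtin_clzl stands in for the kernel's __ilog2(), and the page shift is hard-coded where the kernel reads mmu_psize_defs[].

        #include <stdio.h>

        /* Mirrors htab_shift_for_mem_size(): aim for 2 pages per PTEG. */
        static unsigned htab_shift(unsigned long mem_size, unsigned pshift)
        {
                unsigned memshift = 63 - __builtin_clzl(mem_size); /* __ilog2() */
                unsigned pteg_shift;

                if ((1UL << memshift) < mem_size)   /* round up to a power of 2 */
                        memshift += 1;
                pteg_shift = memshift - (pshift + 1);   /* 2 pages / PTEG */
                /* each PTEG is 2^7 = 128 bytes; architectural minimum is 2^18 */
                return pteg_shift + 7 > 18 ? pteg_shift + 7 : 18;
        }

        int main(void)
        {
                /* 8 GiB of RAM, 64K base pages: 33 - 17 + 7 = 23 */
                printf("htab shift = %u\n", htab_shift(8UL << 30, 16));
                return 0;
        }

With 8 GiB of RAM and 64K base pages this yields a shift of 23, i.e. an 8 MiB hash table; these power-of-two boundaries are exactly what the hysteresis check above avoids bouncing across.
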
- */ - if (is_fadump_active() && mmu_hash_ops.hpte_clear_all) - mmu_hash_ops.hpte_clear_all(); -#endif - } else { - unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; - -#ifdef CONFIG_PPC_CELL - /* - * Cell may require the hash table down low when using the - * Axon IOMMU in order to fit the dynamic region over it, see - * comments in cell/iommu.c - */ - if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) { - limit = 0x80000000; - pr_info("Hash table forced below 2G for Axon IOMMU\n"); - } -#endif /* CONFIG_PPC_CELL */ - - table = memblock_phys_alloc_range(htab_size_bytes, - htab_size_bytes, - 0, limit); - if (!table) - panic("ERROR: Failed to allocate %pa bytes below %pa\n", - &htab_size_bytes, &limit); - - DBG("Hash table allocated at %lx, size: %lx\n", table, - htab_size_bytes); - - htab_address = __va(table); - - /* htab absolute addr + encoded htabsize */ - _SDR1 = table + __ilog2(htab_size_bytes) - 18; - - /* Initialize the HPT with no entries */ - memset((void *)table, 0, htab_size_bytes); - - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - /* Set SDR1 */ - mtspr(SPRN_SDR1, _SDR1); - else - hash_init_partition_table(table, htab_size_bytes); - } - - prot = pgprot_val(PAGE_KERNEL); - -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled()) { - linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; - linear_map_hash_slots = memblock_alloc_try_nid( - linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, - ppc64_rma_size, NUMA_NO_NODE); - if (!linear_map_hash_slots) - panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", - __func__, linear_map_hash_count, &ppc64_rma_size); - } -#endif /* CONFIG_DEBUG_PAGEALLOC */ - - /* create bolted the linear mapping in the hash table */ - for_each_memblock(memory, reg) { - base = (unsigned long)__va(reg->base); - size = reg->size; - - DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", - base, size, prot); - - if ((base + size) >= H_VMALLOC_START) { - pr_warn("Outside the supported range\n"); - continue; - } - - BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), - prot, mmu_linear_psize, mmu_kernel_ssize)); - } - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - - /* - * If we have a memory_limit and we've allocated TCEs then we need to - * explicitly map the TCE area at the top of RAM. We also cope with the - * case that the TCEs start below memory_limit. - * tce_alloc_start/end are 16MB aligned so the mapping should work - * for either 4K or 16MB pages. - */ - if (tce_alloc_start) { - tce_alloc_start = (unsigned long)__va(tce_alloc_start); - tce_alloc_end = (unsigned long)__va(tce_alloc_end); - - if (base + size >= tce_alloc_start) - tce_alloc_start = base + size + 1; - - BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, - __pa(tce_alloc_start), prot, - mmu_linear_psize, mmu_kernel_ssize)); - } - - - DBG(" <- htab_initialize()\n"); -} -#undef KB -#undef MB - -void __init hash__early_init_devtree(void) -{ - /* Initialize segment sizes */ - of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); - - /* Initialize page sizes */ - htab_scan_page_sizes(); -} - -struct hash_mm_context init_hash_mm_context; -void __init hash__early_init_mmu(void) -{ -#ifndef CONFIG_PPC_64K_PAGES - /* - * We have code in __hash_page_4K() and elsewhere, which assumes it can - * do the following: - * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); - * - * Where the slot number is between 0-15, and values of 8-15 indicate - * the secondary bucket. 
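
The packing trick quoted above relies only on shifts and masks. A standalone sketch with made-up bit positions (the real H_PAGE_F_* values differ) shows why a slot number of 8-15 automatically sets the secondary-bucket flag:

        #include <assert.h>
        #include <stdio.h>

        /* Hypothetical bit positions, for illustration only. */
        #define F_GIX_SHIFT     8
        #define F_GIX           (7UL << F_GIX_SHIFT)        /* 3-bit group index */
        #define F_SECOND        (1UL << (F_GIX_SHIFT + 3))  /* secondary bucket */

        int main(void)
        {
                unsigned long pte = 0;
                unsigned long slot = 11;        /* 8-15 => secondary bucket */

                /* pack: the same masking trick quoted in the comment above */
                pte |= (slot << F_GIX_SHIFT) & (F_SECOND | F_GIX);

                /* unpack */
                assert(((pte & F_GIX) >> F_GIX_SHIFT) == (slot & 7));
                assert((pte & F_SECOND) != 0);  /* bit 3 of the slot */
                printf("pte=0x%lx\n", pte);
                return 0;
        }

This only round-trips because the secondary flag sits directly above the 3-bit group index, which is exactly the layout the BUILD_BUG_ON() below asserts.
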
For that code to work H_PAGE_F_SECOND and
- * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
- * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
- * with a BUILD_BUG_ON().
- */
- BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3)));
-#endif /* CONFIG_PPC_64K_PAGES */
-
- htab_init_page_sizes();
-
- /*
- * initialize page table size
- */
- __pte_frag_nr = H_PTE_FRAG_NR;
- __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
- __pmd_frag_nr = H_PMD_FRAG_NR;
- __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
-
- __pte_index_size = H_PTE_INDEX_SIZE;
- __pmd_index_size = H_PMD_INDEX_SIZE;
- __pud_index_size = H_PUD_INDEX_SIZE;
- __pgd_index_size = H_PGD_INDEX_SIZE;
- __pud_cache_index = H_PUD_CACHE_INDEX;
- __pte_table_size = H_PTE_TABLE_SIZE;
- __pmd_table_size = H_PMD_TABLE_SIZE;
- __pud_table_size = H_PUD_TABLE_SIZE;
- __pgd_table_size = H_PGD_TABLE_SIZE;
- /*
- * 4k uses the hugepd format, so for hash set them to
- * zero
- */
- __pmd_val_bits = HASH_PMD_VAL_BITS;
- __pud_val_bits = HASH_PUD_VAL_BITS;
- __pgd_val_bits = HASH_PGD_VAL_BITS;
-
- __kernel_virt_start = H_KERN_VIRT_START;
- __vmalloc_start = H_VMALLOC_START;
- __vmalloc_end = H_VMALLOC_END;
- __kernel_io_start = H_KERN_IO_START;
- __kernel_io_end = H_KERN_IO_END;
- vmemmap = (struct page *)H_VMEMMAP_START;
- ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PCI
- pci_io_base = ISA_IO_BASE;
-#endif
-
- /* Select appropriate backend */
- if (firmware_has_feature(FW_FEATURE_PS3_LV1))
- ps3_early_mm_init();
- else if (firmware_has_feature(FW_FEATURE_LPAR))
- hpte_init_pseries();
- else if (IS_ENABLED(CONFIG_PPC_NATIVE))
- hpte_init_native();
-
- if (!mmu_hash_ops.hpte_insert)
- panic("hash__early_init_mmu: No MMU hash ops defined!\n");
-
- /* Initialize the MMU Hash table and create the linear mapping
- * of memory. Has to be done before SLB initialization as this is
- * currently where the page size encoding is obtained.
- */ - htab_initialize(); - - init_mm.context.hash_context = &init_hash_mm_context; - init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; - - pr_info("Initializing hash mmu with SLB\n"); - /* Initialize SLB management */ - slb_initialize(); - - if (cpu_has_feature(CPU_FTR_ARCH_206) - && cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -#ifdef CONFIG_SMP -void hash__early_init_mmu_secondary(void) -{ - /* Initialize hash table for that CPU */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - mtspr(SPRN_SDR1, _SDR1); - else - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - } - /* Initialize SLB */ - slb_initialize(); - - if (cpu_has_feature(CPU_FTR_ARCH_206) - && cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} -#endif /* CONFIG_SMP */ - -/* - * Called by asm hashtable.S for doing lazy icache flush - */ -unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) -{ - struct page *page; - - if (!pfn_valid(pte_pfn(pte))) - return pp; - - page = pte_page(pte); - - /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } else - pp |= HPTE_R_N; - } - return pp; -} - -#ifdef CONFIG_PPC_MM_SLICES -static unsigned int get_paca_psize(unsigned long addr) -{ - unsigned char *psizes; - unsigned long index, mask_index; - - if (addr < SLICE_LOW_TOP) { - psizes = get_paca()->mm_ctx_low_slices_psize; - index = GET_LOW_SLICE_INDEX(addr); - } else { - psizes = get_paca()->mm_ctx_high_slices_psize; - index = GET_HIGH_SLICE_INDEX(addr); - } - mask_index = index & 0x1; - return (psizes[index >> 1] >> (mask_index * 4)) & 0xF; -} - -#else -unsigned int get_paca_psize(unsigned long addr) -{ - return get_paca()->mm_ctx_user_psize; -} -#endif - -/* - * Demote a segment to using 4k pages. - * For now this makes the whole process use 4k pages. - */ -#ifdef CONFIG_PPC_64K_PAGES -void demote_segment_4k(struct mm_struct *mm, unsigned long addr) -{ - if (get_slice_psize(mm, addr) == MMU_PAGE_4K) - return; - slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); - copro_flush_all_slbs(mm); - if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { - - copy_mm_to_paca(mm); - slb_flush_and_restore_bolted(); - } -} -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_PPC_SUBPAGE_PROT -/* - * This looks up a 2-bit protection code for a 4k subpage of a 64k page. - * Userspace sets the subpage permissions using the subpage_prot system call. - * - * Result is 0: full permissions, _PAGE_RW: read-only, - * _PAGE_RWX: no access. - */ -static int subpage_protection(struct mm_struct *mm, unsigned long ea) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - u32 spp = 0; - u32 **sbpm, *sbpp; - - if (!spt) - return 0; - - if (ea >= spt->maxaddr) - return 0; - if (ea < 0x100000000UL) { - /* addresses below 4GB use spt->low_prot */ - sbpm = spt->low_prot; - } else { - sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; - if (!sbpm) - return 0; - } - sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; - if (!sbpp) - return 0; - spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; - - /* extract 2-bit bitfield for this 4k subpage */ - spp >>= 30 - 2 * ((ea >> 12) & 0xf); - - /* - * 0 -> full premission - * 1 -> Read only - * 2 -> no access. - * We return the flag that need to be cleared. - */ - spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? 
_PAGE_WRITE : 0); - return spp; -} - -#else /* CONFIG_PPC_SUBPAGE_PROT */ -static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) -{ - return 0; -} -#endif - -void hash_failure_debug(unsigned long ea, unsigned long access, - unsigned long vsid, unsigned long trap, - int ssize, int psize, int lpsize, unsigned long pte) -{ - if (!printk_ratelimit()) - return; - pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", - ea, access, current->comm); - pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", - trap, vsid, ssize, psize, lpsize, pte); -} - -static void check_paca_psize(unsigned long ea, struct mm_struct *mm, - int psize, bool user_region) -{ - if (user_region) { - if (psize != get_paca_psize(ea)) { - copy_mm_to_paca(mm); - slb_flush_and_restore_bolted(); - } - } else if (get_paca()->vmalloc_sllp != - mmu_psize_defs[mmu_vmalloc_psize].sllp) { - get_paca()->vmalloc_sllp = - mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_vmalloc_update(); - } -} - -/* Result code is: - * 0 - handled - * 1 - normal page fault - * -1 - critical hash insertion error - * -2 - access not permitted by subpage protection mechanism - */ -int hash_page_mm(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap, - unsigned long flags) -{ - bool is_thp; - enum ctx_state prev_state = exception_enter(); - pgd_t *pgdir; - unsigned long vsid; - pte_t *ptep; - unsigned hugeshift; - int rc, user_region = 0; - int psize, ssize; - - DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", - ea, access, trap); - trace_hash_fault(ea, access, trap); - - /* Get region & vsid */ - switch (get_region_id(ea)) { - case USER_REGION_ID: - user_region = 1; - if (! mm) { - DBG_LOW(" user region with no mm !\n"); - rc = 1; - goto bail; - } - psize = get_slice_psize(mm, ea); - ssize = user_segment_size(ea); - vsid = get_user_vsid(&mm->context, ea, ssize); - break; - case VMALLOC_REGION_ID: - vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - psize = mmu_vmalloc_psize; - ssize = mmu_kernel_ssize; - break; - - case IO_REGION_ID: - vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - psize = mmu_io_psize; - ssize = mmu_kernel_ssize; - break; - default: - /* Not a valid range - * Send the problem up to do_page_fault - */ - rc = 1; - goto bail; - } - DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); - - /* Bad address. */ - if (!vsid) { - DBG_LOW("Bad address!\n"); - rc = 1; - goto bail; - } - /* Get pgdir */ - pgdir = mm->pgd; - if (pgdir == NULL) { - rc = 1; - goto bail; - } - - /* Check CPU locality */ - if (user_region && mm_is_thread_local(mm)) - flags |= HPTE_LOCAL_UPDATE; - -#ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we might - * be hitting a special driver mapping, and need to align the - * address before we fetch the PTE. - * - * It could also be a hugepage mapping, in which case this is - * not necessary, but it's not harmful, either. 
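
As a toy illustration of the alignment performed just below: masking off the low page-size bits rounds the effective address down to the start of its page. The shift is hard-coded here (24, i.e. a 16M page) where the kernel would use mmu_psize_defs[psize].shift.

        #include <stdio.h>

        int main(void)
        {
                unsigned long ea = 0x12345678UL;
                unsigned int shift = 24;        /* 16M page */

                ea &= ~((1UL << shift) - 1);
                printf("0x%lx\n", ea);          /* prints 0x12000000 */
                return 0;
        }
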
- */ - if (psize != MMU_PAGE_4K) - ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get PTE and page size from page tables */ - ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); - if (ptep == NULL || !pte_present(*ptep)) { - DBG_LOW(" no PTE !\n"); - rc = 1; - goto bail; - } - - /* Add _PAGE_PRESENT to the required access perm */ - access |= _PAGE_PRESENT; - - /* Pre-check access permissions (will be re-checked atomically - * in __hash_page_XX but this pre-check is a fast path - */ - if (!check_pte_access(access, pte_val(*ptep))) { - DBG_LOW(" no access !\n"); - rc = 1; - goto bail; - } - - if (hugeshift) { - if (is_thp) - rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, - trap, flags, ssize, psize); -#ifdef CONFIG_HUGETLB_PAGE - else - rc = __hash_page_huge(ea, access, vsid, ptep, trap, - flags, ssize, hugeshift, psize); -#else - else { - /* - * if we have hugeshift, and is not transhuge with - * hugetlb disabled, something is really wrong. - */ - rc = 1; - WARN_ON(1); - } -#endif - if (current->mm == mm) - check_paca_psize(ea, mm, psize, user_region); - - goto bail; - } - -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif - /* Do actual hashing */ -#ifdef CONFIG_PPC_64K_PAGES - /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ - if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { - demote_segment_4k(mm, ea); - psize = MMU_PAGE_4K; - } - - /* If this PTE is non-cacheable and we have restrictions on - * using non cacheable large pages, then we switch to 4k - */ - if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { - if (user_region) { - demote_segment_4k(mm, ea); - psize = MMU_PAGE_4K; - } else if (ea < VMALLOC_END) { - /* - * some driver did a non-cacheable mapping - * in vmalloc space, so switch vmalloc - * to 4k pages - */ - printk(KERN_ALERT "Reducing vmalloc segment " - "to 4kB pages because of " - "non-cacheable mapping\n"); - psize = mmu_vmalloc_psize = MMU_PAGE_4K; - copro_flush_all_slbs(mm); - } - } - -#endif /* CONFIG_PPC_64K_PAGES */ - - if (current->mm == mm) - check_paca_psize(ea, mm, psize, user_region); - -#ifdef CONFIG_PPC_64K_PAGES - if (psize == MMU_PAGE_64K) - rc = __hash_page_64K(ea, access, vsid, ptep, trap, - flags, ssize); - else -#endif /* CONFIG_PPC_64K_PAGES */ - { - int spp = subpage_protection(mm, ea); - if (access & spp) - rc = -2; - else - rc = __hash_page_4K(ea, access, vsid, ptep, trap, - flags, ssize, spp); - } - - /* Dump some info in case of hash insertion failure, they should - * never happen so it is really useful to know if/when they do - */ - if (rc == -1) - hash_failure_debug(ea, access, vsid, trap, ssize, psize, - psize, pte_val(*ptep)); -#ifndef CONFIG_PPC_64K_PAGES - DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); -#else - DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), - pte_val(*(ptep + PTRS_PER_PTE))); -#endif - DBG_LOW(" -> rc=%d\n", rc); - -bail: - exception_exit(prev_state); - return rc; -} -EXPORT_SYMBOL_GPL(hash_page_mm); - -int hash_page(unsigned long ea, unsigned long access, unsigned long trap, - unsigned long dsisr) -{ - unsigned long flags = 0; - struct mm_struct *mm = current->mm; - - if ((get_region_id(ea) == VMALLOC_REGION_ID) || - (get_region_id(ea) == IO_REGION_ID)) - mm = &init_mm; - - if (dsisr & DSISR_NOHPTE) - flags |= HPTE_NOHPTE_UPDATE; - - return hash_page_mm(mm, ea, access, trap, flags); -} 
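
A hypothetical caller-side sketch of the result-code contract documented above (0 handled, 1 normal page fault, -1 critical hash insertion error, -2 subpage protection violation); the handlers are printf stubs, not the kernel's actual fault path:

        #include <stdio.h>

        static void handle(int rc)
        {
                switch (rc) {
                case 0:         /* HPTE inserted or updated: retry the access */
                        puts("handled");
                        break;
                case 1:         /* no usable Linux PTE: normal page fault */
                        puts("normal page fault");
                        break;
                case -2:        /* subpage protection violation -> SIGSEGV */
                        puts("SIGSEGV");
                        break;
                case -1:        /* critical hash insertion error -> SIGBUS */
                        puts("SIGBUS");
                        break;
                }
        }

        int main(void)
        {
                int rc;

                for (rc = -2; rc <= 1; rc++)
                        handle(rc);
                return 0;
        }
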
-EXPORT_SYMBOL_GPL(hash_page); - -int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, - unsigned long dsisr) -{ - unsigned long access = _PAGE_PRESENT | _PAGE_READ; - unsigned long flags = 0; - struct mm_struct *mm = current->mm; - unsigned int region_id = get_region_id(ea); - - if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) - mm = &init_mm; - - if (dsisr & DSISR_NOHPTE) - flags |= HPTE_NOHPTE_UPDATE; - - if (dsisr & DSISR_ISSTORE) - access |= _PAGE_WRITE; - /* - * We set _PAGE_PRIVILEGED only when - * kernel mode access kernel space. - * - * _PAGE_PRIVILEGED is NOT set - * 1) when kernel mode access user space - * 2) user space access kernel space. - */ - access |= _PAGE_PRIVILEGED; - if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) - access &= ~_PAGE_PRIVILEGED; - - if (trap == 0x400) - access |= _PAGE_EXEC; - - return hash_page_mm(mm, ea, access, trap, flags); -} - -#ifdef CONFIG_PPC_MM_SLICES -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - int psize = get_slice_psize(mm, ea); - - /* We only prefault standard pages for now */ - if (unlikely(psize != mm_ctx_user_psize(&mm->context))) - return false; - - /* - * Don't prefault if subpage protection is enabled for the EA. - */ - if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) - return false; - - return true; -} -#else -static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) -{ - return true; -} -#endif - -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) -{ - int hugepage_shift; - unsigned long vsid; - pgd_t *pgdir; - pte_t *ptep; - unsigned long flags; - int rc, ssize, update_flags = 0; - unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); - - BUG_ON(get_region_id(ea) != USER_REGION_ID); - - if (!should_hash_preload(mm, ea)) - return; - - DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," - " trap=%lx\n", mm, mm->pgd, ea, access, trap); - - /* Get Linux PTE if available */ - pgdir = mm->pgd; - if (pgdir == NULL) - return; - - /* Get VSID */ - ssize = user_segment_size(ea); - vsid = get_user_vsid(&mm->context, ea, ssize); - if (!vsid) - return; - /* - * Hash doesn't like irqs. Walking linux page table with irq disabled - * saves us from holding multiple locks. - */ - local_irq_save(flags); - - /* - * THP pages use update_mmu_cache_pmd. We don't do - * hash preload there. Hence can ignore THP here - */ - ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); - if (!ptep) - goto out_exit; - - WARN_ON(hugepage_shift); -#ifdef CONFIG_PPC_64K_PAGES - /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on - * a 64K kernel), then we don't preload, hash_page() will take - * care of it once we actually try to access the page. - * That way we don't have to duplicate all of the logic for segment - * page size demotion here - */ - if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) - goto out_exit; -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Is that local to this CPU ? 
*/ - if (mm_is_thread_local(mm)) - update_flags |= HPTE_LOCAL_UPDATE; - - /* Hash it in */ -#ifdef CONFIG_PPC_64K_PAGES - if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) - rc = __hash_page_64K(ea, access, vsid, ptep, trap, - update_flags, ssize); - else -#endif /* CONFIG_PPC_64K_PAGES */ - rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, - ssize, subpage_protection(mm, ea)); - - /* Dump some info in case of hash insertion failure, they should - * never happen so it is really useful to know if/when they do - */ - if (rc == -1) - hash_failure_debug(ea, access, vsid, trap, ssize, - mm_ctx_user_psize(&mm->context), - mm_ctx_user_psize(&mm->context), - pte_val(*ptep)); -out_exit: - local_irq_restore(flags); -} - -#ifdef CONFIG_PPC_MEM_KEYS -/* - * Return the protection key associated with the given address and the - * mm_struct. - */ -u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) -{ - pte_t *ptep; - u16 pkey = 0; - unsigned long flags; - - if (!mm || !mm->pgd) - return 0; - - local_irq_save(flags); - ptep = find_linux_pte(mm->pgd, address, NULL, NULL); - if (ptep) - pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); - local_irq_restore(flags); - - return pkey; -} -#endif /* CONFIG_PPC_MEM_KEYS */ - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -static inline void tm_flush_hash_page(int local) -{ - /* - * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a - * page back to a block device w/PIO could pick up transactional data - * (bad!) so we force an abort here. Before the sync the page will be - * made read-only, which will flush_hash_page. BIG ISSUE here: if the - * kernel uses a page from userspace without unmapping it first, it may - * see the speculated version. - */ - if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs && - MSR_TM_ACTIVE(current->thread.regs->msr)) { - tm_enable(); - tm_abort(TM_CAUSE_TLBI); - } -} -#else -static inline void tm_flush_hash_page(int local) -{ -} -#endif - -/* - * Return the global hash slot, corresponding to the given PTE, which contains - * the HPTE. - */ -unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, - int ssize, real_pte_t rpte, unsigned int subpg_index) -{ - unsigned long hash, gslot, hidx; - - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(rpte, subpg_index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - gslot += hidx & _PTEIDX_GROUP_IX; - return gslot; -} - -/* WARNING: This is called from hash_low_64.S, if you change this prototype, - * do not forget to update the assembly call site ! 
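
The gslot computation above can be exercised in isolation. In this sketch the _PTEIDX-style constants are illustrative stand-ins; the point is the shape of the calculation: invert the hash for the secondary bucket, scale by the group size, then add the index within the group.

        #include <stdio.h>

        #define HPTES_PER_GROUP 8
        #define SECONDARY       0x8     /* stand-in for _PTEIDX_SECONDARY */
        #define GROUP_IX        0x7     /* stand-in for _PTEIDX_GROUP_IX */

        int main(void)
        {
                /* mirror pte_get_hash_gslot() with toy numbers */
                unsigned long hash = 0x12345;
                unsigned long htab_hash_mask = 0xffff;  /* 2^16 PTEGs */
                unsigned long hidx = SECONDARY | 5;     /* slot 5, secondary */
                unsigned long gslot;

                if (hidx & SECONDARY)
                        hash = ~hash;
                gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                gslot += hidx & GROUP_IX;
                printf("gslot = 0x%lx\n", gslot);
                return 0;
        }
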
- */
-void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
- unsigned long flags)
-{
- unsigned long index, shift, gslot;
- int local = flags & HPTE_LOCAL_UPDATE;
-
- DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
- pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
- gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
- DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
- /*
- * We use same base page size and actual psize, because we don't
- * use these functions for hugepage
- */
- mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
- ssize, local);
- } pte_iterate_hashed_end();
-
- tm_flush_hash_page(local);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
- pmd_t *pmdp, unsigned int psize, int ssize,
- unsigned long flags)
-{
- int i, max_hpte_count, valid;
- unsigned long s_addr;
- unsigned char *hpte_slot_array;
- unsigned long hidx, shift, vpn, hash, slot;
- int local = flags & HPTE_LOCAL_UPDATE;
-
- s_addr = addr & HPAGE_PMD_MASK;
- hpte_slot_array = get_hpte_slot_array(pmdp);
- /*
- * If we try to do a HUGE PTE update after a withdraw is done,
- * we will find the below NULL. This happens when we do
- * split_huge_page_pmd
- */
- if (!hpte_slot_array)
- return;
-
- if (mmu_hash_ops.hugepage_invalidate) {
- mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
- psize, ssize, local);
- goto tm_abort;
- }
- /*
- * No bulk hpte removal support, invalidate each entry
- */
- shift = mmu_psize_defs[psize].shift;
- max_hpte_count = HPAGE_PMD_SIZE >> shift;
- for (i = 0; i < max_hpte_count; i++) {
- /*
- * 8 bits for each hpte entry
- * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
- */
- valid = hpte_valid(hpte_slot_array, i);
- if (!valid)
- continue;
- hidx = hpte_hash_index(hpte_slot_array, i);
-
- /* get the vpn */
- addr = s_addr + (i * (1ul << shift));
- vpn = hpt_vpn(addr, vsid, ssize);
- hash = hpt_hash(vpn, shift, ssize);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
-
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
- mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
- MMU_PAGE_16M, ssize, local);
- }
-tm_abort:
- tm_flush_hash_page(local);
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void flush_hash_range(unsigned long number, int local)
-{
- if (mmu_hash_ops.flush_hash_range)
- mmu_hash_ops.flush_hash_range(number, local);
- else {
- int i;
- struct ppc64_tlb_batch *batch =
- this_cpu_ptr(&ppc64_tlb_batch);
-
- for (i = 0; i < number; i++)
- flush_hash_page(batch->vpn[i], batch->pte[i],
- batch->psize, batch->ssize, local);
- }
-}
-
-/*
- * low_hash_fault is called when the low level hash code failed
- * to insert a PTE due to a hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
- enum ctx_state prev_state = exception_enter();
-
- if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
- if (rc == -2)
- _exception(SIGSEGV, regs, SEGV_ACCERR, address);
- else
-#endif
- _exception(SIGBUS, regs, BUS_ADRERR, address);
- } else
- bad_page_fault(regs, address, SIGBUS);
-
- exception_exit(prev_state);
-}
-
-long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
- unsigned long pa, unsigned long rflags,
- unsigned long vflags, int psize, int ssize)
-{
- unsigned long hpte_group;
- long slot;
-
-repeat:
- hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
- /* Insert into the hash table, primary slot */
- slot = mmu_hash_ops.hpte_insert(hpte_group,
vpn, pa, rflags, vflags, - psize, psize, ssize); - - /* Primary is full, try the secondary */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, - vflags | HPTE_V_SECONDARY, - psize, psize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - goto repeat; - } - } - - return slot; -} - -#ifdef CONFIG_DEBUG_PAGEALLOC -static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) -{ - unsigned long hash; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); - long ret; - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - - /* Don't create HPTE entries for bad address */ - if (!vsid) - return; - - ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, - HPTE_V_BOLTED, - mmu_linear_psize, mmu_kernel_ssize); - - BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); - BUG_ON(linear_map_hash_slots[lmi] & 0x80); - linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); -} - -static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) -{ - unsigned long hash, hidx, slot; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); - BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); - hidx = linear_map_hash_slots[lmi] & 0x7f; - linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, - mmu_linear_psize, - mmu_kernel_ssize, 0); -} - -void __kernel_map_pages(struct page *page, int numpages, int enable) -{ - unsigned long flags, vaddr, lmi; - int i; - - local_irq_save(flags); - for (i = 0; i < numpages; i++, page++) { - vaddr = (unsigned long)page_address(page); - lmi = __pa(vaddr) >> PAGE_SHIFT; - if (lmi >= linear_map_hash_count) - continue; - if (enable) - kernel_map_linear_page(vaddr, lmi); - else - kernel_unmap_linear_page(vaddr, lmi); - } - local_irq_restore(flags); -} -#endif /* CONFIG_DEBUG_PAGEALLOC */ - -void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * On virtualized systems the first entry is our RMA region aka VRMA, - * non-virtualized 64-bit hash MMU systems don't have a limitation - * on real mode access. - * - * For guests on platforms before POWER9, we clamp the it limit to 1G - * to avoid some funky things such as RTAS bugs etc... 
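
The DEBUG_PAGEALLOC bookkeeping above packs one byte per linear-map page: bit 7 (0x80) marks the entry valid and the low seven bits store the hash slot returned by the insert. A minimal standalone sketch of that encoding and its inverse:

        #include <assert.h>

        int main(void)
        {
                unsigned char slots[16] = { 0 };
                unsigned long lmi = 3, hidx;

                slots[lmi] = 5 | 0x80;          /* map: record slot 5 as valid */
                assert(slots[lmi] & 0x80);      /* unmap: entry must be valid */
                hidx = slots[lmi] & 0x7f;       /* recover the slot index */
                slots[lmi] = 0;                 /* mark invalid again */
                assert(hidx == 5);
                return 0;
        }
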
- */ - if (!early_cpu_has_feature(CPU_FTR_HVMODE)) { - ppc64_rma_size = first_memblock_size; - if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) - ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(ppc64_rma_size); - } else { - ppc64_rma_size = ULONG_MAX; - } -} - -#ifdef CONFIG_DEBUG_FS - -static int hpt_order_get(void *data, u64 *val) -{ - *val = ppc64_pft_size; - return 0; -} - -static int hpt_order_set(void *data, u64 val) -{ - if (!mmu_hash_ops.resize_hpt) - return -ENODEV; - - return mmu_hash_ops.resize_hpt(val); -} - -DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); - -static int __init hash64_debugfs(void) -{ - if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, - NULL, &fops_hpt_order)) { - pr_err("lpar: unable to create hpt_order debugsfs file\n"); - } - - return 0; -} -machine_device_initcall(pseries, hash64_debugfs); -#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c deleted file mode 100644 index dfbc3b32f09b..000000000000 --- a/arch/powerpc/mm/hugepage-hash64.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -/* - * PPC64 THP Support for hash based MMUs - */ -#include -#include - -int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, - pmd_t *pmdp, unsigned long trap, unsigned long flags, - int ssize, unsigned int psize) -{ - unsigned int index, valid; - unsigned char *hpte_slot_array; - unsigned long rflags, pa, hidx; - unsigned long old_pmd, new_pmd; - int ret, lpsize = MMU_PAGE_16M; - unsigned long vpn, hash, shift, slot; - - /* - * atomically mark the linux large page PMD busy and dirty - */ - do { - pmd_t pmd = READ_ONCE(*pmdp); - - old_pmd = pmd_val(pmd); - /* If PMD busy, retry the access */ - if (unlikely(old_pmd & H_PAGE_BUSY)) - return 0; - /* If PMD permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pmd))) - return 1; - /* - * Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access - */ - new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pmd |= _PAGE_DIRTY; - } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); - - /* - * Make sure this is thp or devmap entry - */ - if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) - return 0; - - rflags = htab_convert_pte_flags(new_pmd); - -#if 0 - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { - - /* - * No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case - */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - } -#endif - /* - * Find the slot index details for this ea, using base page size. 
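
The busy/dirty marking loop above is a classic lock-free read-modify-write. A standalone sketch using a C11 compare-exchange in place of pmd_xchg(), with toy flag values rather than the kernel's H_PAGE_* bits:

        #include <stdatomic.h>
        #include <stdio.h>

        #define BUSY     0x1
        #define ACCESSED 0x2
        #define DIRTY    0x4

        static int lock_entry(_Atomic unsigned long *pte, int write)
        {
                unsigned long old, new;

                do {
                        old = atomic_load(pte);
                        if (old & BUSY)         /* someone else owns it */
                                return 0;       /* caller retries the access */
                        new = old | BUSY | ACCESSED;
                        if (write)
                                new |= DIRTY;
                } while (!atomic_compare_exchange_weak(pte, &old, new));
                return 1;
        }

        int main(void)
        {
                _Atomic unsigned long pte = 0;

                printf("locked=%d pte=0x%lx\n", lock_entry(&pte, 1),
                       atomic_load(&pte));
                return 0;
        }

Bailing out when the busy bit is already set, rather than spinning, matches the code above: the faulting access is simply retried and will eventually find the entry unlocked.
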
- */ - shift = mmu_psize_defs[psize].shift; - index = (ea & ~HPAGE_PMD_MASK) >> shift; - BUG_ON(index >= PTE_FRAG_SIZE); - - vpn = hpt_vpn(ea, vsid, ssize); - hpte_slot_array = get_hpte_slot_array(pmdp); - if (psize == MMU_PAGE_4K) { - /* - * invalidate the old hpte entry if we have that mapped via 64K - * base page size. This is because demote_segment won't flush - * hash page table entries. - */ - if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { - flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, - ssize, flags); - /* - * With THP, we also clear the slot information with - * respect to all the 64K hash pte mapping the 16MB - * page. They are all invalid now. This make sure we - * don't find the slot valid when we fault with 4k - * base page size. - * - */ - memset(hpte_slot_array, 0, PTE_FRAG_SIZE); - } - } - - valid = hpte_valid(hpte_slot_array, index); - if (valid) { - /* update the hpte bits */ - hash = hpt_hash(vpn, shift, ssize); - hidx = hpte_hash_index(hpte_slot_array, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - - ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, - psize, lpsize, ssize, flags); - /* - * We failed to update, try to insert a new entry. - */ - if (ret == -1) { - /* - * large pte is marked busy, so we can be sure - * nobody is looking at hpte_slot_array. hence we can - * safely update this here. - */ - valid = 0; - hpte_slot_array[index] = 0; - } - } - - if (!valid) { - unsigned long hpte_group; - - hash = hpt_hash(vpn, shift, ssize); - /* insert new entry */ - pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; - new_pmd |= H_PAGE_HASHPTE; - -repeat: - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - - /* Insert into the hash table, primary slot */ - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0, - psize, lpsize, ssize); - /* - * Primary is full, try the secondary - */ - if (unlikely(slot == -1)) { - hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, - rflags, - HPTE_V_SECONDARY, - psize, lpsize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = (hash & htab_hash_mask) * - HPTES_PER_GROUP; - - mmu_hash_ops.hpte_remove(hpte_group); - goto repeat; - } - } - /* - * Hypervisor failure. Restore old pmd and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *pmdp = __pmd(old_pmd); - hash_failure_debug(ea, access, vsid, trap, ssize, - psize, lpsize, old_pmd); - return -1; - } - /* - * large pte is marked busy, so we can be sure - * nobody is looking at hpte_slot_array. hence we can - * safely update this here. - */ - mark_hpte_slot_valid(hpte_slot_array, index, slot); - } - /* - * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with - * base page size 4k. - */ - if (psize == MMU_PAGE_4K) - new_pmd |= H_PAGE_COMBO; - /* - * The hpte valid is stored in the pgtable whose address is in the - * second half of the PMD. Order this against clearing of the busy bit in - * huge pmd. - */ - smp_wmb(); - *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); - return 0; -} diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c deleted file mode 100644 index b0d9209d9a86..000000000000 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) - * - * Copyright (C) 2003 David Gibson, IBM Corporation. 
- * - * Based on the IA-32 version: - * Copyright (C) 2002, Rohit Seth - */ - -#include -#include -#include -#include -#include -#include - -extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, - unsigned long pa, unsigned long rlags, - unsigned long vflags, int psize, int ssize); - -int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, - pte_t *ptep, unsigned long trap, unsigned long flags, - int ssize, unsigned int shift, unsigned int mmu_psize) -{ - real_pte_t rpte; - unsigned long vpn; - unsigned long old_pte, new_pte; - unsigned long rflags, pa; - long slot, offset; - - BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); - - /* Search the Linux page table for a match with va */ - vpn = hpt_vpn(ea, vsid, ssize); - - /* At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. - */ - - - do { - old_pte = pte_val(*ptep); - /* If PTE busy, retry the access */ - if (unlikely(old_pte & H_PAGE_BUSY)) - return 0; - /* If PTE permissions don't match, take page fault */ - if (unlikely(!check_pte_access(access, old_pte))) - return 1; - - /* Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access */ - new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_WRITE) - new_pte |= _PAGE_DIRTY; - } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); - - /* Make sure this is a hugetlb entry */ - if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) - return 0; - - rflags = htab_convert_pte_flags(new_pte); - if (unlikely(mmu_psize == MMU_PAGE_16G)) - offset = PTRS_PER_PUD; - else - offset = PTRS_PER_PMD; - rpte = __real_pte(__pte(old_pte), ptep, offset); - - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ - rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & H_PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long gslot; - - gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0); - if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize, - mmu_psize, ssize, flags) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & H_PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(vpn, shift, ssize); - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - - /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; - - slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, - mmu_psize, ssize); - - /* - * Hypervisor failure. 
Restore old pte and return -1 - * similar to __hash_page_* - */ - if (unlikely(slot == -2)) { - *ptep = __pte(old_pte); - hash_failure_debug(ea, access, vsid, trap, ssize, - mmu_psize, mmu_psize, old_pte); - return -1; - } - - new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset); - } - - /* - * No need to use ldarx/stdcx here - */ - *ptep = __pte(new_pte & ~H_PAGE_BUSY); - return 0; -} - -pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - unsigned long pte_val; - /* - * Clear the _PAGE_PRESENT so that no hardware parallel update is - * possible. Also keep the pte_present true so that we don't take - * wrong fault. - */ - pte_val = pte_update(vma->vm_mm, addr, ptep, - _PAGE_PRESENT, _PAGE_INVALID, 1); - - return __pte(pte_val); -} - -void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t old_pte, pte_t pte) -{ - - if (radix_enabled()) - return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, - old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); -} diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c deleted file mode 100644 index cab06331c0c0..000000000000 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); -} - -void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize); -} - -void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - int psize; - struct hstate *hstate = hstate_file(vma->vm_file); - - psize = hstate_get_psize(hstate); - radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); -} - -/* - * A vairant of hugetlb_get_unmapped_area doing topdown search - * FIXME!! should we do as x86 does or non hugetlb area does ? - * ie, use topdown or not based on mmap_is_legacy check ? - */ -unsigned long -radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct hstate *h = hstate_file(file); - int fixed = (flags & MAP_FIXED); - unsigned long high_limit; - struct vm_unmapped_area_info info; - - high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit || (fixed && (addr + len > high_limit))) - high_limit = TASK_SIZE; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > high_limit) - return -ENOMEM; - - if (fixed) { - if (addr > high_limit - len) - return -ENOMEM; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (high_limit - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - /* - * We are always doing an topdown search here. Slice code - * does that too. 
- */ - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - - return vm_unmapped_area(&info); -} - -void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t old_pte, pte_t pte) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. - */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) - radix__flush_hugetlb_page(vma, addr); - - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); -} diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c deleted file mode 100644 index cb2b08635508..000000000000 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * MMU context allocation for 64-bit kernels. - * - * Copyright (C) 2004 Anton Blanchard, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -static DEFINE_IDA(mmu_context_ida); - -static int alloc_context_id(int min_id, int max_id) -{ - return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); -} - -void hash__reserve_context_id(int id) -{ - int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); - - WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); -} - -int hash__alloc_context_id(void) -{ - unsigned long max; - - if (mmu_has_feature(MMU_FTR_68_BIT_VA)) - max = MAX_USER_CONTEXT; - else - max = MAX_USER_CONTEXT_65BIT_VA; - - return alloc_context_id(MIN_USER_CONTEXT, max); -} -EXPORT_SYMBOL_GPL(hash__alloc_context_id); - -void slb_setup_new_exec(void); - -static int hash__init_new_context(struct mm_struct *mm) -{ - int index; - - index = hash__alloc_context_id(); - if (index < 0) - return index; - - mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), - GFP_KERNEL); - if (!mm->context.hash_context) { - ida_free(&mmu_context_ida, index); - return -ENOMEM; - } - - /* - * The old code would re-promote on fork, we don't do that when using - * slices as it could cause problem promoting slices that have been - * forced down to 4K. - * - * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we properly - * initialize context slice details for newly allocated mm's (which will - * have id == 0) and don't alter context slice inherited via fork (which - * will have id != 0). - * - * We should not be calling init_new_context() on init_mm. Hence a - * check against 0 is OK. - */ - if (mm->context.id == 0) { - memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); - slice_init_new_context_exec(mm); - } else { - /* This is fork. Copy hash_context details from current->mm */ - memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); -#ifdef CONFIG_PPC_SUBPAGE_PROT - /* inherit subpage prot detalis if we have one. 
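
The error path above frees in reverse order of allocation: the context id first, then the context itself. A reduced sketch of that unwinding pattern, with plain malloc()/free() standing in for ida_alloc_range()/kmalloc():

        #include <stdlib.h>

        struct ctx { void *spt; };

        static int init_ctx(struct ctx **out)
        {
                struct ctx *c = malloc(sizeof(*c));

                if (!c)
                        return -1;
                c->spt = malloc(64);    /* optional sub-structure */
                if (!c->spt) {
                        free(c);        /* unwind: last allocation first */
                        return -1;
                }
                *out = c;
                return 0;
        }

        int main(void)
        {
                struct ctx *c = NULL;

                if (init_ctx(&c))
                        return 1;
                free(c->spt);
                free(c);
                return 0;
        }
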
*/ - if (current->mm->context.hash_context->spt) { - mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), - GFP_KERNEL); - if (!mm->context.hash_context->spt) { - ida_free(&mmu_context_ida, index); - kfree(mm->context.hash_context); - return -ENOMEM; - } - } -#endif - - } - - pkey_mm_init(mm); - return index; -} - -void hash__setup_new_exec(void) -{ - slice_setup_new_exec(); - - slb_setup_new_exec(); -} - -static int radix__init_new_context(struct mm_struct *mm) -{ - unsigned long rts_field; - int index, max_id; - - max_id = (1 << mmu_pid_bits) - 1; - index = alloc_context_id(mmu_base_pid, max_id); - if (index < 0) - return index; - - /* - * set the process table entry, - */ - rts_field = radix__get_tree_size(); - process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); - - /* - * Order the above store with subsequent update of the PID - * register (at which point HW can start loading/caching - * the entry) and the corresponding load by the MMU from - * the L2 cache. - */ - asm volatile("ptesync;isync" : : : "memory"); - - mm->context.npu_context = NULL; - mm->context.hash_context = NULL; - - return index; -} - -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - int index; - - if (radix_enabled()) - index = radix__init_new_context(mm); - else - index = hash__init_new_context(mm); - - if (index < 0) - return index; - - mm->context.id = index; - - mm->context.pte_frag = NULL; - mm->context.pmd_frag = NULL; -#ifdef CONFIG_SPAPR_TCE_IOMMU - mm_iommu_init(mm); -#endif - atomic_set(&mm->context.active_cpus, 0); - atomic_set(&mm->context.copros, 0); - - return 0; -} - -void __destroy_context(int context_id) -{ - ida_free(&mmu_context_ida, context_id); -} -EXPORT_SYMBOL_GPL(__destroy_context); - -static void destroy_contexts(mm_context_t *ctx) -{ - int index, context_id; - - for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { - context_id = ctx->extended_id[index]; - if (context_id) - ida_free(&mmu_context_ida, context_id); - } - kfree(ctx->hash_context); -} - -static void pmd_frag_destroy(void *pmd_frag) -{ - int count; - struct page *page; - - page = virt_to_page(pmd_frag); - /* drop all the pending references */ - count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; - /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); - } -} - -static void destroy_pagetable_cache(struct mm_struct *mm) -{ - void *frag; - - frag = mm->context.pte_frag; - if (frag) - pte_frag_destroy(frag); - - frag = mm->context.pmd_frag; - if (frag) - pmd_frag_destroy(frag); - return; -} - -void destroy_context(struct mm_struct *mm) -{ -#ifdef CONFIG_SPAPR_TCE_IOMMU - WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); -#endif - if (radix_enabled()) - WARN_ON(process_tb[mm->context.id].prtb0 != 0); - else - subpage_prot_free(mm); - destroy_contexts(&mm->context); - mm->context.id = MMU_NO_CONTEXT; -} - -void arch_exit_mmap(struct mm_struct *mm) -{ - destroy_pagetable_cache(mm); - - if (radix_enabled()) { - /* - * Radix doesn't have a valid bit in the process table - * entries. However we know that at least P9 implementation - * will avoid caching an entry with an invalid RTS field, - * and 0 is invalid. So this will do. - * - * This runs before the "fullmm" tlb flush in exit_mmap, - * which does a RIC=2 tlbie to clear the process table - * entry. See the "fullmm" comments in tlb-radix.c. 
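
The process-table entry written above is an OR of three fields occupying disjoint bit ranges: the tree-size (RTS) encoding, the physical address of the page-aligned PGD, and the PGD index size in the low bits. A sketch with toy values (this does not reproduce the architected bit layout):

        #include <stdio.h>

        int main(void)
        {
                unsigned long rts_field = 0x3UL << 58;  /* pretend RTS bits */
                unsigned long pgd_pa = 0x1f0000UL;      /* page-aligned __pa(pgd) */
                unsigned long index_sz = 0xd;           /* low bits: table size */
                unsigned long prtb0 = rts_field | pgd_pa | index_sz;

                printf("prtb0 = 0x%016lx\n", prtb0);
                return 0;
        }
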
- * - * No barrier required here after the store because - * this process will do the invalidate, which starts with - * ptesync. - */ - process_tb[mm->context.id].prtb0 = 0; - } -} - -#ifdef CONFIG_PPC_RADIX_MMU -void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) -{ - mtspr(SPRN_PID, next->context.id); - isync(); -} -#endif diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c deleted file mode 100644 index e7a9c4f6bfca..000000000000 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ /dev/null @@ -1,482 +0,0 @@ -/* - * IOMMU helpers in MMU context. - * - * Copyright (C) 2015 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_MUTEX(mem_list_mutex); - -#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 -#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) - -struct mm_iommu_table_group_mem_t { - struct list_head next; - struct rcu_head rcu; - unsigned long used; - atomic64_t mapped; - unsigned int pageshift; - u64 ua; /* userspace address */ - u64 entries; /* number of entries in hpas/hpages[] */ - /* - * in mm_iommu_get we temporarily use this to store - * struct page address. - * - * We need to convert ua to hpa in real mode. Make it - * simpler by storing physical address. - */ - union { - struct page **hpages; /* vmalloc'ed */ - phys_addr_t *hpas; - }; -#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) - u64 dev_hpa; /* Device memory base address */ -}; - -static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, - unsigned long npages, bool incr) -{ - long ret = 0, locked, lock_limit; - - if (!npages) - return 0; - - down_write(&mm->mmap_sem); - - if (incr) { - locked = mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - mm->locked_vm += npages; - } else { - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; - } - - pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", - current ? current->pid : 0, - incr ? '+' : '-', - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); - - return ret; -} - -bool mm_iommu_preregistered(struct mm_struct *mm) -{ - return !list_empty(&mm->context.iommu_group_mem_list); -} -EXPORT_SYMBOL_GPL(mm_iommu_preregistered); - -static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, - unsigned long entries, unsigned long dev_hpa, - struct mm_iommu_table_group_mem_t **pmem) -{ - struct mm_iommu_table_group_mem_t *mem; - long i, ret, locked_entries = 0; - unsigned int pageshift; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, - next) { - /* Overlap? 
*/ - if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && - (ua < (mem->ua + - (mem->entries << PAGE_SHIFT)))) { - ret = -EINVAL; - goto unlock_exit; - } - - } - - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { - ret = mm_iommu_adjust_locked_vm(mm, entries, true); - if (ret) - goto unlock_exit; - - locked_entries = entries; - } - - mem = kzalloc(sizeof(*mem), GFP_KERNEL); - if (!mem) { - ret = -ENOMEM; - goto unlock_exit; - } - - if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) { - mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT)); - mem->dev_hpa = dev_hpa; - goto good_exit; - } - mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA; - - /* - * For a starting point for a maximum page size calculation - * we use @ua and @entries natural alignment to allow IOMMU pages - * smaller than huge pages but still bigger than PAGE_SIZE. - */ - mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)); - mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0]))); - if (!mem->hpas) { - kfree(mem); - ret = -ENOMEM; - goto unlock_exit; - } - - down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); - up_read(&mm->mmap_sem); - if (ret != entries) { - /* free the reference taken */ - for (i = 0; i < ret; i++) - put_page(mem->hpages[i]); - - vfree(mem->hpas); - kfree(mem); - ret = -EFAULT; - goto unlock_exit; - } - - pageshift = PAGE_SHIFT; - for (i = 0; i < entries; ++i) { - struct page *page = mem->hpages[i]; - - /* - * Allow to use larger than 64k IOMMU pages. Only do that - * if we are backed by hugetlb. - */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } - mem->pageshift = min(mem->pageshift, pageshift); - /* - * We don't need struct page reference any more, switch - * to physical address. 
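The pageshift initialisation above bounds the usable IOMMU page size by the natural alignment of the region. A small standalone model of that bound, assuming 64K base pages and using the GCC/Clang __builtin_ctzll builtin (input must be non-zero):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 16   /* 64K base pages; illustrative */

    /* The largest page shift a region can use is the lowest set bit
     * of (start | size): anything bigger would be misaligned. This
     * mirrors mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)). */
    static unsigned int max_pageshift(uint64_t ua, uint64_t entries)
    {
        return (unsigned int)__builtin_ctzll(ua | (entries << PAGE_SHIFT));
    }

    int main(void)
    {
        /* 16 entries (1MB) at a 16MB-aligned address: capped at 2^20 */
        printf("max shift = %u\n", max_pageshift(16ULL << 20, 16));
        return 0;
    }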
- */ - mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; - } - -good_exit: - ret = 0; - atomic64_set(&mem->mapped, 1); - mem->used = 1; - mem->ua = ua; - mem->entries = entries; - *pmem = mem; - - list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); - -unlock_exit: - if (locked_entries && ret) - mm_iommu_adjust_locked_vm(mm, locked_entries, false); - - mutex_unlock(&mem_list_mutex); - - return ret; -} - -long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, - struct mm_iommu_table_group_mem_t **pmem) -{ - return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA, - pmem); -} -EXPORT_SYMBOL_GPL(mm_iommu_new); - -long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, - unsigned long entries, unsigned long dev_hpa, - struct mm_iommu_table_group_mem_t **pmem) -{ - return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem); -} -EXPORT_SYMBOL_GPL(mm_iommu_newdev); - -static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) -{ - long i; - struct page *page = NULL; - - if (!mem->hpas) - return; - - for (i = 0; i < mem->entries; ++i) { - if (!mem->hpas[i]) - continue; - - page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); - if (!page) - continue; - - if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) - SetPageDirty(page); - - put_page(page); - mem->hpas[i] = 0; - } -} - -static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) -{ - - mm_iommu_unpin(mem); - vfree(mem->hpas); - kfree(mem); -} - -static void mm_iommu_free(struct rcu_head *head) -{ - struct mm_iommu_table_group_mem_t *mem = container_of(head, - struct mm_iommu_table_group_mem_t, rcu); - - mm_iommu_do_free(mem); -} - -static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) -{ - list_del_rcu(&mem->next); - call_rcu(&mem->rcu, mm_iommu_free); -} - -long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) -{ - long ret = 0; - unsigned long entries, dev_hpa; - - mutex_lock(&mem_list_mutex); - - if (mem->used == 0) { - ret = -ENOENT; - goto unlock_exit; - } - - --mem->used; - /* There are still users, exit */ - if (mem->used) - goto unlock_exit; - - /* Are there still mappings? 
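The hpas[] entries above hold 4K-aligned physical addresses, which leaves the low bits free for flags such as MM_IOMMU_TABLE_GROUP_PAGE_DIRTY. A minimal model of that tagging scheme:

    #include <stdint.h>
    #include <stdio.h>

    /* The stored address is at least 4K aligned, so bit 0 is free to
     * record "this page was written to". */
    #define PAGE_DIRTY 0x1ULL
    #define ADDR_MASK  (~(4096ULL - 1))

    int main(void)
    {
        uint64_t entry = 0x2a000;      /* aligned physical address */

        entry |= PAGE_DIRTY;           /* mark dirty in place */
        printf("hpa = 0x%llx, dirty = %d\n",
               (unsigned long long)(entry & ADDR_MASK),
               (int)(entry & PAGE_DIRTY));
        return 0;
    }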
*/ - if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { - ++mem->used; - ret = -EBUSY; - goto unlock_exit; - } - - /* @mapped became 0 so now mappings are disabled, release the region */ - entries = mem->entries; - dev_hpa = mem->dev_hpa; - mm_iommu_release(mem); - - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - mm_iommu_adjust_locked_vm(mm, entries, false); - -unlock_exit: - mutex_unlock(&mem_list_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_put); - -struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_lookup); - -struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, - unsigned long ua, unsigned long size) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, - next) { - if ((mem->ua <= ua) && - (ua + size <= mem->ua + - (mem->entries << PAGE_SHIFT))) { - ret = mem; - break; - } - } - - return ret; -} - -struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, - unsigned long ua, unsigned long entries) -{ - struct mm_iommu_table_group_mem_t *mem, *ret = NULL; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { - if ((mem->ua == ua) && (mem->entries == entries)) { - ret = mem; - ++mem->used; - break; - } - } - - mutex_unlock(&mem_list_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(mm_iommu_get); - -long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - u64 *va; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - va = &mem->hpas[entry]; - *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} -EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); - -long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned int pageshift, unsigned long *hpa) -{ - const long entry = (ua - mem->ua) >> PAGE_SHIFT; - unsigned long *pa; - - if (entry >= mem->entries) - return -EFAULT; - - if (pageshift > mem->pageshift) - return -EFAULT; - - if (!mem->hpas) { - *hpa = mem->dev_hpa + (ua - mem->ua); - return 0; - } - - pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); - if (!pa) - return -EFAULT; - - *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); - - return 0; -} - -extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) -{ - struct mm_iommu_table_group_mem_t *mem; - long entry; - void *va; - unsigned long *pa; - - mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); - if (!mem) - return; - - if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) - return; - - entry = (ua - mem->ua) >> PAGE_SHIFT; - va = &mem->hpas[entry]; - - pa = (void *) vmalloc_to_phys(va); - if (!pa) - return; - - *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; -} - -bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, - unsigned int pageshift, unsigned long *size) -{ - struct mm_iommu_table_group_mem_t *mem; - unsigned long end; - - list_for_each_entry_rcu(mem, 
&mm->context.iommu_group_mem_list, next) {
- if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
- continue;
-
- end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
- if ((mem->dev_hpa <= hpa) && (hpa < end)) {
- /*
- * Since the IOMMU page size might be bigger than
- * PAGE_SIZE, the amount of preregistered memory
- * starting from @hpa might be smaller than 1<<pageshift
- * and the caller needs to distinguish this situation.
- */
- *size = min(1UL << pageshift, end - hpa);
- return true;
- }
- }
-
- return false;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
-
-long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
-{
- if (atomic64_inc_not_zero(&mem->mapped))
- return 0;
-
- /* Last mm_iommu_put() has been called, no more mappings allowed() */
- return -ENXIO;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
-
-void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
-{
- atomic64_add_unless(&mem->mapped, -1, 1);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
-
-void mm_iommu_init(struct mm_struct *mm)
-{
- INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
-}
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 6ef36d553cde..57e64273cb33 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1068,7 +1068,7 @@ u64 memory_hotplug_max(void)
 
 /* Virtual Processor Home Node (VPHN) support */
 #ifdef CONFIG_PPC_SPLPAR
-#include "vphn.h"
+#include "book3s64/vphn.h"
 
 struct topology_update_data {
 struct topology_update_data *next;
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
deleted file mode 100644
index 16bda049187a..000000000000
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-unsigned long __pmd_frag_nr;
-EXPORT_SYMBOL(__pmd_frag_nr);
-unsigned long __pmd_frag_size_shift;
-EXPORT_SYMBOL(__pmd_frag_size_shift);
-
-int (*register_process_table)(unsigned long base, unsigned long page_size,
- unsigned long tbl_size);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * This is called when relaxing access to a hugepage. It's also called in the page
- * fault path when we don't hit any of the major fault cases, ie, a minor
- * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
- * handled those two for us, we additionally deal with missing execute
- * permission here on some processors
- */
-int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp, pmd_t entry, int dirty)
-{
- int changed;
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
-#endif
- changed = !pmd_same(*(pmdp), entry);
- if (changed) {
- /*
- * We can use MMU_PAGE_2M here, because only radix
- * path look at the psize.
- */
- __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
- pmd_pte(entry), address, MMU_PAGE_2M);
- }
- return changed;
-}
-
-int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-/*
- * set a new huge pmd. We should not be called for updating
- * an existing pmd entry. That should go via pmd_hugepage_update.
- */
-void set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd)
-{
-#ifdef CONFIG_DEBUG_VM
- /*
- * Make sure hardware valid bit is not set. We don't do
- * tlb flush for this update.
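The mem->mapped counter in the mm_iommu functions above follows a small refcount protocol: the count starts at 1 on registration, mappers take references with inc-not-zero, drop them with add-unless(-1, 1) so the count never falls below 1, and teardown succeeds only via a one-shot cmpxchg(1 -> 0). A C11 userspace model of the same protocol:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static _Atomic long mapped = 1;     /* 1 == registered, unmapped */

    static bool mapped_inc(void)        /* atomic64_inc_not_zero() */
    {
        long v = atomic_load(&mapped);

        while (v != 0)
            if (atomic_compare_exchange_weak(&mapped, &v, v + 1))
                return true;
        return false;
    }

    static void mapped_dec(void)        /* atomic64_add_unless(-1, 1) */
    {
        long v = atomic_load(&mapped);

        while (v != 1)
            if (atomic_compare_exchange_weak(&mapped, &v, v - 1))
                return;
    }

    static bool try_release(void)       /* atomic_cmpxchg(1, 0) == 1 */
    {
        long expect = 1;

        return atomic_compare_exchange_strong(&mapped, &expect, 0);
    }

    int main(void)
    {
        mapped_inc();                                        /* map */
        printf("release while mapped: %d\n", try_release()); /* 0 */
        mapped_dec();                                        /* unmap */
        printf("release when idle:    %d\n", try_release()); /* 1 */
        return 0;
    }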
- */ - - WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); - assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); -#endif - trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); -} - -static void do_nothing(void *unused) -{ - -} -/* - * Serialize against find_current_mm_pte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. - */ -void serialize_against_pte_lookup(struct mm_struct *mm) -{ - smp_mb(); - smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); -} - -/* - * We use this to invalidate a pmdp entry before switching from a - * hugepte to regular pmd entry. - */ -pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - unsigned long old_pmd; - - old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - */ - serialize_against_pte_lookup(vma->vm_mm); - return __pmd(old_pmd); -} - -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - unsigned long pmdv; - - pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - unsigned long pmdv; - - pmdv = pmd_val(pmd); - pmdv &= _HPAGE_CHG_MASK; - return pmd_set_protbits(__pmd(pmdv), newprot); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. 
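pmd_modify() above changes protection by masking the entry down to the bits that survive a protection change and OR-ing in the new ones. A compact model with illustrative masks (the real code uses _HPAGE_CHG_MASK and PTE_RPN_MASK):

    #include <stdint.h>
    #include <stdio.h>

    #define RPN_MASK 0x0000fffffffff000ULL      /* page-frame bits */
    #define CHG_MASK (RPN_MASK | 0x3ULL)        /* survives a modify */

    static uint64_t pfn_pmd(uint64_t pfn, uint64_t prot)
    {
        return ((pfn << 12) & RPN_MASK) | prot;
    }

    static uint64_t pmd_modify(uint64_t pmd, uint64_t newprot)
    {
        return (pmd & CHG_MASK) | newprot;
    }

    int main(void)
    {
        uint64_t pmd = pfn_pmd(0x2a, 0x104);    /* old protection bits */

        /* old prot outside CHG_MASK is dropped, PFN kept: 0x2a004 */
        printf("0x%llx\n", (unsigned long long)pmd_modify(pmd, 0x4));
        return 0;
    }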
- */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - if (radix_enabled()) - prefetch((void *)addr); -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/* For use by kexec */ -void mmu_cleanup_all(void) -{ - if (radix_enabled()) - radix__mmu_cleanup_all(); - else if (mmu_hash_ops.hpte_clear_all) - mmu_hash_ops.hpte_clear_all(); -} - -#ifdef CONFIG_MEMORY_HOTPLUG -int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid) -{ - if (radix_enabled()) - return radix__create_section_mapping(start, end, nid); - - return hash__create_section_mapping(start, end, nid); -} - -int __meminit remove_section_mapping(unsigned long start, unsigned long end) -{ - if (radix_enabled()) - return radix__remove_section_mapping(start, end); - - return hash__remove_section_mapping(start, end); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - -void __init mmu_partition_table_init(void) -{ - unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; - unsigned long ptcr; - - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); - /* Initialize the Partition Table with no entries */ - partition_tb = memblock_alloc(patb_size, patb_size); - if (!partition_tb) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, patb_size, patb_size); - - /* - * update partition table control register, - * 64 K size. - */ - ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); - mtspr(SPRN_PTCR, ptcr); - powernv_set_nmmu_ptcr(ptcr); -} - -void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) -{ - unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); - - partition_tb[lpid].patb0 = cpu_to_be64(dw0); - partition_tb[lpid].patb1 = cpu_to_be64(dw1); - - /* - * Global flush of TLBs and partition table caches for this lpid. - * The type of flush (hash or radix) depends on what the previous - * use of this partition ID was, not the new use. - */ - asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); - } else { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); - } - /* do we need fixup here ?*/ - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); -} -EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); - -static pmd_t *get_pmd_from_cache(struct mm_struct *mm) -{ - void *pmd_frag, *ret; - - if (PMD_FRAG_NR == 1) - return NULL; - - spin_lock(&mm->page_table_lock); - ret = mm->context.pmd_frag; - if (ret) { - pmd_frag = ret + PMD_FRAG_SIZE; - /* - * If we have taken up all the fragments mark PTE page NULL - */ - if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0) - pmd_frag = NULL; - mm->context.pmd_frag = pmd_frag; - } - spin_unlock(&mm->page_table_lock); - return (pmd_t *)ret; -} - -static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) -{ - void *ret = NULL; - struct page *page; - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; - - if (mm == &init_mm) - gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) - return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); - return NULL; - } - - atomic_set(&page->pt_frag_refcount, 1); - - ret = page_address(page); - /* - * if we support only one fragment just return the - * allocated page. 
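The pmd-fragment cache being filled above carves one page into PMD_FRAG_NR fragments and hands them out sequentially; when the cached pointer wraps to a page boundary the page is exhausted. A standalone model of the carving logic (sizes are illustrative):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 65536u    /* illustrative */
    #define FRAG_SIZE 4096u
    #define FRAG_NR   (PAGE_SIZE / FRAG_SIZE)

    static char *frag_cache;    /* models mm->context.pmd_frag */

    static void *get_frag(void)
    {
        char *ret = frag_cache;

        if (!ret)
            return NULL;
        frag_cache = ret + FRAG_SIZE;
        /* wrapped to a page boundary: every fragment is handed out */
        if (((uintptr_t)frag_cache & (PAGE_SIZE - 1)) == 0)
            frag_cache = NULL;
        return ret;
    }

    int main(void)
    {
        char *page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
        int n = 1;  /* the allocating caller keeps the first fragment */

        frag_cache = page + FRAG_SIZE;
        while (get_frag())
            n++;
        printf("%d fragments from one page (FRAG_NR = %u)\n", n, FRAG_NR);
        free(page);
        return 0;
    }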
- */ - if (PMD_FRAG_NR == 1) - return ret; - - spin_lock(&mm->page_table_lock); - /* - * If we find pgtable_page set, we return - * the allocated page with single fragement - * count. - */ - if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); - mm->context.pmd_frag = ret + PMD_FRAG_SIZE; - } - spin_unlock(&mm->page_table_lock); - - return (pmd_t *)ret; -} - -pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) -{ - pmd_t *pmd; - - pmd = get_pmd_from_cache(mm); - if (pmd) - return pmd; - - return __alloc_for_pmdcache(mm); -} - -void pmd_fragment_free(unsigned long *pmd) -{ - struct page *page = virt_to_page(pmd); - - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); - } -} - -static inline void pgtable_free(void *table, int index) -{ - switch (index) { - case PTE_INDEX: - pte_fragment_free(table, 0); - break; - case PMD_INDEX: - pmd_fragment_free(table); - break; - case PUD_INDEX: - kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); - break; -#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) - /* 16M hugepd directory at pud level */ - case HTLB_16M_INDEX: - BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); - break; - /* 16G hugepd directory at the pgd level */ - case HTLB_16G_INDEX: - BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); - kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); - break; -#endif - /* We don't free pgd table via RCU callback */ - default: - BUG(); - } -} - -#ifdef CONFIG_SMP -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) -{ - unsigned long pgf = (unsigned long)table; - - BUG_ON(index > MAX_PGTABLE_INDEX_SIZE); - pgf |= index; - tlb_remove_table(tlb, (void *)pgf); -} - -void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - return pgtable_free(table, index); -} -#else -void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index) -{ - return pgtable_free(table, index); -} -#endif - -#ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; - -void arch_report_meminfo(struct seq_file *m) -{ - /* - * Hash maps the memory with one size mmu_linear_psize. - * So don't bother to print these on hash - */ - if (!radix_enabled()) - return; - seq_printf(m, "DirectMap4k: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); - seq_printf(m, "DirectMap64k: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); - seq_printf(m, "DirectMap2M: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); - seq_printf(m, "DirectMap1G: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); -} -#endif /* CONFIG_PROC_FS */ - -pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) -{ - unsigned long pte_val; - - /* - * Clear the _PAGE_PRESENT so that no hardware parallel update is - * possible. Also keep the pte_present true so that we don't take - * wrong fault. 
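The comment being started here describes a trick worth spelling out: _PAGE_PRESENT is cleared so hardware stops updating the entry, but a software _PAGE_INVALID bit keeps pte_present() true so the transient state is not mistaken for a hole. A toy model with made-up bit positions:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_PRESENT (1ULL << 63)   /* illustrative bit positions */
    #define PAGE_INVALID (1ULL << 62)

    /* book3s64 pte_present() accepts either bit, so an entry parked
     * in the PRESENT->INVALID transient state still reads as present. */
    static int pte_present(uint64_t pte)
    {
        return !!(pte & (PAGE_PRESENT | PAGE_INVALID));
    }

    int main(void)
    {
        uint64_t pte = PAGE_PRESENT | 0x2a000;

        pte = (pte & ~PAGE_PRESENT) | PAGE_INVALID;   /* start update */
        printf("present during update: %d\n", pte_present(pte)); /* 1 */
        pte = 0;                                      /* really gone */
        printf("present after clear:   %d\n", pte_present(pte)); /* 0 */
        return 0;
    }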
- */ - pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); - - return __pte(pte_val); - -} - -void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t old_pte, pte_t pte) -{ - if (radix_enabled()) - return radix__ptep_modify_prot_commit(vma, addr, - ptep, old_pte, pte); - set_pte_at(vma->vm_mm, addr, ptep, pte); -} - -/* - * For hash translation mode, we use the deposited table to store hash slot - * information and they are stored at PTRS_PER_PMD offset from related pmd - * location. Hence a pmd move requires deposit and withdraw. - * - * For radix translation with split pmd ptl, we store the deposited table in the - * pmd page. Hence if we have different pmd page we need to withdraw during pmd - * move. - * - * With hash we use deposited table always irrespective of anon or not. - * With radix we use deposited table only for anonymous mapping. - */ -int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, - struct spinlock *old_pmd_ptl, - struct vm_area_struct *vma) -{ - if (radix_enabled()) - return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); - - return true; -} diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c deleted file mode 100644 index 1fd025dba4a3..000000000000 --- a/arch/powerpc/mm/pgtable-hash64.c +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright 2005, Paul Mackerras, IBM Corporation. - * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#define CREATE_TRACE_POINTS -#include - -#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) -#warning Limited user VSID range means pagetable space is wasted -#endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP -/* - * vmemmap is the starting address of the virtual address space where - * struct pages are allocated for all possible PFNs present on the system - * including holes and bad memory (hence sparse). These virtual struct - * pages are stored in sequence in this virtual address space irrespective - * of the fact whether the corresponding PFN is valid or not. This achieves - * constant relationship between address of struct page and its PFN. - * - * During boot or memory hotplug operation when a new memory section is - * added, physical memory allocation (including hash table bolting) will - * be performed for the set of struct pages which are part of the memory - * section. This saves memory by not allocating struct pages for PFNs - * which are not valid. 
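The constant page/PFN relationship described above means both conversions are plain pointer arithmetic against a fixed vmemmap base. A sketch (the base address is illustrative and nothing is dereferenced):

    #include <stdint.h>
    #include <stdio.h>

    struct page { uint64_t flags; };

    /* fixed virtual base indexed by PFN */
    static struct page *vmemmap = (struct page *)0xf000000000000000ULL;

    #define pfn_to_page(pfn) (vmemmap + (pfn))
    #define page_to_pfn(p)   ((uint64_t)((p) - vmemmap))

    int main(void)
    {
        struct page *p = pfn_to_page(0x2a);

        printf("round trip pfn = 0x%llx\n",
               (unsigned long long)page_to_pfn(p));
        return 0;
    }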
- * - * ---------------------------------------------- - * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| - * ---------------------------------------------- - * - * f000000000000000 c000000000000000 - * vmemmap +--------------+ +--------------+ - * + | page struct | +--------------> | page struct | - * | +--------------+ +--------------+ - * | | page struct | +--------------> | page struct | - * | +--------------+ | +--------------+ - * | | page struct | + +------> | page struct | - * | +--------------+ | +--------------+ - * | | page struct | | +--> | page struct | - * | +--------------+ | | +--------------+ - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | | | - * | +--------------+ | | - * | | page struct | +-------+ | - * | +--------------+ | - * | | page struct | +-----------+ - * | +--------------+ - * | | page struct | No mapping - * | +--------------+ - * | | page struct | No mapping - * v +--------------+ - * - * ----------------------------------------- - * | RELATION BETWEEN STRUCT PAGES AND PFNS| - * ----------------------------------------- - * - * vmemmap +--------------+ +---------------+ - * + | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | | - * | +--------------+ - * | | | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * | +--------------+ +---------------+ - * | | page struct | +-------------> | PFN | - * v +--------------+ +---------------+ - */ -/* - * On hash-based CPUs, the vmemmap is bolted in the hash table. 
- * - */ -int __meminit hash__vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - int rc; - - if ((start + page_size) >= H_VMEMMAP_END) { - pr_warn("Outside the supported range\n"); - return -1; - } - - rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - if (rc < 0) { - int rc2 = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON(rc2 && (rc2 != -ENOENT)); - } - return rc; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -void hash__vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ - int rc = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON((rc < 0) && (rc != -ENOENT)); - WARN_ON(rc == -ENOENT); -} -#endif -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - -/* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it - */ -int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); - } else { - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. - * - */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), - mmu_io_psize, mmu_kernel_ssize)) { - printk(KERN_ERR "Failed to do bolted mapping IO " - "memory at %016lx !\n", pa); - return -ENOMEM; - } - } - - smp_wmb(); - return 0; -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - __be64 old_be, tmp; - unsigned long old; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(pmd_lockptr(mm, pmdp)); -#endif - - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - and. %1,%0,%6\n\ - bne- 1b \n\ - andc %1,%0,%4 \n\ - or %1,%1,%7\n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), - "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) - : "cc" ); - - old = be64_to_cpu(old_be); - - trace_hugepage_update(addr, old, clr, set); - if (old & H_PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp, old); - return old; -} - -pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_trans_huge(*pmdp)); - VM_BUG_ON(pmd_devmap(*pmdp)); - - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. 
Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - serialize_against_pte_lookup(vma->vm_mm); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - return pmd; -} - -/* - * We want to put the pgtable in pmd and use pgtable for tracking - * the base page size hptes - */ -void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - pgtable_t *pgtable_slot; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - /* - * we store the pgtable in the second half of PMD - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - *pgtable_slot = pgtable; - /* - * expose the deposited pgtable to other cpus. - * before we set the hugepage PTE at pmd level - * hash fault code looks at the deposted pgtable - * to store hash index values. - */ - smp_wmb(); -} - -pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pgtable_t pgtable; - pgtable_t *pgtable_slot; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Once we withdraw, mark the entry NULL. - */ - *pgtable_slot = NULL; - /* - * We store HPTE information in the deposited PTE fragment. - * zero out the content on withdraw. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return pgtable; -} - -/* - * A linux hugepage PMD was changed and the corresponding hash table entries - * neesd to be flushed. - */ -void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long old_pmd) -{ - int ssize; - unsigned int psize; - unsigned long vsid; - unsigned long flags = 0; - - /* get the base page size,vsid and segment size */ -#ifdef CONFIG_DEBUG_VM - psize = get_slice_psize(mm, addr); - BUG_ON(psize == MMU_PAGE_16M); -#endif - if (old_pmd & H_PAGE_COMBO) - psize = MMU_PAGE_4K; - else - psize = MMU_PAGE_64K; - - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_user_vsid(&mm->context, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - - if (mm_is_thread_local(mm)) - flags |= HPTE_LOCAL_UPDATE; - - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); -} - -pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - pgtable_t pgtable; - unsigned long old; - pgtable_t *pgtable_slot; - - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * We have pmd == none and we are holding page_table_lock. - * So we can safely go and clear the pgtable hash - * index info. - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Let's zero out old valid and hash index details - * hash fault look at them. 
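The deposit/withdraw helpers above stash the page table in the second half of the PMD page, PTRS_PER_PMD entries past the PMD pointer. A small model of that slot arithmetic (type aliasing is simplified for the sketch):

    #include <stdio.h>
    #include <stdlib.h>

    #define PTRS_PER_PMD 512

    typedef unsigned long pmd_t;
    typedef void *pgtable_t;

    /* The PMD page is allocated twice the size of the table proper;
     * the deposit slot for pmdp sits PTRS_PER_PMD entries past it,
     * i.e. in the second half of the page. */
    static pgtable_t *deposit_slot(pmd_t *pmdp)
    {
        return (pgtable_t *)pmdp + PTRS_PER_PMD;
    }

    int main(void)
    {
        pmd_t *pmd_page = calloc(2 * PTRS_PER_PMD, sizeof(pmd_t));
        pmd_t *pmdp = &pmd_page[42];
        pgtable_t pg = malloc(64);

        *deposit_slot(pmdp) = pg;                      /* deposit */
        printf("withdraw ok: %d\n", *deposit_slot(pmdp) == pg);
        free(pg);
        free(pmd_page);
        return 0;
    }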
- */ - memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_current_mm_pte variants which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_curren_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); - return old_pmd; -} - -int hash__has_transparent_hugepage(void) -{ - - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) - return 0; - /* - * We support THP only if PMD_SIZE is 16MB. - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) - return 0; - /* - * We need to make sure that we support 16MB hugepage in a segement - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE - * of 64K. - */ - /* - * If we have 64K HPTE, we will be using that by default - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift && - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) - return 0; - /* - * Ok we only have 4K HPTE - */ - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) - return 0; - - return 1; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#ifdef CONFIG_STRICT_KERNEL_RWX -static bool hash__change_memory_range(unsigned long start, unsigned long end, - unsigned long newpp) -{ - unsigned long idx; - unsigned int step, shift; - - shift = mmu_psize_defs[mmu_linear_psize].shift; - step = 1 << shift; - - start = ALIGN_DOWN(start, step); - end = ALIGN(end, step); // aligns up - - if (start >= end) - return false; - - pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", - start, end, newpp, step); - - for (idx = start; idx < end; idx += step) - /* Not sure if we can do much with the return value */ - mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, - mmu_kernel_ssize); - - return true; -} - -void hash__mark_rodata_ro(void) -{ - unsigned long start, end; - - start = (unsigned long)_stext; - end = (unsigned long)__init_begin; - - WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); -} - -void hash__mark_initmem_nx(void) -{ - unsigned long start, end, pp; - - start = (unsigned long)__init_begin; - end = (unsigned long)__init_end; - - pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); - - WARN_ON(!hash__change_memory_range(start, end, pp)); -} -#endif diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c deleted file mode 100644 index fcb0169e2d32..000000000000 --- a/arch/powerpc/mm/pgtable-radix.c +++ /dev/null @@ -1,1124 +0,0 @@ -/* - * Page table handling routines for radix page table. - * - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#define pr_fmt(fmt) "radix-mmu: " fmt - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -unsigned int mmu_pid_bits; -unsigned int mmu_base_pid; - -static int native_register_process_table(unsigned long base, unsigned long pg_sz, - unsigned long table_size) -{ - unsigned long patb0, patb1; - - patb0 = be64_to_cpu(partition_tb[0].patb0); - patb1 = base | table_size | PATB_GR; - - mmu_partition_table_set_entry(0, patb0, patb1); - - return 0; -} - -static __ref void *early_alloc_pgtable(unsigned long size, int nid, - unsigned long region_start, unsigned long region_end) -{ - phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT; - phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE; - void *ptr; - - if (region_start) - min_addr = region_start; - if (region_end) - max_addr = region_end; - - ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n", - __func__, size, size, nid, &min_addr, &max_addr); - - return ptr; -} - -static int early_map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size, - int nid, - unsigned long region_start, unsigned long region_end) -{ - unsigned long pfn = pa >> PAGE_SHIFT; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, - region_start, region_end); - pgd_populate(&init_mm, pgdp, pudp); - } - pudp = pud_offset(pgdp, ea); - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, - region_start, region_end); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE, nid, - region_start, region_end); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - -set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); - return 0; -} - -/* - * nid, region_start, and region_end are hints to try to place the page - * table memory in the same node or region. - */ -static int __map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size, - int nid, - unsigned long region_start, unsigned long region_end) -{ - unsigned long pfn = pa >> PAGE_SHIFT; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - /* - * Make sure task size is correct as per the max adddr - */ - BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); - -#ifdef CONFIG_PPC_64K_PAGES - BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); -#endif - - if (unlikely(!slab_is_available())) - return early_map_kernel_page(ea, pa, flags, map_page_size, - nid, region_start, region_end); - - /* - * Should make page table allocation functions be able to take a - * node, so we can place kernel page tables on the right nodes after - * boot. 
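__map_kernel_page(), whose comment this is, walks pgd -> pud -> pmd -> pte, allocating missing levels and stopping early at the level whose size matches the mapping. A two-level toy walk showing the same shape (the pointer/integer casts assume a 64-bit platform):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define SLOTS     16
    #define PTE_SHIFT 12                      /* 4K pages */
    #define PMD_SHIFT 16                      /* 64K "large" pages */

    typedef uint64_t entry_t;

    /* allocate the next level on demand, as pud/pmd/pte_alloc do */
    static entry_t *descend(entry_t *table, unsigned int idx)
    {
        if (!table[idx])
            table[idx] = (entry_t)calloc(SLOTS, sizeof(entry_t));
        return (entry_t *)table[idx];
    }

    static void map_page(entry_t *pmd_level, uint64_t ea, uint64_t pa,
                         unsigned int map_shift)
    {
        unsigned int pmd_idx = (ea >> PMD_SHIFT) % SLOTS;

        if (map_shift == PMD_SHIFT) {         /* large page: stop early */
            pmd_level[pmd_idx] = pa | 1;
            return;
        }
        descend(pmd_level, pmd_idx)[(ea >> PTE_SHIFT) % SLOTS] = pa | 1;
    }

    int main(void)
    {
        entry_t pmd_level[SLOTS] = {0};

        map_page(pmd_level, 0x10000, 0x200000, PMD_SHIFT); /* 64K map */
        map_page(pmd_level, 0x24000, 0x345000, PTE_SHIFT); /* 4K map */
        printf("large entry: 0x%llx\n",
               (unsigned long long)pmd_level[1]);
        return 0;
    }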
- */ - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - -set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); - return 0; -} - -int radix__map_kernel_page(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size) -{ - return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); -} - -#ifdef CONFIG_STRICT_KERNEL_RWX -void radix__change_memory_range(unsigned long start, unsigned long end, - unsigned long clear) -{ - unsigned long idx; - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - start = ALIGN_DOWN(start, PAGE_SIZE); - end = PAGE_ALIGN(end); // aligns up - - pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", - start, end, clear); - - for (idx = start; idx < end; idx += PAGE_SIZE) { - pgdp = pgd_offset_k(idx); - pudp = pud_alloc(&init_mm, pgdp, idx); - if (!pudp) - continue; - if (pud_huge(*pudp)) { - ptep = (pte_t *)pudp; - goto update_the_pte; - } - pmdp = pmd_alloc(&init_mm, pudp, idx); - if (!pmdp) - continue; - if (pmd_huge(*pmdp)) { - ptep = pmdp_ptep(pmdp); - goto update_the_pte; - } - ptep = pte_alloc_kernel(pmdp, idx); - if (!ptep) - continue; -update_the_pte: - radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); - } - - radix__flush_tlb_kernel_range(start, end); -} - -void radix__mark_rodata_ro(void) -{ - unsigned long start, end; - - start = (unsigned long)_stext; - end = (unsigned long)__init_begin; - - radix__change_memory_range(start, end, _PAGE_WRITE); -} - -void radix__mark_initmem_nx(void) -{ - unsigned long start = (unsigned long)__init_begin; - unsigned long end = (unsigned long)__init_end; - - radix__change_memory_range(start, end, _PAGE_EXEC); -} -#endif /* CONFIG_STRICT_KERNEL_RWX */ - -static inline void __meminit -print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) -{ - char buf[10]; - - if (end <= start) - return; - - string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); - - pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, - exec ? 
" (exec)" : ""); -} - -static unsigned long next_boundary(unsigned long addr, unsigned long end) -{ -#ifdef CONFIG_STRICT_KERNEL_RWX - if (addr < __pa_symbol(__init_begin)) - return __pa_symbol(__init_begin); -#endif - return end; -} - -static int __meminit create_physical_mapping(unsigned long start, - unsigned long end, - int nid) -{ - unsigned long vaddr, addr, mapping_size = 0; - bool prev_exec, exec = false; - pgprot_t prot; - int psize; - - start = _ALIGN_UP(start, PAGE_SIZE); - for (addr = start; addr < end; addr += mapping_size) { - unsigned long gap, previous_size; - int rc; - - gap = next_boundary(addr, end) - addr; - previous_size = mapping_size; - prev_exec = exec; - - if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && - mmu_psize_defs[MMU_PAGE_1G].shift) { - mapping_size = PUD_SIZE; - psize = MMU_PAGE_1G; - } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && - mmu_psize_defs[MMU_PAGE_2M].shift) { - mapping_size = PMD_SIZE; - psize = MMU_PAGE_2M; - } else { - mapping_size = PAGE_SIZE; - psize = mmu_virtual_psize; - } - - vaddr = (unsigned long)__va(addr); - - if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || - overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { - prot = PAGE_KERNEL_X; - exec = true; - } else { - prot = PAGE_KERNEL; - exec = false; - } - - if (mapping_size != previous_size || exec != prev_exec) { - print_mapping(start, addr, previous_size, prev_exec); - start = addr; - } - - rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); - if (rc) - return rc; - - update_page_count(psize, 1); - } - - print_mapping(start, addr, mapping_size, exec); - return 0; -} - -void __init radix_init_pgtable(void) -{ - unsigned long rts_field; - struct memblock_region *reg; - - /* We don't support slb for radix */ - mmu_slb_size = 0; - /* - * Create the linear mapping, using standard page size for now - */ - for_each_memblock(memory, reg) { - /* - * The memblock allocator is up at this point, so the - * page tables will be allocated within the range. No - * need or a node (which we don't have yet). - */ - - if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { - pr_warn("Outside the supported range\n"); - continue; - } - - WARN_ON(create_physical_mapping(reg->base, - reg->base + reg->size, - -1)); - } - - /* Find out how many PID bits are supported */ - if (cpu_has_feature(CPU_FTR_HVMODE)) { - if (!mmu_pid_bits) - mmu_pid_bits = 20; -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* - * When KVM is possible, we only use the top half of the - * PID space to avoid collisions between host and guest PIDs - * which can cause problems due to prefetch when exiting the - * guest with AIL=3 - */ - mmu_base_pid = 1 << (mmu_pid_bits - 1); -#else - mmu_base_pid = 1; -#endif - } else { - /* The guest uses the bottom half of the PID space */ - if (!mmu_pid_bits) - mmu_pid_bits = 19; - mmu_base_pid = 1; - } - - /* - * Allocate Partition table and process table for the - * host. - */ - BUG_ON(PRTB_SIZE_SHIFT > 36); - process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); - /* - * Fill in the process table. - */ - rts_field = radix__get_tree_size(); - process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); - /* - * Fill in the partition table. We are suppose to use effective address - * of process table here. But our linear mapping also enable us to use - * physical address here. 
- */ - register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); - pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); - - /* - * The init_mm context is given the first available (non-zero) PID, - * which is the "guard PID" and contains no page table. PIDR should - * never be set to zero because that duplicates the kernel address - * space at the 0x0... offset (quadrant 0)! - * - * An arbitrary PID that may later be allocated by the PID allocator - * for userspace processes must not be used either, because that - * would cause stale user mappings for that PID on CPUs outside of - * the TLB invalidation scheme (because it won't be in mm_cpumask). - * - * So permanently carve out one PID for the purpose of a guard PID. - */ - init_mm.context.id = mmu_base_pid; - mmu_base_pid++; -} - -static void __init radix_init_partition_table(void) -{ - unsigned long rts_field, dw0; - - mmu_partition_table_init(); - rts_field = radix__get_tree_size(); - dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; - mmu_partition_table_set_entry(0, dw0, 0); - - pr_info("Initializing Radix MMU\n"); - pr_info("Partition table %p\n", partition_tb); -} - -void __init radix_init_native(void) -{ - register_process_table = native_register_process_table; -} - -static int __init get_idx_from_shift(unsigned int shift) -{ - int idx = -1; - - switch (shift) { - case 0xc: - idx = MMU_PAGE_4K; - break; - case 0x10: - idx = MMU_PAGE_64K; - break; - case 0x15: - idx = MMU_PAGE_2M; - break; - case 0x1e: - idx = MMU_PAGE_1G; - break; - } - return idx; -} - -static int __init radix_dt_scan_page_sizes(unsigned long node, - const char *uname, int depth, - void *data) -{ - int size = 0; - int shift, idx; - unsigned int ap; - const __be32 *prop; - const char *type = of_get_flat_dt_prop(node, "device_type", NULL); - - /* We are scanning "cpu" nodes only */ - if (type == NULL || strcmp(type, "cpu") != 0) - return 0; - - /* Find MMU PID size */ - prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); - if (prop && size == 4) - mmu_pid_bits = be32_to_cpup(prop); - - /* Grab page size encodings */ - prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); - if (!prop) - return 0; - - pr_info("Page sizes from device-tree:\n"); - for (; size >= 4; size -= 4, ++prop) { - - struct mmu_psize_def *def; - - /* top 3 bit is AP encoding */ - shift = be32_to_cpu(prop[0]) & ~(0xe << 28); - ap = be32_to_cpu(prop[0]) >> 29; - pr_info("Page size shift = %d AP=0x%x\n", shift, ap); - - idx = get_idx_from_shift(shift); - if (idx < 0) - continue; - - def = &mmu_psize_defs[idx]; - def->shift = shift; - def->ap = ap; - } - - /* needed ? 
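The device-tree decode above packs the AP encoding into the top three bits of each 32-bit cell and the page shift into the rest. A standalone check of that bit manipulation:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t cell = (0x5u << 29) | 16;     /* AP=0x5, 64K pages */
        uint32_t shift = cell & ~(0xeu << 28); /* clear the top 3 bits */
        uint32_t ap = cell >> 29;

        printf("shift=%u ap=0x%x\n", shift, ap);   /* shift=16 ap=0x5 */
        return 0;
    }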
*/ - cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; - return 1; -} - -void __init radix__early_init_devtree(void) -{ - int rc; - - /* - * Try to find the available page sizes in the device-tree - */ - rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); - if (rc != 0) /* Found */ - goto found; - /* - * let's assume we have page 4k and 64k support - */ - mmu_psize_defs[MMU_PAGE_4K].shift = 12; - mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; - - mmu_psize_defs[MMU_PAGE_64K].shift = 16; - mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; -found: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - return; -} - -static void radix_init_amor(void) -{ - /* - * In HV mode, we init AMOR (Authority Mask Override Register) so that - * the hypervisor and guest can setup IAMR (Instruction Authority Mask - * Register), enable key 0 and set it to 1. - * - * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) - */ - mtspr(SPRN_AMOR, (3ul << 62)); -} - -#ifdef CONFIG_PPC_KUEP -void setup_kuep(bool disabled) -{ - if (disabled || !early_radix_enabled()) - return; - - if (smp_processor_id() == boot_cpuid) - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - /* - * Radix always uses key0 of the IAMR to determine if an access is - * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction - * fetch. - */ - mtspr(SPRN_IAMR, (1ul << 62)); -} -#endif - -#ifdef CONFIG_PPC_KUAP -void setup_kuap(bool disabled) -{ - if (disabled || !early_radix_enabled()) - return; - - if (smp_processor_id() == boot_cpuid) { - pr_info("Activating Kernel Userspace Access Prevention\n"); - cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; - } - - /* Make sure userspace can't change the AMR */ - mtspr(SPRN_UAMOR, 0); - mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); - isync(); -} -#endif - -void __init radix__early_init_mmu(void) -{ - unsigned long lpcr; - -#ifdef CONFIG_PPC_64K_PAGES - /* PAGE_SIZE mappings */ - mmu_virtual_psize = MMU_PAGE_64K; -#else - mmu_virtual_psize = MMU_PAGE_4K; -#endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - mmu_vmemmap_psize = mmu_virtual_psize; -#endif - /* - * initialize page table size - */ - __pte_index_size = RADIX_PTE_INDEX_SIZE; - __pmd_index_size = RADIX_PMD_INDEX_SIZE; - __pud_index_size = RADIX_PUD_INDEX_SIZE; - __pgd_index_size = RADIX_PGD_INDEX_SIZE; - __pud_cache_index = RADIX_PUD_INDEX_SIZE; - __pte_table_size = RADIX_PTE_TABLE_SIZE; - __pmd_table_size = RADIX_PMD_TABLE_SIZE; - __pud_table_size = RADIX_PUD_TABLE_SIZE; - __pgd_table_size = RADIX_PGD_TABLE_SIZE; - - __pmd_val_bits = RADIX_PMD_VAL_BITS; - __pud_val_bits = RADIX_PUD_VAL_BITS; - __pgd_val_bits = RADIX_PGD_VAL_BITS; - - __kernel_virt_start = RADIX_KERN_VIRT_START; - __vmalloc_start = RADIX_VMALLOC_START; - __vmalloc_end = RADIX_VMALLOC_END; - __kernel_io_start = RADIX_KERN_IO_START; - __kernel_io_end = RADIX_KERN_IO_END; - vmemmap = (struct page *)RADIX_VMEMMAP_START; - ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PCI - pci_io_base = ISA_IO_BASE; -#endif - __pte_frag_nr = RADIX_PTE_FRAG_NR; - __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT; - __pmd_frag_nr = RADIX_PMD_FRAG_NR; - __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; - - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - radix_init_native(); - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - radix_init_partition_table(); - radix_init_amor(); - } else { - radix_init_pseries(); - } - - 
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - - radix_init_pgtable(); - /* Switch to the guard PID before turning on MMU */ - radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -void radix__early_init_mmu_secondary(void) -{ - unsigned long lpcr; - /* - * update partition table control register and UPRT - */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - radix_init_amor(); - } - - radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); -} - -void radix__mmu_cleanup_all(void) -{ - unsigned long lpcr; - - if (!firmware_has_feature(FW_FEATURE_LPAR)) { - lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); - mtspr(SPRN_PTCR, 0); - powernv_set_nmmu_ptcr(0); - radix__flush_tlb_all(); - } -} - -void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * Radix mode is not limited by RMA / VRMA addressing. - */ - ppc64_rma_size = ULONG_MAX; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_pte_table(pte_t *pte_start, pmd_t *pmd) -{ - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE; i++) { - pte = pte_start + i; - if (!pte_none(*pte)) - return; - } - - pte_free_kernel(&init_mm, pte_start); - pmd_clear(pmd); -} - -static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) -{ - pmd_t *pmd; - int i; - - for (i = 0; i < PTRS_PER_PMD; i++) { - pmd = pmd_start + i; - if (!pmd_none(*pmd)) - return; - } - - pmd_free(&init_mm, pmd_start); - pud_clear(pud); -} - -struct change_mapping_params { - pte_t *pte; - unsigned long start; - unsigned long end; - unsigned long aligned_start; - unsigned long aligned_end; -}; - -static int __meminit stop_machine_change_mapping(void *data) -{ - struct change_mapping_params *params = - (struct change_mapping_params *)data; - - if (!data) - return -1; - - spin_unlock(&init_mm.page_table_lock); - pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(params->aligned_start, params->start, -1); - create_physical_mapping(params->end, params->aligned_end, -1); - spin_lock(&init_mm.page_table_lock); - return 0; -} - -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end) -{ - unsigned long next; - pte_t *pte; - - pte = pte_start + pte_index(addr); - for (; addr < end; addr = next, pte++) { - next = (addr + PAGE_SIZE) & PAGE_MASK; - if (next > end) - next = end; - - if (!pte_present(*pte)) - continue; - - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. 
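free_pte_table()/free_pmd_table() above release a lower-level table only after scanning it and finding every slot empty, then clear the parent entry. A minimal model of that scan-then-free pattern:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PTRS 512

    /* free the table (and clear the parent slot, like pmd_clear() or
     * pud_clear()) only if every entry in it is empty */
    static bool maybe_free_table(unsigned long **parent_slot)
    {
        unsigned long *table = *parent_slot;

        for (int i = 0; i < PTRS; i++)
            if (table[i])
                return false;       /* still populated, keep it */

        free(table);
        *parent_slot = NULL;
        return true;
    }

    int main(void)
    {
        unsigned long *pte_table = calloc(PTRS, sizeof(*pte_table));

        pte_table[3] = 0x2a;
        printf("freed: %d\n", maybe_free_table(&pte_table));  /* 0 */
        pte_table[3] = 0;
        printf("freed: %d\n", maybe_free_table(&pte_table));  /* 1 */
        return 0;
    }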
- */ - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; - } - - pte_clear(&init_mm, addr, pte); - } -} - -/* - * clear the pte and potentially split the mapping helper - */ -static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end, - unsigned long size, pte_t *pte) -{ - unsigned long mask = ~(size - 1); - unsigned long aligned_start = addr & mask; - unsigned long aligned_end = addr + size; - struct change_mapping_params params; - bool split_region = false; - - if ((end - addr) < size) { - /* - * We're going to clear the PTE, but not flushed - * the mapping, time to remap and flush. The - * effects if visible outside the processor or - * if we are running in code close to the - * mapping we cleared, we are in trouble. - */ - if (overlaps_kernel_text(aligned_start, addr) || - overlaps_kernel_text(end, aligned_end)) { - /* - * Hack, just return, don't pte_clear - */ - WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel " - "text, not splitting\n", addr, end); - return; - } - split_region = true; - } - - if (split_region) { - params.pte = pte; - params.start = addr; - params.end = end; - params.aligned_start = addr & ~(size - 1); - params.aligned_end = min_t(unsigned long, aligned_end, - (unsigned long)__va(memblock_end_of_DRAM())); - stop_machine(stop_machine_change_mapping, ¶ms, NULL); - return; - } - - pte_clear(&init_mm, addr, pte); -} - -static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, - unsigned long end) -{ - unsigned long next; - pte_t *pte_base; - pmd_t *pmd; - - pmd = pmd_start + pmd_index(addr); - for (; addr < end; addr = next, pmd++) { - next = pmd_addr_end(addr, end); - - if (!pmd_present(*pmd)) - continue; - - if (pmd_huge(*pmd)) { - split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd); - continue; - } - - pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next); - free_pte_table(pte_base, pmd); - } -} - -static void remove_pud_table(pud_t *pud_start, unsigned long addr, - unsigned long end) -{ - unsigned long next; - pmd_t *pmd_base; - pud_t *pud; - - pud = pud_start + pud_index(addr); - for (; addr < end; addr = next, pud++) { - next = pud_addr_end(addr, end); - - if (!pud_present(*pud)) - continue; - - if (pud_huge(*pud)) { - split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud); - continue; - } - - pmd_base = (pmd_t *)pud_page_vaddr(*pud); - remove_pmd_table(pmd_base, addr, next); - free_pmd_table(pmd_base, pud); - } -} - -static void __meminit remove_pagetable(unsigned long start, unsigned long end) -{ - unsigned long addr, next; - pud_t *pud_base; - pgd_t *pgd; - - spin_lock(&init_mm.page_table_lock); - - for (addr = start; addr < end; addr = next) { - next = pgd_addr_end(addr, end); - - pgd = pgd_offset_k(addr); - if (!pgd_present(*pgd)) - continue; - - if (pgd_huge(*pgd)) { - split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd); - continue; - } - - pud_base = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud_base, addr, next); - } - - spin_unlock(&init_mm.page_table_lock); - radix__flush_tlb_kernel_range(start, end); -} - -int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) -{ - if (end >= RADIX_VMALLOC_START) { - pr_warn("Outside the supported range\n"); - return -1; - } - - return create_physical_mapping(start, end, nid); -} - -int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) -{ - remove_pagetable(start, end); - return 0; -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - -#ifdef CONFIG_SPARSEMEM_VMEMMAP 
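
Before the vmemmap handlers, a worked example of the mask arithmetic that split_kernel_mapping() above depends on: for a power-of-two mapping size, ~(size - 1) clears the low bits, so addr & mask yields the base of the huge mapping that contains addr. The values below are illustrative only:

#include <stdio.h>

int main(void)
{
	unsigned long size = 1UL << 21;            /* 2 MB, PMD-sized */
	unsigned long addr = 0xc000000001234567UL; /* arbitrary EA    */
	unsigned long mask = ~(size - 1);

	printf("mapping base = 0x%lx\n", addr & mask);          /* ...01200000 */
	printf("mapping end  = 0x%lx\n", (addr & mask) + size); /* ...01400000 */
	return 0;
}
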
-static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, - pgprot_t flags, unsigned int map_page_size, - int nid) -{ - return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); -} - -int __meminit radix__vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - /* Create a PTE encoding */ - unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; - int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); - int ret; - - if ((start + page_size) >= RADIX_VMEMMAP_END) { - pr_warn("Outside the supported range\n"); - return -1; - } - - ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); - BUG_ON(ret); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) -{ - remove_pagetable(start, start + page_size); -} -#endif -#endif - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - unsigned long old; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); - assert_spin_locked(pmd_lockptr(mm, pmdp)); -#endif - - old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); - trace_hugepage_update(addr, old, clr, set); - - return old; -} - -pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) - -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); - VM_BUG_ON(pmd_devmap(*pmdp)); - /* - * khugepaged calls this for normal pmd - */ - pmd = *pmdp; - pmd_clear(pmdp); - - /*FIXME!! Verify whether we need this kick below */ - serialize_against_pte_lookup(vma->vm_mm); - - radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); - - return pmd; -} - -/* - * For us pgtable_t is pte_t *. Inorder to save the deposisted - * page table, we consider the allocated page table as a list - * head. On withdraw we need to make sure we zero out the used - * list_head memory area. - */ -void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - struct list_head *lh = (struct list_head *) pgtable; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - /* FIFO */ - if (!pmd_huge_pte(mm, pmdp)) - INIT_LIST_HEAD(lh); - else - list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); - pmd_huge_pte(mm, pmdp) = pgtable; -} - -pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pte_t *ptep; - pgtable_t pgtable; - struct list_head *lh; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - /* FIFO */ - pgtable = pmd_huge_pte(mm, pmdp); - lh = (struct list_head *) pgtable; - if (list_empty(lh)) - pmd_huge_pte(mm, pmdp) = NULL; - else { - pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; - list_del(lh); - } - ptep = (pte_t *) pgtable; - *ptep = __pte(0); - ptep++; - *ptep = __pte(0); - return pgtable; -} - - -pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - unsigned long old; - - old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * Serialize against find_current_mm_pte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. 
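
To make the deposit/withdraw trick above concrete: the deposited page-table page itself is reused as a struct list_head, so chaining deposits costs no extra allocation. A compact userspace sketch of the same idea, with the kernel's list_add() semantics reproduced locally:

#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

int main(void)
{
	/* Each "page table" is a bare 4 KB buffer whose first bytes double
	 * as the list_head, exactly as in the deposit code above. */
	struct list_head *first = calloc(1, 4096);
	struct list_head *second = calloc(1, 4096);

	INIT_LIST_HEAD(first);   /* first deposit starts the FIFO */
	list_add(second, first); /* later deposits chain behind the head */

	printf("head %p -> next %p\n", (void *)first, (void *)first->next);
	free(first);
	free(second);
	return 0;
}
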
Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); - return old_pmd; -} - -int radix__has_transparent_hugepage(void) -{ - /* For radix 2M at PMD level means thp */ - if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) - return 1; - return 0; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, - pte_t entry, unsigned long address, int psize) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | - _PAGE_RW | _PAGE_EXEC); - - unsigned long change = pte_val(entry) ^ pte_val(*ptep); - /* - * To avoid NMMU hang while relaxing access, we need mark - * the pte invalid in between. - */ - if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) { - unsigned long old_pte, new_pte; - - old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID); - /* - * new value of pte - */ - new_pte = old_pte | set; - radix__flush_tlb_page_psize(mm, address, psize); - __radix_pte_update(ptep, _PAGE_INVALID, new_pte); - } else { - __radix_pte_update(ptep, 0, set); - /* - * Book3S does not require a TLB flush when relaxing access - * restrictions when the address space is not attached to a - * NMMU, because the core MMU will reload the pte after taking - * an access fault, which is defined by the architectue. - */ - } - /* See ptesync comment in radix__set_pte_at */ -} - -void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t old_pte, pte_t pte) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * To avoid NMMU hang while relaxing access we need to flush the tlb before - * we set the new value. We need to do this only for radix, because hash - * translation does flush when updating the linux pte. - */ - if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && - (atomic_read(&mm->context.copros) > 0)) - radix__flush_tlb_page(vma, addr); - - set_pte_at(mm, addr, ptep, pte); -} diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c deleted file mode 100644 index ae7fca40e5b3..000000000000 --- a/arch/powerpc/mm/pkeys.c +++ /dev/null @@ -1,428 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * PowerPC Memory Protection Keys management - * - * Copyright 2017, Ram Pai, IBM Corporation. 
- */ - -#include -#include -#include -#include -#include -#include - -DEFINE_STATIC_KEY_TRUE(pkey_disabled); -int pkeys_total; /* Total pkeys as per device tree */ -u32 initial_allocation_mask; /* Bits set for the initially allocated keys */ -u32 reserved_allocation_mask; /* Bits set for reserved keys */ -static bool pkey_execute_disable_supported; -static bool pkeys_devtree_defined; /* property exported by device tree */ -static u64 pkey_amr_mask; /* Bits in AMR not to be touched */ -static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */ -static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */ -static int execute_only_key = 2; - -#define AMR_BITS_PER_PKEY 2 -#define AMR_RD_BIT 0x1UL -#define AMR_WR_BIT 0x2UL -#define IAMR_EX_BIT 0x1UL -#define PKEY_REG_BITS (sizeof(u64)*8) -#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY)) - -static void scan_pkey_feature(void) -{ - u32 vals[2]; - struct device_node *cpu; - - cpu = of_find_node_by_type(NULL, "cpu"); - if (!cpu) - return; - - if (of_property_read_u32_array(cpu, - "ibm,processor-storage-keys", vals, 2)) - return; - - /* - * Since any pkey can be used for data or execute, we will just treat - * all keys as equal and track them as one entity. - */ - pkeys_total = vals[0]; - pkeys_devtree_defined = true; -} - -static inline bool pkey_mmu_enabled(void) -{ - if (firmware_has_feature(FW_FEATURE_LPAR)) - return pkeys_total; - else - return cpu_has_feature(CPU_FTR_PKEY); -} - -static int pkey_initialize(void) -{ - int os_reserved, i; - - /* - * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral - * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE. - * Ensure that the bits a distinct. - */ - BUILD_BUG_ON(PKEY_DISABLE_EXECUTE & - (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - /* - * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous - * in the vmaflag. Make sure that is really the case. - */ - BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + - __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) - != (sizeof(u64) * BITS_PER_BYTE)); - - /* scan the device tree for pkey feature */ - scan_pkey_feature(); - - /* - * Let's assume 32 pkeys on P8 bare metal, if its not defined by device - * tree. We make this exception since skiboot forgot to expose this - * property on power8. - */ - if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) && - cpu_has_feature(CPU_FTRS_POWER8)) - pkeys_total = 32; - - /* - * Adjust the upper limit, based on the number of bits supported by - * arch-neutral code. - */ - pkeys_total = min_t(int, pkeys_total, - ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1)); - - if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total) - static_branch_enable(&pkey_disabled); - else - static_branch_disable(&pkey_disabled); - - if (static_branch_likely(&pkey_disabled)) - return 0; - - /* - * The device tree cannot be relied to indicate support for - * execute_disable support. Instead we use a PVR check. - */ - if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p)) - pkey_execute_disable_supported = false; - else - pkey_execute_disable_supported = true; - -#ifdef CONFIG_PPC_4K_PAGES - /* - * The OS can manage only 8 pkeys due to its inability to represent them - * in the Linux 4K PTE. - */ - os_reserved = pkeys_total - 8; -#else - os_reserved = 0; -#endif - /* Bits are in LE format. 
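
A worked expansion of the register layout defined above: with AMR_BITS_PER_PKEY = 2 and keys counted down from the most significant end of the 64-bit register, key N occupies the two bits starting at pkeyshift(N). Standalone rendering of the same macro:

#include <stdio.h>

#define AMR_BITS_PER_PKEY 2
#define PKEY_REG_BITS (sizeof(unsigned long long) * 8)
#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey) + 1) * AMR_BITS_PER_PKEY)

int main(void)
{
	/* key 0 -> shift 62, key 1 -> 60, key 2 (execute-only) -> 58 ... */
	for (int key = 0; key < 4; key++)
		printf("key %d -> AMR bits [%zu:%zu]\n",
		       key, pkeyshift(key) + 1, pkeyshift(key));
	return 0;
}
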
*/ - reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key); - - /* register mask is in BE format */ - pkey_amr_mask = ~0x0ul; - pkey_amr_mask &= ~(0x3ul << pkeyshift(0)); - - pkey_iamr_mask = ~0x0ul; - pkey_iamr_mask &= ~(0x3ul << pkeyshift(0)); - pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - - pkey_uamor_mask = ~0x0ul; - pkey_uamor_mask &= ~(0x3ul << pkeyshift(0)); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key)); - - /* mark the rest of the keys as reserved and hence unavailable */ - for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) { - reserved_allocation_mask |= (0x1 << i); - pkey_uamor_mask &= ~(0x3ul << pkeyshift(i)); - } - initial_allocation_mask = reserved_allocation_mask | (0x1 << 0); - - if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) { - /* - * Insufficient number of keys to support - * execute only key. Mark it unavailable. - * Any AMR, UAMOR, IAMR bit set for - * this key is irrelevant since this key - * can never be allocated. - */ - execute_only_key = -1; - } - - return 0; -} - -arch_initcall(pkey_initialize); - -void pkey_mm_init(struct mm_struct *mm) -{ - if (static_branch_likely(&pkey_disabled)) - return; - mm_pkey_allocation_map(mm) = initial_allocation_mask; - mm->context.execute_only_pkey = execute_only_key; -} - -static inline u64 read_amr(void) -{ - return mfspr(SPRN_AMR); -} - -static inline void write_amr(u64 value) -{ - mtspr(SPRN_AMR, value); -} - -static inline u64 read_iamr(void) -{ - if (!likely(pkey_execute_disable_supported)) - return 0x0UL; - - return mfspr(SPRN_IAMR); -} - -static inline void write_iamr(u64 value) -{ - if (!likely(pkey_execute_disable_supported)) - return; - - mtspr(SPRN_IAMR, value); -} - -static inline u64 read_uamor(void) -{ - return mfspr(SPRN_UAMOR); -} - -static inline void write_uamor(u64 value) -{ - mtspr(SPRN_UAMOR, value); -} - -static bool is_pkey_enabled(int pkey) -{ - u64 uamor = read_uamor(); - u64 pkey_bits = 0x3ul << pkeyshift(pkey); - u64 uamor_pkey_bits = (uamor & pkey_bits); - - /* - * Both the bits in UAMOR corresponding to the key should be set or - * reset. - */ - WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits)); - return !!(uamor_pkey_bits); -} - -static inline void init_amr(int pkey, u8 init_bits) -{ - u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); - u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); - - write_amr(old_amr | new_amr_bits); -} - -static inline void init_iamr(int pkey, u8 init_bits) -{ - u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); - u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); - - write_iamr(old_iamr | new_iamr_bits); -} - -/* - * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that - * specified in @init_val. 
- */ -int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) -{ - u64 new_amr_bits = 0x0ul; - u64 new_iamr_bits = 0x0ul; - - if (!is_pkey_enabled(pkey)) - return -EINVAL; - - if (init_val & PKEY_DISABLE_EXECUTE) { - if (!pkey_execute_disable_supported) - return -EINVAL; - new_iamr_bits |= IAMR_EX_BIT; - } - init_iamr(pkey, new_iamr_bits); - - /* Set the bits we need in AMR: */ - if (init_val & PKEY_DISABLE_ACCESS) - new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT; - else if (init_val & PKEY_DISABLE_WRITE) - new_amr_bits |= AMR_WR_BIT; - - init_amr(pkey, new_amr_bits); - return 0; -} - -void thread_pkey_regs_save(struct thread_struct *thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - /* - * TODO: Skip saving registers if @thread hasn't used any keys yet. - */ - thread->amr = read_amr(); - thread->iamr = read_iamr(); - thread->uamor = read_uamor(); -} - -void thread_pkey_regs_restore(struct thread_struct *new_thread, - struct thread_struct *old_thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - if (old_thread->amr != new_thread->amr) - write_amr(new_thread->amr); - if (old_thread->iamr != new_thread->iamr) - write_iamr(new_thread->iamr); - if (old_thread->uamor != new_thread->uamor) - write_uamor(new_thread->uamor); -} - -void thread_pkey_regs_init(struct thread_struct *thread) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - thread->amr = pkey_amr_mask; - thread->iamr = pkey_iamr_mask; - thread->uamor = pkey_uamor_mask; - - write_uamor(pkey_uamor_mask); - write_amr(pkey_amr_mask); - write_iamr(pkey_iamr_mask); -} - -static inline bool pkey_allows_readwrite(int pkey) -{ - int pkey_shift = pkeyshift(pkey); - - if (!is_pkey_enabled(pkey)) - return true; - - return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift)); -} - -int __execute_only_pkey(struct mm_struct *mm) -{ - return mm->context.execute_only_pkey; -} - -static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) -{ - /* Do this check first since the vm_flags should be hot */ - if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) - return false; - - return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey); -} - -/* - * This should only be called for *plain* mprotect calls. - */ -int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, - int pkey) -{ - /* - * If the currently associated pkey is execute-only, but the requested - * protection is not execute-only, move it back to the default pkey. - */ - if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC)) - return 0; - - /* - * The requested protection is execute-only. Hence let's use an - * execute-only pkey. - */ - if (prot == PROT_EXEC) { - pkey = execute_only_pkey(vma->vm_mm); - if (pkey > 0) - return pkey; - } - - /* Nothing to override. 
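
Stepping back to __arch_set_user_pkey_access() above, a minimal sketch of its init_val to AMR/IAMR translation, with the PKEY_DISABLE_* constants spelled out locally for the example (PKEY_DISABLE_EXECUTE being the powerpc-specific addition):

#include <stdio.h>

#define PKEY_DISABLE_ACCESS  0x1
#define PKEY_DISABLE_WRITE   0x2
#define PKEY_DISABLE_EXECUTE 0x4

#define AMR_RD_BIT  0x1UL
#define AMR_WR_BIT  0x2UL
#define IAMR_EX_BIT 0x1UL

int main(void)
{
	unsigned long init_val = PKEY_DISABLE_ACCESS | PKEY_DISABLE_EXECUTE;
	unsigned long amr = 0, iamr = 0;

	if (init_val & PKEY_DISABLE_EXECUTE)
		iamr |= IAMR_EX_BIT;            /* no instruction fetch */
	if (init_val & PKEY_DISABLE_ACCESS)
		amr |= AMR_RD_BIT | AMR_WR_BIT; /* no read, no write    */
	else if (init_val & PKEY_DISABLE_WRITE)
		amr |= AMR_WR_BIT;              /* read-only            */

	printf("amr bits 0x%lx, iamr bits 0x%lx\n", amr, iamr);
	return 0;
}
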
*/ - return vma_pkey(vma); -} - -static bool pkey_access_permitted(int pkey, bool write, bool execute) -{ - int pkey_shift; - u64 amr; - - if (!is_pkey_enabled(pkey)) - return true; - - pkey_shift = pkeyshift(pkey); - if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift))) - return true; - - amr = read_amr(); /* Delay reading amr until absolutely needed */ - return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) || - (write && !(amr & (AMR_WR_BIT << pkey_shift)))); -} - -bool arch_pte_access_permitted(u64 pte, bool write, bool execute) -{ - if (static_branch_likely(&pkey_disabled)) - return true; - - return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute); -} - -/* - * We only want to enforce protection keys on the current thread because we - * effectively have no access to AMR/IAMR for other threads or any way to tell - * which AMR/IAMR in a threaded process we could use. - * - * So do not enforce things if the VMA is not from the current mm, or if we are - * in a kernel thread. - */ -static inline bool vma_is_foreign(struct vm_area_struct *vma) -{ - if (!current->mm) - return true; - - /* if it is not our ->mm, it has to be foreign */ - if (current->mm != vma->vm_mm) - return true; - - return false; -} - -bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, - bool execute, bool foreign) -{ - if (static_branch_likely(&pkey_disabled)) - return true; - /* - * Do not enforce our key-permissions on a foreign vma. - */ - if (foreign || vma_is_foreign(vma)) - return true; - - return pkey_access_permitted(vma_pkey(vma), write, execute); -} - -void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) -{ - if (static_branch_likely(&pkey_disabled)) - return; - - /* Duplicate the oldmm pkey state in mm: */ - mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm); - mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; -} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c deleted file mode 100644 index 89e4531de64b..000000000000 --- a/arch/powerpc/mm/slb.c +++ /dev/null @@ -1,832 +0,0 @@ -/* - * PowerPC64 SLB support. - * - * Copyright (C) 2004 David Gibson , IBM - * Based on earlier code written by: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard , IBM - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -enum slb_index { - LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ - KSTACK_INDEX = 1, /* Kernel stack map */ -}; - -static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); - -#define slb_esid_mask(ssize) \ - (((ssize) == MMU_SEGSIZE_256M)? 
ESID_MASK: ESID_MASK_1T) - -static inline unsigned long mk_esid_data(unsigned long ea, int ssize, - enum slb_index index) -{ - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; -} - -static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, - unsigned long flags) -{ - return (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); -} - -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, - unsigned long flags) -{ - return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); -} - -static void assert_slb_presence(bool present, unsigned long ea) -{ -#ifdef CONFIG_DEBUG_VM - unsigned long tmp; - - WARN_ON_ONCE(mfmsr() & MSR_EE); - - if (!cpu_has_feature(CPU_FTR_ARCH_206)) - return; - - /* - * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware - * ignores all other bits from 0-27, so just clear them all. - */ - ea &= ~((1UL << 28) - 1); - asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); - - WARN_ON(present == (tmp == 0)); -#endif -} - -static inline void slb_shadow_update(unsigned long ea, int ssize, - unsigned long flags, - enum slb_index index) -{ - struct slb_shadow *p = get_slb_shadow(); - - /* - * Clear the ESID first so the entry is not valid while we are - * updating it. No write barriers are needed here, provided - * we only update the current CPU's SLB shadow buffer. - */ - WRITE_ONCE(p->save_area[index].esid, 0); - WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags))); - WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index))); -} - -static inline void slb_shadow_clear(enum slb_index index) -{ - WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index)); -} - -static inline void create_shadowed_slbe(unsigned long ea, int ssize, - unsigned long flags, - enum slb_index index) -{ - /* - * Updating the shadow buffer before writing the SLB ensures - * we don't get a stale entry here if we get preempted by PHYP - * between these two statements. - */ - slb_shadow_update(ea, ssize, flags, index); - - assert_slb_presence(false, ea); - asm volatile("slbmte %0,%1" : - : "r" (mk_vsid_data(ea, ssize, flags)), - "r" (mk_esid_data(ea, ssize, index)) - : "memory" ); -} - -/* - * Insert bolted entries into SLB (which may not be empty, so don't clear - * slb_cache_ptr). - */ -void __slb_restore_bolted_realmode(void) -{ - struct slb_shadow *p = get_slb_shadow(); - enum slb_index index; - - /* No isync needed because realmode. */ - for (index = 0; index < SLB_NUM_BOLTED; index++) { - asm volatile("slbmte %0,%1" : - : "r" (be64_to_cpu(p->save_area[index].vsid)), - "r" (be64_to_cpu(p->save_area[index].esid))); - } - - assert_slb_presence(true, local_paca->kstack); -} - -/* - * Insert the bolted entries into an empty SLB. - */ -void slb_restore_bolted_realmode(void) -{ - __slb_restore_bolted_realmode(); - get_paca()->slb_cache_ptr = 0; - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; -} - -/* - * This flushes all SLB entries including 0, so it must be realmode. - */ -void slb_flush_all_realmode(void) -{ - asm volatile("slbmte %0,%0; slbia" : : "r" (0)); -} - -/* - * This flushes non-bolted entries, it can be run in virtual mode. Must - * be called with interrupts disabled. 
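
To make the mk_esid_data() packing above concrete: for a 256 MB segment the ESID keeps everything above bit 28, SLB_ESID_V marks the entry valid, and the low bits carry the SLB slot index. A standalone recomputation, with the constants copied in for illustration:

#include <stdio.h>

#define ESID_MASK  0xfffffffff0000000UL /* 256 MB segments */
#define SLB_ESID_V 0x0000000008000000UL /* entry is valid  */

int main(void)
{
	unsigned long ea = 0xc000000012345678UL;
	unsigned long index = 1; /* KSTACK_INDEX */

	unsigned long esid_data = (ea & ESID_MASK) | SLB_ESID_V | index;

	printf("esid_data = 0x%016lx\n", esid_data); /* 0xc000000018000001 */
	return 0;
}
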
- */ -void slb_flush_and_restore_bolted(void) -{ - struct slb_shadow *p = get_slb_shadow(); - - BUILD_BUG_ON(SLB_NUM_BOLTED != 2); - - WARN_ON(!irqs_disabled()); - - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - - asm volatile("isync\n" - "slbia\n" - "slbmte %0, %1\n" - "isync\n" - :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), - "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) - : "memory"); - assert_slb_presence(true, get_paca()->kstack); - - get_paca()->slb_cache_ptr = 0; - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; -} - -void slb_save_contents(struct slb_entry *slb_ptr) -{ - int i; - unsigned long e, v; - - /* Save slb_cache_ptr value. */ - get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr; - - if (!slb_ptr) - return; - - for (i = 0; i < mmu_slb_size; i++) { - asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i)); - asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i)); - slb_ptr->esid = e; - slb_ptr->vsid = v; - slb_ptr++; - } -} - -void slb_dump_contents(struct slb_entry *slb_ptr) -{ - int i, n; - unsigned long e, v; - unsigned long llp; - - if (!slb_ptr) - return; - - pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); - pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr); - - for (i = 0; i < mmu_slb_size; i++) { - e = slb_ptr->esid; - v = slb_ptr->vsid; - slb_ptr++; - - if (!e && !v) - continue; - - pr_err("%02d %016lx %016lx\n", i, e, v); - - if (!(e & SLB_ESID_V)) { - pr_err("\n"); - continue; - } - llp = v & SLB_VSID_LLP; - if (v & SLB_VSID_B_1T) { - pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n", - GET_ESID_1T(e), - (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp); - } else { - pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n", - GET_ESID(e), - (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp); - } - } - pr_err("----------------------------------\n"); - - /* Dump slb cache entires as well. */ - pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr); - pr_err("Valid SLB cache entries:\n"); - n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES); - for (i = 0; i < n; i++) - pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); - pr_err("Rest of SLB cache entries:\n"); - for (i = n; i < SLB_CACHE_ENTRIES; i++) - pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); -} - -void slb_vmalloc_update(void) -{ - /* - * vmalloc is not bolted, so just have to flush non-bolted. 
- */ - slb_flush_and_restore_bolted(); -} - -static bool preload_hit(struct thread_info *ti, unsigned long esid) -{ - unsigned char i; - - for (i = 0; i < ti->slb_preload_nr; i++) { - unsigned char idx; - - idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; - if (esid == ti->slb_preload_esid[idx]) - return true; - } - return false; -} - -static bool preload_add(struct thread_info *ti, unsigned long ea) -{ - unsigned char idx; - unsigned long esid; - - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { - /* EAs are stored >> 28 so 256MB segments don't need clearing */ - if (ea & ESID_MASK_1T) - ea &= ESID_MASK_1T; - } - - esid = ea >> SID_SHIFT; - - if (preload_hit(ti, esid)) - return false; - - idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; - ti->slb_preload_esid[idx] = esid; - if (ti->slb_preload_nr == SLB_PRELOAD_NR) - ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; - else - ti->slb_preload_nr++; - - return true; -} - -static void preload_age(struct thread_info *ti) -{ - if (!ti->slb_preload_nr) - return; - ti->slb_preload_nr--; - ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; -} - -void slb_setup_new_exec(void) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long exec = 0x10000000; - - WARN_ON(irqs_disabled()); - - /* - * preload cache can only be used to determine whether a SLB - * entry exists if it does not start to overflow. - */ - if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR) - return; - - hard_irq_disable(); - - /* - * We have no good place to clear the slb preload cache on exec, - * flush_thread is about the earliest arch hook but that happens - * after we switch to the mm and have aleady preloaded the SLBEs. - * - * For the most part that's probably okay to use entries from the - * previous exec, they will age out if unused. It may turn out to - * be an advantage to clear the cache before switching to it, - * however. - */ - - /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. - */ - if (!is_kernel_addr(exec)) { - if (preload_add(ti, exec)) - slb_allocate_user(mm, exec); - } - - /* Libraries and mmaps. */ - if (!is_kernel_addr(mm->mmap_base)) { - if (preload_add(ti, mm->mmap_base)) - slb_allocate_user(mm, mm->mmap_base); - } - - /* see switch_slb */ - asm volatile("isync" : : : "memory"); - - local_irq_enable(); -} - -void preload_new_slb_context(unsigned long start, unsigned long sp) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long heap = mm->start_brk; - - WARN_ON(irqs_disabled()); - - /* see above */ - if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR) - return; - - hard_irq_disable(); - - /* Userspace entry address. */ - if (!is_kernel_addr(start)) { - if (preload_add(ti, start)) - slb_allocate_user(mm, start); - } - - /* Top of stack, grows down. */ - if (!is_kernel_addr(sp)) { - if (preload_add(ti, sp)) - slb_allocate_user(mm, sp); - } - - /* Bottom of heap, grows up. */ - if (heap && !is_kernel_addr(heap)) { - if (preload_add(ti, heap)) - slb_allocate_user(mm, heap); - } - - /* see switch_slb */ - asm volatile("isync" : : : "memory"); - - local_irq_enable(); -} - - -/* Flush all user entries from the segment table of the current processor. 
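
The preload bookkeeping above is a small ring buffer: a fixed array of ESIDs plus a tail index and a count, with modular arithmetic handling wrap-around. A minimal userspace rendering of preload_add(); the size of 16 is an assumption for the sketch:

#include <stdbool.h>
#include <stdio.h>

#define SLB_PRELOAD_NR 16

struct preload {
	unsigned long esid[SLB_PRELOAD_NR];
	unsigned char tail, nr;
};

static bool preload_add(struct preload *p, unsigned long esid)
{
	for (unsigned char i = 0; i < p->nr; i++)
		if (p->esid[(p->tail + i) % SLB_PRELOAD_NR] == esid)
			return false; /* already cached */

	p->esid[(p->tail + p->nr) % SLB_PRELOAD_NR] = esid;
	if (p->nr == SLB_PRELOAD_NR)
		p->tail = (p->tail + 1) % SLB_PRELOAD_NR; /* drop oldest */
	else
		p->nr++;
	return true;
}

int main(void)
{
	struct preload p = { .tail = 0, .nr = 0 };

	preload_add(&p, 0x10000000UL >> 28); /* typical executable base */
	printf("cached %u entries\n", p.nr);
	return 0;
}
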
*/ -void switch_slb(struct task_struct *tsk, struct mm_struct *mm) -{ - struct thread_info *ti = task_thread_info(tsk); - unsigned char i; - - /* - * We need interrupts hard-disabled here, not just soft-disabled, - * so that a PMU interrupt can't occur, which might try to access - * user memory (to get a stack trace) and possible cause an SLB miss - * which would update the slb_cache/slb_cache_ptr fields in the PACA. - */ - hard_irq_disable(); - asm volatile("isync" : : : "memory"); - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - /* - * SLBIA IH=3 invalidates all Class=1 SLBEs and their - * associated lookaside structures, which matches what - * switch_slb wants. So ARCH_300 does not use the slb - * cache. - */ - asm volatile(PPC_SLBIA(3)); - } else { - unsigned long offset = get_paca()->slb_cache_ptr; - - if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && - offset <= SLB_CACHE_ENTRIES) { - unsigned long slbie_data = 0; - - for (i = 0; i < offset; i++) { - unsigned long ea; - - ea = (unsigned long) - get_paca()->slb_cache[i] << SID_SHIFT; - /* - * Could assert_slb_presence(true) here, but - * hypervisor or machine check could have come - * in and removed the entry at this point. - */ - - slbie_data = ea; - slbie_data |= user_segment_size(slbie_data) - << SLBIE_SSIZE_SHIFT; - slbie_data |= SLBIE_C; /* user slbs have C=1 */ - asm volatile("slbie %0" : : "r" (slbie_data)); - } - - /* Workaround POWER5 < DD2.1 issue */ - if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) - asm volatile("slbie %0" : : "r" (slbie_data)); - - } else { - struct slb_shadow *p = get_slb_shadow(); - unsigned long ksp_esid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].esid); - unsigned long ksp_vsid_data = - be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); - - asm volatile(PPC_SLBIA(1) "\n" - "slbmte %0,%1\n" - "isync" - :: "r"(ksp_vsid_data), - "r"(ksp_esid_data)); - - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - } - - get_paca()->slb_cache_ptr = 0; - } - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; - - copy_mm_to_paca(mm); - - /* - * We gradually age out SLBs after a number of context switches to - * reduce reload overhead of unused entries (like we do with FP/VEC - * reload). Each time we wrap 256 switches, take an entry out of the - * SLB preload cache. - */ - tsk->thread.load_slb++; - if (!tsk->thread.load_slb) { - unsigned long pc = KSTK_EIP(tsk); - - preload_age(ti); - preload_add(ti, pc); - } - - for (i = 0; i < ti->slb_preload_nr; i++) { - unsigned char idx; - unsigned long ea; - - idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; - ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; - - slb_allocate_user(mm, ea); - } - - /* - * Synchronize slbmte preloads with possible subsequent user memory - * address accesses by the kernel (user mode won't happen until - * rfid, which is safe). 
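
A standalone expansion of the slbie operand assembled in the invalidation loop above: the cached ESID (stored pre-shifted right by SID_SHIFT) is shifted back up, the segment-size field lands at bit 25, and C=1 tags the entry as user class. The constants are reproduced here for the example and are believed to mirror the kernel's definitions:

#include <stdio.h>

#define SID_SHIFT         28
#define SLBIE_SSIZE_SHIFT 25
#define SLBIE_C           0x08000000UL /* user entries have C=1 */

int main(void)
{
	unsigned int cached = 0x100; /* slb_cache[] entry: EA >> 28 */
	unsigned long ssize = 1;     /* MMU_SEGSIZE_1T */

	unsigned long slbie_data = (unsigned long)cached << SID_SHIFT;

	slbie_data |= ssize << SLBIE_SSIZE_SHIFT;
	slbie_data |= SLBIE_C;

	printf("slbie_data = 0x%016lx\n", slbie_data);
	return 0;
}
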
- */ - asm volatile("isync" : : : "memory"); -} - -void slb_set_size(u16 size) -{ - mmu_slb_size = size; -} - -void slb_initialize(void) -{ - unsigned long linear_llp, vmalloc_llp, io_llp; - unsigned long lflags; - static int slb_encoding_inited; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - unsigned long vmemmap_llp; -#endif - - /* Prepare our SLB miss handler based on our page size */ - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; - io_llp = mmu_psize_defs[mmu_io_psize].sllp; - vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; - get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; -#ifdef CONFIG_SPARSEMEM_VMEMMAP - vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp; -#endif - if (!slb_encoding_inited) { - slb_encoding_inited = 1; - pr_devel("SLB: linear LLP = %04lx\n", linear_llp); - pr_devel("SLB: io LLP = %04lx\n", io_llp); -#ifdef CONFIG_SPARSEMEM_VMEMMAP - pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); -#endif - } - - get_paca()->stab_rr = SLB_NUM_BOLTED - 1; - get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; - get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; - - lflags = SLB_VSID_KERNEL | linear_llp; - - /* Invalidate the entire SLB (even entry 0) & all the ERATS */ - asm volatile("isync":::"memory"); - asm volatile("slbmte %0,%0"::"r" (0) : "memory"); - asm volatile("isync; slbia; isync":::"memory"); - create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); - - /* For the boot cpu, we're running on the stack in init_thread_union, - * which is in the first segment of the linear mapping, and also - * get_paca()->kstack hasn't been initialized yet. - * For secondary cpus, we need to bolt the kernel stack entry now. - */ - slb_shadow_clear(KSTACK_INDEX); - if (raw_smp_processor_id() != boot_cpuid && - (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET) - create_shadowed_slbe(get_paca()->kstack, - mmu_kernel_ssize, lflags, KSTACK_INDEX); - - asm volatile("isync":::"memory"); -} - -static void slb_cache_update(unsigned long esid_data) -{ - int slb_cache_index; - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - return; /* ISAv3.0B and later does not use slb_cache */ - - /* - * Now update slb cache entries - */ - slb_cache_index = local_paca->slb_cache_ptr; - if (slb_cache_index < SLB_CACHE_ENTRIES) { - /* - * We have space in slb cache for optimized switch_slb(). - * Top 36 bits from esid_data as per ISA - */ - local_paca->slb_cache[slb_cache_index++] = esid_data >> 28; - local_paca->slb_cache_ptr++; - } else { - /* - * Our cache is full and the current cache content strictly - * doesn't indicate the active SLB conents. Bump the ptr - * so that switch_slb() will ignore the cache. - */ - local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; - } -} - -static enum slb_index alloc_slb_index(bool kernel) -{ - enum slb_index index; - - /* - * The allocation bitmaps can become out of synch with the SLB - * when the _switch code does slbie when bolting a new stack - * segment and it must not be anywhere else in the SLB. This leaves - * a kernel allocated entry that is unused in the SLB. With very - * large systems or small segment sizes, the bitmaps could slowly - * fill with these entries. They will eventually be cleared out - * by the round robin allocator in that case, so it's probably not - * worth accounting for. - */ - - /* - * SLBs beyond 32 entries are allocated with stab_rr only - * POWER7/8/9 have 32 SLB entries, this could be expanded if a - * future CPU has more. 
- */ - if (local_paca->slb_used_bitmap != U32_MAX) { - index = ffz(local_paca->slb_used_bitmap); - local_paca->slb_used_bitmap |= 1U << index; - if (kernel) - local_paca->slb_kern_bitmap |= 1U << index; - } else { - /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ - index = local_paca->stab_rr; - if (index < (mmu_slb_size - 1)) - index++; - else - index = SLB_NUM_BOLTED; - local_paca->stab_rr = index; - if (index < 32) { - if (kernel) - local_paca->slb_kern_bitmap |= 1U << index; - else - local_paca->slb_kern_bitmap &= ~(1U << index); - } - } - BUG_ON(index < SLB_NUM_BOLTED); - - return index; -} - -static long slb_insert_entry(unsigned long ea, unsigned long context, - unsigned long flags, int ssize, bool kernel) -{ - unsigned long vsid; - unsigned long vsid_data, esid_data; - enum slb_index index; - - vsid = get_vsid(context, ea, ssize); - if (!vsid) - return -EFAULT; - - /* - * There must not be a kernel SLB fault in alloc_slb_index or before - * slbmte here or the allocation bitmaps could get out of whack with - * the SLB. - * - * User SLB faults or preloads take this path which might get inlined - * into the caller, so add compiler barriers here to ensure unsafe - * memory accesses do not come between. - */ - barrier(); - - index = alloc_slb_index(kernel); - - vsid_data = __mk_vsid_data(vsid, ssize, flags); - esid_data = mk_esid_data(ea, ssize, index); - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - * User preloads should add isync afterwards in case the kernel - * accesses user memory before it returns to userspace with rfid. - */ - assert_slb_presence(false, ea); - asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); - - barrier(); - - if (!kernel) - slb_cache_update(esid_data); - - return 0; -} - -static long slb_allocate_kernel(unsigned long ea, unsigned long id) -{ - unsigned long context; - unsigned long flags; - int ssize; - - if (id == LINEAR_MAP_REGION_ID) { - - /* We only support upto MAX_PHYSMEM_BITS */ - if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) - return -EFAULT; - - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - } else if (id == VMEMMAP_REGION_ID) { - - if (ea >= H_VMEMMAP_END) - return -EFAULT; - - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; -#endif - } else if (id == VMALLOC_REGION_ID) { - - if (ea >= H_VMALLOC_END) - return -EFAULT; - - flags = local_paca->vmalloc_sllp; - - } else if (id == IO_REGION_ID) { - - if (ea >= H_KERN_IO_END) - return -EFAULT; - - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; - - } else { - return -EFAULT; - } - - ssize = MMU_SEGSIZE_1T; - if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) - ssize = MMU_SEGSIZE_256M; - - context = get_kernel_context(ea); - - return slb_insert_entry(ea, context, flags, ssize, true); -} - -static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) -{ - unsigned long context; - unsigned long flags; - int bpsize; - int ssize; - - /* - * consider this as bad access if we take a SLB miss - * on an address above addr limit. 
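
Pulling the two allocation phases of alloc_slb_index() above into a standalone approximation: find-first-zero on the 32-bit used bitmap while free slots remain, then round-robin replacement that wraps back to the first non-bolted slot. The sizes here are assumptions for the sketch:

#include <stdio.h>
#include <stdint.h>

#define SLB_NUM_BOLTED 2
#define SLB_SIZE       32

static unsigned int alloc_slot(uint32_t *used, unsigned int *rr)
{
	unsigned int index;

	if (*used != UINT32_MAX) {
		index = (unsigned int)__builtin_ctz(~*used); /* first free */
		*used |= 1U << index;
	} else {
		/* advance round-robin, skipping back over the bolted slots */
		index = (*rr < SLB_SIZE - 1) ? *rr + 1 : SLB_NUM_BOLTED;
		*rr = index;
	}
	return index;
}

int main(void)
{
	uint32_t used = (1U << SLB_NUM_BOLTED) - 1; /* bolted slots in use */
	unsigned int rr = SLB_NUM_BOLTED;

	printf("first dynamic slot = %u\n", alloc_slot(&used, &rr)); /* 2 */
	return 0;
}
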
- */ - if (ea >= mm_ctx_slb_addr_limit(&mm->context)) - return -EFAULT; - - context = get_user_context(&mm->context, ea); - if (!context) - return -EFAULT; - - if (unlikely(ea >= H_PGTABLE_RANGE)) { - WARN_ON(1); - return -EFAULT; - } - - ssize = user_segment_size(ea); - - bpsize = get_slice_psize(mm, ea); - flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; - - return slb_insert_entry(ea, context, flags, ssize, false); -} - -long do_slb_fault(struct pt_regs *regs, unsigned long ea) -{ - unsigned long id = get_region_id(ea); - - /* IRQs are not reconciled here, so can't check irqs_disabled */ - VM_WARN_ON(mfmsr() & MSR_EE); - - if (unlikely(!(regs->msr & MSR_RI))) - return -EINVAL; - - /* - * SLB kernel faults must be very careful not to touch anything - * that is not bolted. E.g., PACA and global variables are okay, - * mm->context stuff is not. - * - * SLB user faults can access all of kernel memory, but must be - * careful not to touch things like IRQ state because it is not - * "reconciled" here. The difficulty is that we must use - * fast_exception_return to return from kernel SLB faults without - * looking at possible non-bolted memory. We could test user vs - * kernel faults in the interrupt handler asm and do a full fault, - * reconcile, ret_from_except for user faults which would make them - * first class kernel code. But for performance it's probably nicer - * if they go via fast_exception_return too. - */ - if (id >= LINEAR_MAP_REGION_ID) { - long err; -#ifdef CONFIG_DEBUG_VM - /* Catch recursive kernel SLB faults. */ - BUG_ON(local_paca->in_kernel_slb_handler); - local_paca->in_kernel_slb_handler = 1; -#endif - err = slb_allocate_kernel(ea, id); -#ifdef CONFIG_DEBUG_VM - local_paca->in_kernel_slb_handler = 0; -#endif - return err; - } else { - struct mm_struct *mm = current->mm; - long err; - - if (unlikely(!mm)) - return -EFAULT; - - err = slb_allocate_user(mm, ea); - if (!err) - preload_add(current_thread_info(), ea); - - return err; - } -} - -void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) -{ - if (err == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, ea); - else - bad_page_fault(regs, ea, SIGSEGV); - } else if (err == -EINVAL) { - unrecoverable_exception(regs); - } else { - BUG(); - } -} diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c deleted file mode 100644 index 473dd430e306..000000000000 --- a/arch/powerpc/mm/subpage-prot.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright 2007-2008 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* - * Free all pages allocated for subpage protection maps and pointers. - * Also makes sure that the subpage_prot_table structure is - * reinitialized for the next user. 
- */ -void subpage_prot_free(struct mm_struct *mm) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - unsigned long i, j, addr; - u32 **p; - - if (!spt) - return; - - for (i = 0; i < 4; ++i) { - if (spt->low_prot[i]) { - free_page((unsigned long)spt->low_prot[i]); - spt->low_prot[i] = NULL; - } - } - addr = 0; - for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) { - p = spt->protptrs[i]; - if (!p) - continue; - spt->protptrs[i] = NULL; - for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr; - ++j, addr += PAGE_SIZE) - if (p[j]) - free_page((unsigned long)p[j]); - free_page((unsigned long)p); - } - spt->maxaddr = 0; - kfree(spt); -} - -static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, - int npages) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - return; - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) - return; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - arch_enter_lazy_mmu_mode(); - for (; npages > 0; --npages) { - pte_update(mm, addr, pte, 0, 0, 0); - addr += PAGE_SIZE; - ++pte; - } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); -} - -/* - * Clear the subpage protection map for an address range, allowing - * all accesses that are allowed by the pte permissions. - */ -static void subpage_prot_clear(unsigned long addr, unsigned long len) -{ - struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt; - u32 **spm, *spp; - unsigned long i; - size_t nw; - unsigned long next, limit; - - down_write(&mm->mmap_sem); - - spt = mm_ctx_subpage_prot(&mm->context); - if (!spt) - goto err_out; - - limit = addr + len; - if (limit > spt->maxaddr) - limit = spt->maxaddr; - for (; addr < limit; addr = next) { - next = pmd_addr_end(addr, limit); - if (addr < 0x100000000UL) { - spm = spt->low_prot; - } else { - spm = spt->protptrs[addr >> SBP_L3_SHIFT]; - if (!spm) - continue; - } - spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; - if (!spp) - continue; - spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); - - i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); - nw = PTRS_PER_PTE - i; - if (addr + (nw << PAGE_SHIFT) > next) - nw = (next - addr) >> PAGE_SHIFT; - - memset(spp, 0, nw * sizeof(u32)); - - /* now flush any existing HPTEs for the range */ - hpte_flush_range(mm, addr, nw); - } - -err_out: - up_write(&mm->mmap_sem); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct vm_area_struct *vma = walk->vma; - split_huge_pmd(vma, pmd, addr); - return 0; -} - -static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; - - /* - * We don't try too hard, we just mark all the vma in that range - * VM_NOHUGEPAGE and split them. 
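
For orientation before the map-walking code below: each 64 KB page is described by one u32 holding sixteen 2-bit fields, one per 4 KB subpage (0 allows everything, 1 blocks writes, 2 and 3 block all access). A decode sketch, assuming subpage 0 occupies the most significant field as in the hash fault path; the map value is invented:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned long ea = 0x10003000UL; /* some user address  */
	uint32_t map_word = 0x01000000;  /* hypothetical entry */

	unsigned int subpage = (ea >> 12) & 0xf; /* 4 KB index within 64 KB */
	unsigned int prot = (map_word >> (30 - 2 * subpage)) & 0x3;

	printf("subpage %u -> prot %u\n", subpage, prot); /* subpage 3 -> 1 */
	return 0;
}
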
- */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; - vma->vm_flags |= VM_NOHUGEPAGE; - walk_page_vma(vma, &subpage_proto_walk); - vma = vma->vm_next; - } -} -#else -static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, - unsigned long len) -{ - return; -} -#endif - -/* - * Copy in a subpage protection map for an address range. - * The map has 2 bits per 4k subpage, so 32 bits per 64k page. - * Each 2-bit field is 0 to allow any access, 1 to prevent writes, - * 2 or 3 to prevent all accesses. - * Note that the normal page protections also apply; the subpage - * protection mechanism is an additional constraint, so putting 0 - * in a 2-bit field won't allow writes to a page that is otherwise - * write-protected. - */ -SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, - unsigned long, len, u32 __user *, map) -{ - struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt; - u32 **spm, *spp; - unsigned long i; - size_t nw; - unsigned long next, limit; - int err; - - if (radix_enabled()) - return -ENOENT; - - /* Check parameters */ - if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || - addr >= mm->task_size || len >= mm->task_size || - addr + len > mm->task_size) - return -EINVAL; - - if (is_hugepage_only_range(mm, addr, len)) - return -EINVAL; - - if (!map) { - /* Clear out the protection map for the address range */ - subpage_prot_clear(addr, len); - return 0; - } - - if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32))) - return -EFAULT; - - down_write(&mm->mmap_sem); - - spt = mm_ctx_subpage_prot(&mm->context); - if (!spt) { - /* - * Allocate subpage prot table if not already done. - * Do this with mmap_sem held - */ - spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); - if (!spt) { - err = -ENOMEM; - goto out; - } - mm->context.hash_context->spt = spt; - } - - subpage_mark_vma_nohuge(mm, addr, len); - for (limit = addr + len; addr < limit; addr = next) { - next = pmd_addr_end(addr, limit); - err = -ENOMEM; - if (addr < 0x100000000UL) { - spm = spt->low_prot; - } else { - spm = spt->protptrs[addr >> SBP_L3_SHIFT]; - if (!spm) { - spm = (u32 **)get_zeroed_page(GFP_KERNEL); - if (!spm) - goto out; - spt->protptrs[addr >> SBP_L3_SHIFT] = spm; - } - } - spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1); - spp = *spm; - if (!spp) { - spp = (u32 *)get_zeroed_page(GFP_KERNEL); - if (!spp) - goto out; - *spm = spp; - } - spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); - - local_irq_disable(); - demote_segment_4k(mm, addr); - local_irq_enable(); - - i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); - nw = PTRS_PER_PTE - i; - if (addr + (nw << PAGE_SHIFT) > next) - nw = (next - addr) >> PAGE_SHIFT; - - up_write(&mm->mmap_sem); - if (__copy_from_user(spp, map, nw * sizeof(u32))) - return -EFAULT; - map += nw; - down_write(&mm->mmap_sem); - - /* now flush any existing HPTEs for the range */ - hpte_flush_range(mm, addr, nw); - } - if (limit > spt->maxaddr) - spt->maxaddr = limit; - err = 0; - out: - up_write(&mm->mmap_sem); - return err; -} diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c deleted file mode 100644 index 6a23b9ebd2a1..000000000000 --- a/arch/powerpc/mm/tlb-radix.c +++ /dev/null @@ -1,1101 +0,0 @@ -/* - * TLB flush routines for radix kernels. - * - * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define RIC_FLUSH_TLB 0 -#define RIC_FLUSH_PWC 1 -#define RIC_FLUSH_ALL 2 - -/* - * tlbiel instruction for radix, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) -{ - unsigned long rb; - unsigned long rs; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) - : "memory"); -} - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and the entire Page Walk Cache - * and partition table entries. Then flush the remaining sets of the - * TLB. - */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); - - /* Do the same for process scoped entries. */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); - - asm volatile("ptesync": : :"memory"); -} - -void radix__tlbiel_all(unsigned int action) -{ - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is); - else - WARN(1, "%s called on pre-POWER9 CPU\n", __func__); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void __tlbiel_pid(unsigned long pid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rb |= set << PPC_BITLSHIFT(51); - rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_pid(unsigned long pid, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbiel_lpid(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rs = lpid; - prs = 0; /* partition 
scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - - -static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 1, rb, rs, ric, prs, r); -} - -static inline void __tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); -} - -static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long ap, unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = va & ~(PPC_BITMASK(52, 63)); - rb |= ap << PPC_BITLSHIFT(58); - rs = lpid; - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 0, rb, rs, ric, prs, r); -} - -static inline void fixup_tlbie(void) -{ - unsigned long pid = 0; - unsigned long va = ((1UL << 52) - 1); - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - asm volatile("ptesync": : :"memory"); - __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); - } -} - -static inline void fixup_tlbie_lpid(unsigned long lpid) -{ - unsigned long va = ((1UL << 52) - 1); - - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - asm volatile("ptesync": : :"memory"); - __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB); - } -} - -/* - * We use 128 set in radix mode and 256 set in hpt mode. - */ -static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) -{ - int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. 
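
All of the rb/rs packing in the tlbiel/tlbie helpers above follows IBM (MSB-first) bit numbering; PPC_BITLSHIFT(n) converts an IBM bit position into an ordinary left-shift count. A standalone recomputation for an IS=1, set 5, PID 0x42 process-scoped invalidate:

#include <stdio.h>

#define PPC_BITLSHIFT(be) (63UL - (be))
#define PPC_BIT(bit)      (1UL << PPC_BITLSHIFT(bit))

int main(void)
{
	unsigned long set = 5, pid = 0x42;

	unsigned long rb = PPC_BIT(53) | (set << PPC_BITLSHIFT(51)); /* IS=1 */
	unsigned long rs = pid << PPC_BITLSHIFT(31);

	printf("rb = 0x%016lx\n", rb); /* 0x0000000000005400 */
	printf("rs = 0x%016lx\n", rs); /* 0x0000004200000000 */
	return 0;
}
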
- */ - __tlbiel_pid(pid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_pid(pid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void _tlbie_pid(unsigned long pid, unsigned long ric) -{ - asm volatile("ptesync": : :"memory"); - - /* - * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint - * in the asm statement. - */ - switch (ric) { - case RIC_FLUSH_TLB: - __tlbie_pid(pid, RIC_FLUSH_TLB); - break; - case RIC_FLUSH_PWC: - __tlbie_pid(pid, RIC_FLUSH_PWC); - break; - case RIC_FLUSH_ALL: - default: - __tlbie_pid(pid, RIC_FLUSH_ALL); - } - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) -{ - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. - */ - __tlbiel_lpid(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) -{ - asm volatile("ptesync": : :"memory"); - - /* - * Workaround the fact that the "ric" argument to __tlbie_pid - * must be a compile-time contraint to match the "i" constraint - * in the asm statement. - */ - switch (ric) { - case RIC_FLUSH_TLB: - __tlbie_lpid(lpid, RIC_FLUSH_TLB); - break; - case RIC_FLUSH_PWC: - __tlbie_lpid(lpid, RIC_FLUSH_PWC); - break; - case RIC_FLUSH_ALL: - default: - __tlbie_lpid(lpid, RIC_FLUSH_ALL); - } - fixup_tlbie_lpid(lpid); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) -{ - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. 
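
The set-by-set pattern shared by _tlbiel_pid() and friends above, reduced to a hedged standalone sketch: one RIC_FLUSH_ALL (or RIC_FLUSH_PWC) operation on set 0 also covers the Page Walk Cache, so the remaining sets only need plain TLB flushes. tlbiel_op() is a stand-in for the PPC_TLBIEL inline asm:

#include <stdio.h>

#define POWER9_TLB_SETS_RADIX 128

enum { RIC_FLUSH_TLB, RIC_FLUSH_PWC, RIC_FLUSH_ALL };

static void tlbiel_op(unsigned long pid, int set, int ric)
{
	(void)pid; (void)set; (void)ric; /* placeholder for the inline asm */
}

static void tlbiel_pid(unsigned long pid, int ric)
{
	tlbiel_op(pid, 0, ric); /* set 0 flushes the PWC too */

	if (ric == RIC_FLUSH_PWC)
		return;         /* PWC is done in a single op */

	for (int set = 1; set < POWER9_TLB_SETS_RADIX; set++)
		tlbiel_op(pid, set, RIC_FLUSH_TLB);
}

int main(void)
{
	tlbiel_pid(0x42, RIC_FLUSH_ALL);
	puts("flushed all 128 radix sets");
	return 0;
}
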
- */ - __tlbiel_lpid_guest(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); -} - - -static inline void __tlbiel_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize) -{ - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - - for (addr = start; addr < end; addr += page_size) - __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); -} - -static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbiel_va(va, pid, ap, ric); - asm volatile("ptesync": : :"memory"); -} - -static inline void _tlbiel_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize, bool also_pwc) -{ - asm volatile("ptesync": : :"memory"); - if (also_pwc) - __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); - __tlbiel_va_range(start, end, pid, page_size, psize); - asm volatile("ptesync": : :"memory"); -} - -static inline void __tlbie_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize) -{ - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - - for (addr = start; addr < end; addr += page_size) - __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); -} - -static inline void _tlbie_va(unsigned long va, unsigned long pid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbie_va(va, pid, ap, ric); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, - unsigned long psize, unsigned long ric) -{ - unsigned long ap = mmu_get_ap(psize); - - asm volatile("ptesync": : :"memory"); - __tlbie_lpid_va(va, lpid, ap, ric); - fixup_tlbie_lpid(lpid); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -static inline void _tlbie_va_range(unsigned long start, unsigned long end, - unsigned long pid, unsigned long page_size, - unsigned long psize, bool also_pwc) -{ - asm volatile("ptesync": : :"memory"); - if (also_pwc) - __tlbie_pid(pid, RIC_FLUSH_PWC); - __tlbie_va_range(start, end, pid, page_size, psize); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -/* - * Base TLB flushing operations: - * - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes kernel pages - * - * - local_* variants of page and mm only apply to the current - * processor - */ -void radix__local_flush_tlb_mm(struct mm_struct *mm) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_TLB); - preempt_enable(); -} -EXPORT_SYMBOL(radix__local_flush_tlb_mm); - -#ifndef CONFIG_SMP -void radix__local_flush_all_mm(struct mm_struct *mm) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); - 
preempt_enable(); -} -EXPORT_SYMBOL(radix__local_flush_all_mm); -#endif /* CONFIG_SMP */ - -void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, - int psize) -{ - unsigned long pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - preempt_enable(); -} - -void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - /* need the return fix for nohash.c */ - if (is_vm_hugetlb_page(vma)) - return radix__local_flush_hugetlb_page(vma, vmaddr); -#endif - radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); -} -EXPORT_SYMBOL(radix__local_flush_tlb_page); - -static bool mm_is_singlethreaded(struct mm_struct *mm) -{ - if (atomic_read(&mm->context.copros) > 0) - return false; - if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm) - return true; - return false; -} - -static bool mm_needs_flush_escalation(struct mm_struct *mm) -{ - /* - * P9 nest MMU has issues with the page walk cache - * caching PTEs and not flushing them properly when - * RIC = 0 for a PID/LPID invalidate - */ - if (atomic_read(&mm->context.copros) > 0) - return true; - return false; -} - -#ifdef CONFIG_SMP -static void do_exit_flush_lazy_tlb(void *arg) -{ - struct mm_struct *mm = arg; - unsigned long pid = mm->context.id; - - if (current->mm == mm) - return; /* Local CPU */ - - if (current->active_mm == mm) { - /* - * Must be a kernel thread because sender is single-threaded. - */ - BUG_ON(current->mm); - mmgrab(&init_mm); - switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; - mmdrop(mm); - } - _tlbiel_pid(pid, RIC_FLUSH_ALL); -} - -static void exit_flush_lazy_tlbs(struct mm_struct *mm) -{ - /* - * Would be nice if this was async so it could be run in - * parallel with our local flush, but generic code does not - * give a good API for it. Could extend the generic code or - * make a special powerpc IPI for flushing TLBs. - * For now it's not too performance critical. - */ - smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb, - (void *)mm, 1); - mm_reset_thread_local(mm); -} - -void radix__flush_tlb_mm(struct mm_struct *mm) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - /* - * Order loads of mm_cpumask vs previous stores to clear ptes before - * the invalidate. 
See barrier in switch_mm_irqs_off - */ - smp_mb(); - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_TLB); - } - preempt_enable(); -} -EXPORT_SYMBOL(radix__flush_tlb_mm); - -static void __flush_all_mm(struct mm_struct *mm, bool fullmm) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (!fullmm) { - exit_flush_lazy_tlbs(mm); - goto local; - } - } - _tlbie_pid(pid, RIC_FLUSH_ALL); - } else { -local: - _tlbiel_pid(pid, RIC_FLUSH_ALL); - } - preempt_enable(); -} -void radix__flush_all_mm(struct mm_struct *mm) -{ - __flush_all_mm(mm, false); -} -EXPORT_SYMBOL(radix__flush_all_mm); - -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) -{ - tlb->need_flush_all = 1; -} -EXPORT_SYMBOL(radix__flush_tlb_pwc); - -void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, - int psize) -{ - unsigned long pid; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - } else { -local: - _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); - } - preempt_enable(); -} - -void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_page(vma, vmaddr); -#endif - radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); -} -EXPORT_SYMBOL(radix__flush_tlb_page); - -#else /* CONFIG_SMP */ -#define radix__flush_all_mm radix__local_flush_all_mm -#endif /* CONFIG_SMP */ - -void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - _tlbie_pid(0, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL(radix__flush_tlb_kernel_range); - -#define TLB_FLUSH_ALL -1UL - -/* - * Number of pages above which we invalidate the entire PID rather than - * flush individual pages, for local and global flushes respectively. - * - * tlbie goes out to the interconnect and individual ops are more costly. - * It also does not iterate over sets like the local tlbiel variant when - * invalidating a full PID, so it has a far lower threshold to change from - * individual page flushes to full-pid flushes. 
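
With the defaults that follow (33 for global tlbie, and POWER9_TLB_SETS_RADIX * 2 = 256 for local tlbiel since POWER9_TLB_SETS_RADIX is 128), the choice reduces to a simple comparison; a sketch of the check with those defaults inlined (illustrative, not part of the patch):

    full = (end == TLB_FLUSH_ALL ||
            nr_pages > (local ? 256 : 33));

For example, a 64-page unmap is flushed page by page when the mm is local to this CPU (64 <= 256), but escalates to a single full-PID tlbie when it is not (64 > 33).
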
- */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; - -static inline void __radix__flush_tlb_range(struct mm_struct *mm, - unsigned long start, unsigned long end, - bool flush_all_sizes) - -{ - unsigned long pid; - unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; - unsigned long page_size = 1UL << page_shift; - unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } - - if (full) { - if (local) { - _tlbiel_pid(pid, RIC_FLUSH_TLB); - } else { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); - } - } else { - bool hflush = flush_all_sizes; - bool gflush = flush_all_sizes; - unsigned long hstart, hend; - unsigned long gstart, gend; - - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - hflush = true; - - if (hflush) { - hstart = (start + PMD_SIZE - 1) & PMD_MASK; - hend = end & PMD_MASK; - if (hstart == hend) - hflush = false; - } - - if (gflush) { - gstart = (start + PUD_SIZE - 1) & PUD_MASK; - gend = end & PUD_MASK; - if (gstart == gend) - gflush = false; - } - - asm volatile("ptesync": : :"memory"); - if (local) { - __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); - if (hflush) - __tlbiel_va_range(hstart, hend, pid, - PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbiel_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); - asm volatile("ptesync": : :"memory"); - } else { - __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); - if (hflush) - __tlbie_va_range(hstart, hend, pid, - PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbie_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); - fixup_tlbie(); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); - } - } - preempt_enable(); -} - -void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) - -{ -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_tlb_range(vma, start, end); -#endif - - __radix__flush_tlb_range(vma->vm_mm, start, end, false); -} -EXPORT_SYMBOL(radix__flush_tlb_range); - -static int radix_get_mmu_psize(int page_size) -{ - int psize; - - if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift)) - psize = mmu_virtual_psize; - else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift)) - psize = MMU_PAGE_2M; - else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift)) - psize = MMU_PAGE_1G; - else - return -1; - return psize; -} - -/* - * Flush partition scoped LPID address translation for all CPUs. - */ -void radix__flush_tlb_lpid_page(unsigned int lpid, - unsigned long addr, - unsigned long page_size) -{ - int psize = radix_get_mmu_psize(page_size); - - _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB); -} -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page); - -/* - * Flush partition scoped PWC from LPID for all CPUs. 
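
The PMD/PUD rounding in __radix__flush_tlb_range() above deserves a worked example (assuming the usual radix geometry of PMD_SIZE = 2M and PUD_SIZE = 1G; the numbers are illustrative):

    /* start = 0x1300000 (19 MB), end = 0x4700000 (71 MB): */
    hstart = (start + PMD_SIZE - 1) & PMD_MASK;  /* 0x1400000, 20 MB */
    hend = end & PMD_MASK;                       /* 0x4600000, 70 MB */
    /*
     * The base-page loop covers [start, end) while a second pass covers
     * [hstart, hend) at 2M granularity, so any 2M leaf inside the range
     * is invalidated too; when hstart == hend, no aligned 2M block fits
     * and the 2M pass is skipped.
     */
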
- */ -void radix__flush_pwc_lpid(unsigned int lpid) -{ - _tlbie_lpid(lpid, RIC_FLUSH_PWC); -} -EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__flush_tlb_lpid(unsigned int lpid) -{ - _tlbie_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); - -/* - * Flush partition scoped translations from LPID (=LPIDR) - */ -void radix__local_flush_tlb_lpid(unsigned int lpid) -{ - _tlbiel_lpid(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); - -/* - * Flush process scoped translations from LPID (=LPIDR). - * Important difference, the guest normally manages its own translations, - * but some cases e.g., vCPU CPU migration require KVM to flush. - */ -void radix__local_flush_tlb_lpid_guest(unsigned int lpid) -{ - _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); - - -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize); - -void radix__tlb_flush(struct mmu_gather *tlb) -{ - int psize = 0; - struct mm_struct *mm = tlb->mm; - int page_size = tlb->page_size; - unsigned long start = tlb->start; - unsigned long end = tlb->end; - - /* - * if page size is not something we understand, do a full mm flush - * - * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush - * that flushes the process table entry cache upon process teardown. - * See the comment for radix in arch_exit_mmap(). - */ - if (tlb->fullmm) { - __flush_all_mm(mm, true); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) - } else if (mm_tlb_flush_nested(mm)) { - /* - * If there is a concurrent invalidation that is clearing ptes, - * then it's possible this invalidation will miss one of those - * cleared ptes and miss flushing the TLB. If this invalidate - * returns before the other one flushes TLBs, that can result - * in it returning while there are still valid TLBs inside the - * range to be invalidated. - * - * See mm/memory.c:tlb_finish_mmu() for more details. - * - * The solution to this is ensure the entire range is always - * flushed here. The problem for powerpc is that the flushes - * are page size specific, so this "forced flush" would not - * do the right thing if there are a mix of page sizes in - * the range to be invalidated. So use __flush_tlb_range - * which invalidates all possible page sizes in the range. - * - * PWC flush probably is not be required because the core code - * shouldn't free page tables in this path, but accounting - * for the possibility makes us a bit more robust. - * - * need_flush_all is an uncommon case because page table - * teardown should be done with exclusive locks held (but - * after locks are dropped another invalidate could come - * in), it could be optimized further if necessary. 
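
The resulting dispatch in radix__tlb_flush(), summarised as a sketch of the code that follows:

    /*
     *   tlb->fullmm            -> __flush_all_mm()             (RIC=ALL)
     *   mm_tlb_flush_nested()  -> flush the whole range at all
     *                             page sizes, or flush_all_mm()
     *                             when need_flush_all is set
     *   unrecognised page_size -> flush_tlb_mm() / flush_all_mm()
     *   otherwise              -> range flush at the gathered page
     *                             size, with the PWC if needed
     */
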
- */ - if (!tlb->need_flush_all) - __radix__flush_tlb_range(mm, start, end, true); - else - radix__flush_all_mm(mm); -#endif - } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { - if (!tlb->need_flush_all) - radix__flush_tlb_mm(mm); - else - radix__flush_all_mm(mm); - } else { - if (!tlb->need_flush_all) - radix__flush_tlb_range_psize(mm, start, end, psize); - else - radix__flush_tlb_pwc_range_psize(mm, start, end, psize); - } - tlb->need_flush_all = 0; -} - -static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, - unsigned long start, unsigned long end, - int psize, bool also_pwc) -{ - unsigned long pid; - unsigned int page_shift = mmu_psize_defs[psize].shift; - unsigned long page_size = 1UL << page_shift; - unsigned long nr_pages = (end - start) >> page_shift; - bool local, full; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - if (end != TLB_FLUSH_ALL) { - exit_flush_lazy_tlbs(mm); - goto is_local; - } - } - local = false; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_single_page_flush_ceiling); - } else { -is_local: - local = true; - full = (end == TLB_FLUSH_ALL || - nr_pages > tlb_local_single_page_flush_ceiling); - } - - if (full) { - if (local) { - _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); - } else { - if (mm_needs_flush_escalation(mm)) - also_pwc = true; - - _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); - } - } else { - if (local) - _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); - else - _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); - } - preempt_enable(); -} - -void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) -{ - return __radix__flush_tlb_range_psize(mm, start, end, psize, false); -} - -static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) -{ - __radix__flush_tlb_range_psize(mm, start, end, psize, true); -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) -{ - unsigned long pid, end; - - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - /* 4k page size, just blow the world */ - if (PAGE_SIZE == 0x1000) { - radix__flush_all_mm(mm); - return; - } - - end = addr + HPAGE_PMD_SIZE; - - /* Otherwise first do the PWC, then iterate the pages. 
*/ - preempt_disable(); - smp_mb(); /* see radix__flush_tlb_mm */ - if (!mm_is_thread_local(mm)) { - if (unlikely(mm_is_singlethreaded(mm))) { - exit_flush_lazy_tlbs(mm); - goto local; - } - _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - } else { -local: - _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - } - - preempt_enable(); -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M); -} -EXPORT_SYMBOL(radix__flush_pmd_tlb_range); - -void radix__flush_tlb_all(void) -{ - unsigned long rb,prs,r,rs; - unsigned long ric = RIC_FLUSH_ALL; - - rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */ - - asm volatile("ptesync": : :"memory"); - /* - * now flush guest entries by passing PRS = 1 and LPID != 0 - */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory"); - /* - * now flush host entires by passing PRS = 0 and LPID == 0 - */ - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory"); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); -} - -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) -{ - unsigned long pid = mm->context.id; - - if (unlikely(pid == MMU_NO_CONTEXT)) - return; - - /* - * If this context hasn't run on that CPU before and KVM is - * around, there's a slim chance that the guest on another - * CPU just brought in obsolete translation into the TLB of - * this CPU due to a bad prefetch using the guest PID on - * the way into the hypervisor. - * - * We work around this here. If KVM is possible, we check if - * any sibling thread is in KVM. If it is, the window may exist - * and thus we flush that PID from the core. - * - * A potential future improvement would be to mark which PIDs - * have never been used on the system and avoid it if the PID - * is new and the process has no other cpumask bit set. - */ - if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) { - int cpu = smp_processor_id(); - int sib = cpu_first_thread_sibling(cpu); - bool flush = false; - - for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { - if (sib == cpu) - continue; - if (!cpu_possible(sib)) - continue; - if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) - flush = true; - } - if (flush) - _tlbiel_pid(pid, RIC_FLUSH_ALL); - } -} -EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround); -#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c deleted file mode 100644 index 87d71dd25441..000000000000 --- a/arch/powerpc/mm/tlb_hash64.c +++ /dev/null @@ -1,259 +0,0 @@ -/* - * This file contains the routines for flushing entries from the - * TLB and MMU hash table. - * - * Derived from arch/ppc64/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * Dave Engebretsen - * Rework for PPC64 port. 
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-#include
-
-DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-
-/*
- * A linux PTE was changed and the corresponding hash table entry
- * needs to be flushed. This function will either perform the flush
- * immediately or will batch it up if the current CPU has an active
- * batch on it.
- */
-void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, unsigned long pte, int huge)
-{
-	unsigned long vpn;
-	struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
-	unsigned long vsid;
-	unsigned int psize;
-	int ssize;
-	real_pte_t rpte;
-	int i, offset;
-
-	i = batch->index;
-
-	/* Get page size (maybe move back to caller).
-	 *
-	 * NOTE: when using special 64K mappings in 4K environment like
-	 * for SPEs, we obtain the page size from the slice, which thus
-	 * must still exist (and thus the VMA not reused) at the time
-	 * of this call
-	 */
-	if (huge) {
-#ifdef CONFIG_HUGETLB_PAGE
-		psize = get_slice_psize(mm, addr);
-		/* Mask the address for the correct page size */
-		addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
-		if (unlikely(psize == MMU_PAGE_16G))
-			offset = PTRS_PER_PUD;
-		else
-			offset = PTRS_PER_PMD;
-#else
-		BUG();
-		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
-#endif
-	} else {
-		psize = pte_pagesize_index(mm, addr, pte);
-		/* Mask the address for the standard page size. If we
-		 * have a 64k page kernel, but the hardware does not
-		 * support 64k pages, this might be different from the
-		 * hardware page size encoded in the slice table. */
-		addr &= PAGE_MASK;
-		offset = PTRS_PER_PTE;
-	}
-
-
-	/* Build full vaddr */
-	if (!is_kernel_addr(addr)) {
-		ssize = user_segment_size(addr);
-		vsid = get_user_vsid(&mm->context, addr, ssize);
-	} else {
-		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
-		ssize = mmu_kernel_ssize;
-	}
-	WARN_ON(vsid == 0);
-	vpn = hpt_vpn(addr, vsid, ssize);
-	rpte = __real_pte(__pte(pte), ptep, offset);
-
-	/*
-	 * Check if we have an active batch on this CPU. If not, just
-	 * flush now and return.
-	 */
-	if (!batch->active) {
-		flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm));
-		put_cpu_var(ppc64_tlb_batch);
-		return;
-	}
-
-	/*
-	 * This can happen when we are in the middle of a TLB batch and
-	 * we encounter memory pressure (eg copy_page_range when it tries
-	 * to allocate a new pte). If we have to reclaim memory and end
-	 * up scanning and resetting referenced bits then our batch context
-	 * will change mid stream.
-	 *
-	 * We also need to ensure only one page size is present in a given
-	 * batch
-	 */
-	if (i != 0 && (mm != batch->mm || batch->psize != psize ||
-		       batch->ssize != ssize)) {
-		__flush_tlb_pending(batch);
-		i = 0;
-	}
-	if (i == 0) {
-		batch->mm = mm;
-		batch->psize = psize;
-		batch->ssize = ssize;
-	}
-	batch->pte[i] = rpte;
-	batch->vpn[i] = vpn;
-	batch->index = ++i;
-	if (i >= PPC64_TLB_BATCH_NR)
-		__flush_tlb_pending(batch);
-	put_cpu_var(ppc64_tlb_batch);
-}
-
-/*
- * This function is called when terminating an mmu batch or when a batch
- * is full. It will perform the flush of all the entries currently stored
- * in a batch.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
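
Condensed, the batching protocol implemented above is (sketch, not part of the patch):

    /*
     *	batch = &get_cpu_var(ppc64_tlb_batch);
     *	if (!batch->active)              -> flush_hash_page() right away
     *	if (mm, psize or ssize changed)  -> __flush_tlb_pending(batch) first
     *	batch->pte[i] = rpte; batch->vpn[i] = vpn;
     *	if (++i >= PPC64_TLB_BATCH_NR)   -> __flush_tlb_pending(batch)
     *	put_cpu_var(ppc64_tlb_batch);
     */
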
- */
-void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
-{
-	int i, local;
-
-	i = batch->index;
-	local = mm_is_thread_local(batch->mm);
-	if (i == 1)
-		flush_hash_page(batch->vpn[0], batch->pte[0],
-				batch->psize, batch->ssize, local);
-	else
-		flush_hash_range(i, local);
-	batch->index = 0;
-}
-
-void hash__tlb_flush(struct mmu_gather *tlb)
-{
-	struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
-
-	/* If there's a TLB batch pending, then we must flush it because the
-	 * pages are going to be freed and we really don't want to have a CPU
-	 * access a freed page because it has a stale TLB
-	 */
-	if (tlbbatch->index)
-		__flush_tlb_pending(tlbbatch);
-
-	put_cpu_var(ppc64_tlb_batch);
-}
-
-/**
- * __flush_hash_table_range - Flush all HPTEs for a given address range
- *                            from the hash table (and the TLB). But keeps
- *                            the linux PTEs intact.
- *
- * @mm		: mm_struct of the target address space (generally init_mm)
- * @start	: starting address
- * @end		: ending address (not included in the flush)
- *
- * This function is mostly to be used by some IO hotplug code in order
- * to remove all hash entries from a given address range used to map IO
- * space on a removed PCI-PCI bridge without tearing down the full mapping
- * since 64K pages may overlap with other bridges when using 64K pages
- * with 4K HW pages on IO space.
- *
- * Because of that usage pattern, it is implemented for small size rather
- * than speed.
- */
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
-			      unsigned long end)
-{
-	bool is_thp;
-	int hugepage_shift;
-	unsigned long flags;
-
-	start = _ALIGN_DOWN(start, PAGE_SIZE);
-	end = _ALIGN_UP(end, PAGE_SIZE);
-
-	BUG_ON(!mm->pgd);
-
-	/* Note: Normally, we should only ever use a batch within a
-	 * PTE locked section. This violates the rule, but will work
-	 * since we don't actually modify the PTEs, we just flush the
-	 * hash while leaving the PTEs intact (including their reference
-	 * to being hashed). This is not the most performance oriented
-	 * way to do things but is fine for our needs here.
-	 */
-	local_irq_save(flags);
-	arch_enter_lazy_mmu_mode();
-	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
-						  &hugepage_shift);
-		unsigned long pte;
-
-		if (ptep == NULL)
-			continue;
-		pte = pte_val(*ptep);
-		if (is_thp)
-			trace_hugepage_invalidate(start, pte);
-		if (!(pte & H_PAGE_HASHPTE))
-			continue;
-		if (unlikely(is_thp))
-			hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
-		else
-			hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
-	}
-	arch_leave_lazy_mmu_mode();
-	local_irq_restore(flags);
-}
-
-void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
-{
-	pte_t *pte;
-	pte_t *start_pte;
-	unsigned long flags;
-
-	addr = _ALIGN_DOWN(addr, PMD_SIZE);
-	/* Note: Normally, we should only ever use a batch within a
-	 * PTE locked section. This violates the rule, but will work
-	 * since we don't actually modify the PTEs, we just flush the
-	 * hash while leaving the PTEs intact (including their reference
-	 * to being hashed). This is not the most performance oriented
-	 * way to do things but is fine for our needs here.
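
A typical caller for the IO hot-unplug case described above might look like this (hypothetical helper name, shown only to illustrate the intended usage):

static void teardown_io_window(unsigned long start, unsigned long size)
{
    /* drop every HPTE backing the window; the Linux PTEs stay intact */
    __flush_hash_table_range(&init_mm, start, start + size);
}
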
- */ - local_irq_save(flags); - arch_enter_lazy_mmu_mode(); - start_pte = pte_offset_map(pmd, addr); - for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { - unsigned long pteval = pte_val(*pte); - if (pteval & H_PAGE_HASHPTE) - hpte_need_flush(mm, addr, pte, pteval, 0); - addr += PAGE_SIZE; - } - arch_leave_lazy_mmu_mode(); - local_irq_restore(flags); -} diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/vphn.c deleted file mode 100644 index f83044faac23..000000000000 --- a/arch/powerpc/mm/vphn.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "vphn.h" - -/* - * The associativity domain numbers are returned from the hypervisor as a - * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the - * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 - * bytes. - * - * --- 16-bit fields --> - * _________________________ - * | 0 | 1 | 2 | 3 | be_packed[0] - * ------+-----+-----+------ - * _________________________ - * | 4 | 5 | 6 | 7 | be_packed[1] - * ------------------------- - * ... - * _________________________ - * | 20 | 21 | 22 | 23 | be_packed[5] - * ------------------------- - * - * Convert to the sequence they would appear in the ibm,associativity property. - */ -int vphn_unpack_associativity(const long *packed, __be32 *unpacked) -{ - __be64 be_packed[VPHN_REGISTER_COUNT]; - int i, nr_assoc_doms = 0; - const __be16 *field = (const __be16 *) be_packed; - u16 last = 0; - bool is_32bit = false; - -#define VPHN_FIELD_UNUSED (0xffff) -#define VPHN_FIELD_MSB (0x8000) -#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) - - /* Let's fix the values returned by plpar_hcall9() */ - for (i = 0; i < VPHN_REGISTER_COUNT; i++) - be_packed[i] = cpu_to_be64(packed[i]); - - for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { - u16 new = be16_to_cpup(field++); - - if (is_32bit) { - /* Let's concatenate the 16 bits of this field to the - * 15 lower bits of the previous field - */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(last << 16 | new); - is_32bit = false; - } else if (new == VPHN_FIELD_UNUSED) - /* This is the list terminator */ - break; - else if (new & VPHN_FIELD_MSB) { - /* Data is in the lower 15 bits of this field */ - unpacked[++nr_assoc_doms] = - cpu_to_be32(new & VPHN_FIELD_MASK); - } else { - /* Data is in the lower 15 bits of this field - * concatenated with the next 16 bit field - */ - last = new; - is_32bit = true; - } - } - - /* The first cell contains the length of the property */ - unpacked[0] = cpu_to_be32(nr_assoc_doms); - - return nr_assoc_doms; -} diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/vphn.h deleted file mode 100644 index f9ffdb3942fc..000000000000 --- a/arch/powerpc/mm/vphn.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ARCH_POWERPC_MM_VPHN_H_ -#define _ARCH_POWERPC_MM_VPHN_H_ - -/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. - */ -#define VPHN_REGISTER_COUNT 6 - -/* - * 6 64-bit registers unpacked into up to 24 be32 associativity values. To - * form the complete property we have to add the length in the first cell. 
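
A worked example of the unpacking rules above, on a hypothetical field stream:

    /*
     *	0x8002 -> MSB set: 15-bit domain, unpacked[1] = 0x0002
     *	0x0001 -> MSB clear: high half of a 32-bit domain, kept in "last"
     *	0x5678 -> concatenated: unpacked[2] = (0x0001 << 16) | 0x5678
     *	0xffff -> list terminator
     *
     * vphn_unpack_associativity() then stores the count (2) in
     * unpacked[0] and returns 2.
     */
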
- */
-#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
-
-extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked);
-
-#endif
diff --git a/tools/testing/selftests/powerpc/vphn/vphn.c b/tools/testing/selftests/powerpc/vphn/vphn.c
index 186b906e66d5..1d1f5f2be3b2 120000
--- a/tools/testing/selftests/powerpc/vphn/vphn.c
+++ b/tools/testing/selftests/powerpc/vphn/vphn.c
@@ -1 +1 @@
-../../../../../arch/powerpc/mm/vphn.c
\ No newline at end of file
+../../../../../arch/powerpc/mm/book3s64/vphn.c
\ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/vphn/vphn.h b/tools/testing/selftests/powerpc/vphn/vphn.h
index 7131efe38c65..45fe160f8288 120000
--- a/tools/testing/selftests/powerpc/vphn/vphn.h
+++ b/tools/testing/selftests/powerpc/vphn/vphn.h
@@ -1 +1 @@
-../../../../../arch/powerpc/mm/vphn.h
\ No newline at end of file
+../../../../../arch/powerpc/mm/book3s64/vphn.h
\ No newline at end of file
-- cgit v1.2.3-59-g8ed1b

From 17312f258cf6eb584f276ad592972ade7e16e318 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 29 Mar 2019 10:00:01 +0000
Subject: powerpc/mm: Move book3s32 specifics in subdirectory mm/book3s32

Several files in arch/powerpc/mm are only for book3S32. This patch
creates a subdirectory for them.

Signed-off-by: Christophe Leroy
[mpe: Shorten new filenames]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/Makefile               |   3 +-
 arch/powerpc/mm/book3s32/Makefile      |   3 +
 arch/powerpc/mm/book3s32/hash_low.S    | 705 +++++++++++++++++++++++++++++++++
 arch/powerpc/mm/book3s32/mmu.c         | 419 ++++++++++++++++++++
 arch/powerpc/mm/book3s32/mmu_context.c | 118 ++++++
 arch/powerpc/mm/book3s32/tlb.c         | 173 ++++++++
 arch/powerpc/mm/hash_low_32.S          | 705 ---------------------------------
 arch/powerpc/mm/mmu_context_hash32.c   | 118 ------
 arch/powerpc/mm/ppc_mmu_32.c           | 419 --------------------
 arch/powerpc/mm/tlb_hash32.c           | 173 --------
 10 files changed, 1419 insertions(+), 1417 deletions(-)
 create mode 100644 arch/powerpc/mm/book3s32/Makefile
 create mode 100644 arch/powerpc/mm/book3s32/hash_low.S
 create mode 100644 arch/powerpc/mm/book3s32/mmu.c
 create mode 100644 arch/powerpc/mm/book3s32/mmu_context.c
 create mode 100644 arch/powerpc/mm/book3s32/tlb.c
 delete mode 100644 arch/powerpc/mm/hash_low_32.S
 delete mode 100644 arch/powerpc/mm/mmu_context_hash32.c
 delete mode 100644 arch/powerpc/mm/ppc_mmu_32.c
 delete mode 100644 arch/powerpc/mm/tlb_hash32.c
 (limited to 'arch')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index a137fdf775e2..68cb1e840b5e 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -12,11 +12,10 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
 				tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o
 obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o
+obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/
 obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/
 obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o
 obj-$(CONFIG_PPC32) += pgtable-frag.o
-obj-$(CONFIG_PPC_BOOK3S_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
-obj-$(CONFIG_PPC_BOOK3S_32) += tlb_hash32.o
 obj-$(CONFIG_40x) += 40x_mmu.o
 obj-$(CONFIG_44x) += 44x_mmu.o
 obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
new file mode 100644
index 000000000000..a4e217d0f3b7
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += mmu.o hash_low.o mmu_context.o tlb.o
diff --git
a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S new file mode 100644 index 000000000000..e27792d0b744 --- /dev/null +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -0,0 +1,705 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP + * Copyright (C) 1996 Cort Dougan + * Adapted for Power Macintosh by Paul Mackerras. + * Low-level exception handlers and MMU support + * rewritten by Paul Mackerras. + * Copyright (C) 1996 Paul Mackerras. + * + * This file contains low-level assembler routines for managing + * the PowerPC MMU hash table. (PPC 8xx processors don't use a + * hash table, so this file is not used on them.) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP + .section .bss + .align 2 +mmu_hash_lock: + .space 4 +#endif /* CONFIG_SMP */ + +/* + * Load a PTE into the hash table, if possible. + * The address is in r4, and r3 contains an access flag: + * _PAGE_RW (0x400) if a write. + * r9 contains the SRR1 value, from which we use the MSR_PR bit. + * SPRG_THREAD contains the physical address of the current task's thread. + * + * Returns to the caller if the access is illegal or there is no + * mapping for the address. Otherwise it places an appropriate PTE + * in the hash table and returns from the exception. + * Uses r0, r3 - r6, r8, r10, ctr, lr. + */ + .text +_GLOBAL(hash_page) +#ifdef CONFIG_SMP + lis r8, (mmu_hash_lock - PAGE_OFFSET)@h + ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l + lis r0,0x0fff + b 10f +11: lwz r6,0(r8) + cmpwi 0,r6,0 + bne 11b +10: lwarx r6,0,r8 + cmpwi 0,r6,0 + bne- 11b + stwcx. r0,0,r8 + bne- 10b + isync +#endif + /* Get PTE (linux-style) and check access */ + lis r0,KERNELBASE@h /* check if kernel address */ + cmplw 0,r4,r0 + ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ + mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */ + blt+ 112f /* assume user more likely */ + lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ + rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ +112: +#ifndef CONFIG_PTE_64BIT + rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ + lwz r8,0(r5) /* get pmd entry */ + rlwinm. r8,r8,0,0,19 /* extract address of pte page */ +#else + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ + lwzx r8,r8,r5 /* Get L1 entry */ + rlwinm. r8,r8,0,0,20 /* extract pt base address */ +#endif +#ifdef CONFIG_SMP + beq- hash_page_out /* return if no mapping */ +#else + /* XXX it seems like the 601 will give a machine fault on the + rfi if its alignment is wrong (bottom 4 bits of address are + 8 or 0xc) and we have had a not-taken conditional branch + to the address following the rfi. */ + beqlr- +#endif +#ifndef CONFIG_PTE_64BIT + rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ +#else + rlwimi r8,r4,23,20,28 /* compute pte address */ +#endif + rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ + ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE + + /* + * Update the linux PTE atomically. 
We do the lwarx up-front + * because almost always, there won't be a permission violation + * and there won't already be an HPTE, and thus we will have + * to update the PTE to set _PAGE_HASHPTE. -- paulus. + * + * If PTE_64BIT is set, the low word is the flags word; use that + * word for locking since it contains all the interesting bits. + */ +#if (PTE_FLAGS_OFFSET != 0) + addi r8,r8,PTE_FLAGS_OFFSET +#endif +retry: + lwarx r6,0,r8 /* get linux-style pte, flag word */ + andc. r5,r3,r6 /* check access & ~permission */ +#ifdef CONFIG_SMP + bne- hash_page_out /* return if access not permitted */ +#else + bnelr- +#endif + or r5,r0,r6 /* set accessed/dirty bits */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ + stwcx. r5,0,r8 /* attempt to update PTE */ + bne- retry /* retry if someone got there first */ + + mfsrin r3,r4 /* get segment reg for segment */ + mfctr r0 + stw r0,_CTR(r11) + bl create_hpte /* add the hash table entry */ + +#ifdef CONFIG_SMP + eieio + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha + li r0,0 + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) +#endif + + /* Return from the exception */ + lwz r5,_CTR(r11) + mtctr r5 + lwz r0,GPR0(r11) + lwz r8,GPR8(r11) + b fast_exception_return + +#ifdef CONFIG_SMP +hash_page_out: + eieio + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha + li r0,0 + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) + blr +#endif /* CONFIG_SMP */ + +/* + * Add an entry for a particular page to the hash table. + * + * add_hash_page(unsigned context, unsigned long va, unsigned long pmdval) + * + * We assume any necessary modifications to the pte (e.g. setting + * the accessed bit) have already been done and that there is actually + * a hash table in use (i.e. we're not on a 603). + */ +_GLOBAL(add_hash_page) + mflr r0 + stw r0,4(r1) + + /* Convert context and va to VSID */ + mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note create_hpte trims to 24 bits */ + +#ifdef CONFIG_SMP + lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */ + oris r8,r8,12 +#endif /* CONFIG_SMP */ + + /* + * We disable interrupts here, even on UP, because we don't + * want to race with hash_page, and because we want the + * _PAGE_HASHPTE bit to be a reliable indication of whether + * the HPTE exists (or at least whether one did once). + * We also turn off the MMU for data accesses so that we + * we can't take a hash table miss (assuming the code is + * covered by a BAT). -- paulus + */ + mfmsr r9 + SYNC + rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + SYNC_601 + isync + +#ifdef CONFIG_SMP + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l +10: lwarx r0,0,r6 /* take the mmu_hash_lock */ + cmpi 0,r0,0 + bne- 11f + stwcx. r8,0,r6 + beq+ 12f +11: lwz r0,0(r6) + cmpi 0,r0,0 + beq 10b + b 11b +12: isync +#endif + + /* + * Fetch the linux pte and test and set _PAGE_HASHPTE atomically. + * If _PAGE_HASHPTE was already set, we don't replace the existing + * HPTE, so we just unlock and return. + */ + mr r8,r5 +#ifndef CONFIG_PTE_64BIT + rlwimi r8,r4,22,20,29 +#else + rlwimi r8,r4,23,20,28 + addi r8,r8,PTE_FLAGS_OFFSET +#endif +1: lwarx r6,0,r8 + andi. 
r0,r6,_PAGE_HASHPTE + bne 9f /* if HASHPTE already set, done */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ + ori r5,r6,_PAGE_HASHPTE + stwcx. r5,0,r8 + bne- 1b + + bl create_hpte + +9: +#ifdef CONFIG_SMP + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l + eieio + li r0,0 + stw r0,0(r6) /* clear mmu_hash_lock */ +#endif + + /* reenable interrupts and DR */ + mtmsr r9 + SYNC_601 + isync + + lwz r0,4(r1) + mtlr r0 + blr + +/* + * This routine adds a hardware PTE to the hash table. + * It is designed to be called with the MMU either on or off. + * r3 contains the VSID, r4 contains the virtual address, + * r5 contains the linux PTE, r6 contains the old value of the + * linux PTE (before setting _PAGE_HASHPTE). r10 contains the + * upper half of the PTE if CONFIG_PTE_64BIT. + * On SMP, the caller should have the mmu_hash_lock held. + * We assume that the caller has (or will) set the _PAGE_HASHPTE + * bit in the linux PTE in memory. The value passed in r6 should + * be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set + * this routine will skip the search for an existing HPTE. + * This procedure modifies r0, r3 - r6, r8, cr0. + * -- paulus. + * + * For speed, 4 of the instructions get patched once the size and + * physical address of the hash table are known. These definitions + * of Hash_base and Hash_bits below are just an example. + */ +Hash_base = 0xc0180000 +Hash_bits = 12 /* e.g. 256kB hash table */ +Hash_msk = (((1 << Hash_bits) - 1) * 64) + +/* defines for the PTE format for 32-bit PPCs */ +#define HPTE_SIZE 8 +#define PTEG_SIZE 64 +#define LG_PTEG_SIZE 6 +#define LDPTEu lwzu +#define LDPTE lwz +#define STPTE stw +#define CMPPTE cmpw +#define PTE_H 0x40 +#define PTE_V 0x80000000 +#define TST_V(r) rlwinm. r,r,0,0,0 +#define SET_V(r) oris r,r,PTE_V@h +#define CLR_V(r,t) rlwinm r,r,0,1,31 + +#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1) +#define HASH_RIGHT 31-LG_PTEG_SIZE + +_GLOBAL(create_hpte) + /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ + rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ + and r8,r8,r0 /* writable if _RW & _DIRTY */ + rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ + rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ + ori r8,r8,0xe04 /* clear out reserved bits */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 
1: 3): 0 */ +BEGIN_FTR_SECTION + rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ +END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) +#ifdef CONFIG_PTE_64BIT + /* Put the XPN bits into the PTE */ + rlwimi r8,r10,8,20,22 + rlwimi r8,r10,2,29,29 +#endif + + /* Construct the high word of the PPC-style PTE (r5) */ + rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ + rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */ + SET_V(r5) /* set V (valid) bit */ + + patch_site 0f, patch__hash_page_A0 + patch_site 1f, patch__hash_page_A1 + patch_site 2f, patch__hash_page_A2 + /* Get the address of the primary PTE group in the hash table (r3) */ +0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ +1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ +2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ + xor r3,r3,r0 /* make primary hash */ + li r0,8 /* PTEs/group */ + + /* + * Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search + * if it is clear, meaning that the HPTE isn't there already... + */ + andi. r6,r6,_PAGE_HASHPTE + beq+ 10f /* no PTE: go look for an empty slot */ + tlbie r4 + + lis r4, (htab_hash_searches - PAGE_OFFSET)@ha + lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) + addi r6,r6,1 /* count how many searches we do */ + stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) + + /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ + mtctr r0 + addi r4,r3,-HPTE_SIZE +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ + CMPPTE 0,r6,r5 + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ found_slot + + patch_site 0f, patch__hash_page_B + /* Search the secondary PTEG for a matching PTE */ + ori r5,r5,PTE_H /* set H (secondary hash) bit */ +0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ + xori r4,r4,(-PTEG_SIZE & 0xffff) + addi r4,r4,-HPTE_SIZE + mtctr r0 +2: LDPTEu r6,HPTE_SIZE(r4) + CMPPTE 0,r6,r5 + bdnzf 2,2b + beq+ found_slot + xori r5,r5,PTE_H /* clear H bit again */ + + /* Search the primary PTEG for an empty slot */ +10: mtctr r0 + addi r4,r3,-HPTE_SIZE /* search primary PTEG */ +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ + TST_V(r6) /* test valid bit */ + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ found_empty + + /* update counter of times that the primary PTEG is full */ + lis r4, (primary_pteg_full - PAGE_OFFSET)@ha + lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) + addi r6,r6,1 + stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) + + patch_site 0f, patch__hash_page_C + /* Search the secondary PTEG for an empty slot */ + ori r5,r5,PTE_H /* set H (secondary hash) bit */ +0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ + xori r4,r4,(-PTEG_SIZE & 0xffff) + addi r4,r4,-HPTE_SIZE + mtctr r0 +2: LDPTEu r6,HPTE_SIZE(r4) + TST_V(r6) + bdnzf 2,2b + beq+ found_empty + xori r5,r5,PTE_H /* clear H bit again */ + + /* + * Choose an arbitrary slot in the primary PTEG to overwrite. + * Since both the primary and secondary PTEGs are full, and we + * have no information that the PTEs in the primary PTEG are + * more important or useful than those in the secondary PTEG, + * and we know there is a definite (although small) speed + * advantage to putting the PTE in the primary PTEG, we always + * put the PTE in the primary PTEG. + * + * In addition, we skip any slot that is mapping kernel text in + * order to avoid a deadlock when not using BAT mappings if + * trying to hash in the kernel hash code itself after it has + * already taken the hash table lock. 
This works in conjunction + * with pre-faulting of the kernel text. + * + * If the hash table bucket is full of kernel text entries, we'll + * lockup here but that shouldn't happen + */ + +1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ + lwz r6, (next_slot - PAGE_OFFSET)@l(r4) + addi r6,r6,HPTE_SIZE /* search for candidate */ + andi. r6,r6,7*HPTE_SIZE + stw r6,next_slot@l(r4) + add r4,r3,r6 + LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ + clrrwi r0,r0,12 + lis r6,etext@h + ori r6,r6,etext@l /* get etext */ + tophys(r6,r6) + cmpl cr0,r0,r6 /* compare and try again */ + blt 1b + +#ifndef CONFIG_SMP + /* Store PTE in PTEG */ +found_empty: + STPTE r5,0(r4) +found_slot: + STPTE r8,HPTE_SIZE/2(r4) + +#else /* CONFIG_SMP */ +/* + * Between the tlbie above and updating the hash table entry below, + * another CPU could read the hash table entry and put it in its TLB. + * There are 3 cases: + * 1. using an empty slot + * 2. updating an earlier entry to change permissions (i.e. enable write) + * 3. taking over the PTE for an unrelated address + * + * In each case it doesn't really matter if the other CPUs have the old + * PTE in their TLB. So we don't need to bother with another tlbie here, + * which is convenient as we've overwritten the register that had the + * address. :-) The tlbie above is mainly to make sure that this CPU comes + * and gets the new PTE from the hash table. + * + * We do however have to make sure that the PTE is never in an invalid + * state with the V bit set. + */ +found_empty: +found_slot: + CLR_V(r5,r0) /* clear V (valid) bit in PTE */ + STPTE r5,0(r4) + sync + TLBSYNC + STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ + sync + SET_V(r5) + STPTE r5,0(r4) /* finally set V bit in PTE */ +#endif /* CONFIG_SMP */ + + sync /* make sure pte updates get to memory */ + blr + + .section .bss + .align 2 +next_slot: + .space 4 +primary_pteg_full: + .space 4 +htab_hash_searches: + .space 4 + .previous + +/* + * Flush the entry for a particular page from the hash table. + * + * flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval, + * int count) + * + * We assume that there is a hash table in use (Hash != 0). + */ +_GLOBAL(flush_hash_pages) + /* + * We disable interrupts here, even on UP, because we want + * the _PAGE_HASHPTE bit to be a reliable indication of + * whether the HPTE exists (or at least whether one did once). + * We also turn off the MMU for data accesses so that we + * we can't take a hash table miss (assuming the code is + * covered by a BAT). -- paulus + */ + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + SYNC_601 + isync + + /* First find a PTE in the range that has _PAGE_HASHPTE set */ +#ifndef CONFIG_PTE_64BIT + rlwimi r5,r4,22,20,29 +#else + rlwimi r5,r4,23,20,28 +#endif +1: lwz r0,PTE_FLAGS_OFFSET(r5) + cmpwi cr1,r6,1 + andi. 
r0,r0,_PAGE_HASHPTE + bne 2f + ble cr1,19f + addi r4,r4,0x1000 + addi r5,r5,PTE_SIZE + addi r6,r6,-1 + b 1b + + /* Convert context and va to VSID */ +2: mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note code below trims to 24 bits */ + + /* Construct the high word of the PPC-style PTE (r11) */ + rlwinm r11,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ + rlwimi r11,r4,10,26,31 /* put in API (abbrev page index) */ + SET_V(r11) /* set V (valid) bit */ + +#ifdef CONFIG_SMP + lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l + lwz r8,TASK_CPU(r2) + oris r8,r8,9 +10: lwarx r0,0,r9 + cmpi 0,r0,0 + bne- 11f + stwcx. r8,0,r9 + beq+ 12f +11: lwz r0,0(r9) + cmpi 0,r0,0 + beq 10b + b 11b +12: isync +#endif + + /* + * Check the _PAGE_HASHPTE bit in the linux PTE. If it is + * already clear, we're done (for this pte). If not, + * clear it (atomically) and proceed. -- paulus. + */ +#if (PTE_FLAGS_OFFSET != 0) + addi r5,r5,PTE_FLAGS_OFFSET +#endif +33: lwarx r8,0,r5 /* fetch the pte flags word */ + andi. r0,r8,_PAGE_HASHPTE + beq 8f /* done if HASHPTE is already clear */ + rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ + stwcx. r8,0,r5 /* update the pte */ + bne- 33b + + patch_site 0f, patch__flush_hash_A0 + patch_site 1f, patch__flush_hash_A1 + patch_site 2f, patch__flush_hash_A2 + /* Get the address of the primary PTE group in the hash table (r3) */ +0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ +1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ +2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ + xor r8,r0,r8 /* make primary hash */ + + /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ + li r0,8 /* PTEs/group */ + mtctr r0 + addi r12,r8,-HPTE_SIZE +1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ + CMPPTE 0,r0,r11 + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ 3f + + patch_site 0f, patch__flush_hash_B + /* Search the secondary PTEG for a matching PTE */ + ori r11,r11,PTE_H /* set H (secondary hash) bit */ + li r0,8 /* PTEs/group */ +0: xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ + xori r12,r12,(-PTEG_SIZE & 0xffff) + addi r12,r12,-HPTE_SIZE + mtctr r0 +2: LDPTEu r0,HPTE_SIZE(r12) + CMPPTE 0,r0,r11 + bdnzf 2,2b + xori r11,r11,PTE_H /* clear H again */ + bne- 4f /* should rarely fail to find it */ + +3: li r0,0 + STPTE r0,0(r12) /* invalidate entry */ +4: sync + tlbie r4 /* in hw tlb too */ + sync + +8: ble cr1,9f /* if all ptes checked */ +81: addi r6,r6,-1 + addi r5,r5,PTE_SIZE + addi r4,r4,0x1000 + lwz r0,0(r5) /* check next pte */ + cmpwi cr1,r6,1 + andi. r0,r0,_PAGE_HASHPTE + bne 33b + bgt cr1,81b + +9: +#ifdef CONFIG_SMP + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ +#endif + +19: mtmsr r10 + SYNC_601 + isync + blr +EXPORT_SYMBOL(flush_hash_pages) + +/* + * Flush an entry from the TLB + */ +_GLOBAL(_tlbie) +#ifdef CONFIG_SMP + lwz r8,TASK_CPU(r2) + oris r8,r8,11 + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + SYNC_601 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. 
r8,0,r9 + bne- 10b + eieio + tlbie r3 + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + SYNC_601 + isync +#else /* CONFIG_SMP */ + tlbie r3 + sync +#endif /* CONFIG_SMP */ + blr + +/* + * Flush the entire TLB. 603/603e only + */ +_GLOBAL(_tlbia) +#if defined(CONFIG_SMP) + lwz r8,TASK_CPU(r2) + oris r8,r8,10 + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + SYNC_601 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. r8,0,r9 + bne- 10b + sync + tlbia + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + SYNC_601 + isync +#else /* CONFIG_SMP */ + sync + tlbia + sync +#endif /* CONFIG_SMP */ + blr diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c new file mode 100644 index 000000000000..1db55159031c --- /dev/null +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -0,0 +1,419 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * and 8260 implementations but excludes the 8xx and 4xx. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct hash_pte *Hash, *Hash_end; +unsigned long Hash_size, Hash_mask; +unsigned long _SDR1; + +struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ + +struct batrange { /* stores address ranges mapped by BATs */ + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} bat_addrs[8]; + +/* + * Return PA for this VA if it is mapped by a BAT, or 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + int b; + for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) + if (va >= bat_addrs[b].start && va < bat_addrs[b].limit) + return bat_addrs[b].phys + (va - bat_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + int b; + for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) + if (pa >= bat_addrs[b].phys + && pa < (bat_addrs[b].limit-bat_addrs[b].start) + +bat_addrs[b].phys) + return bat_addrs[b].start+(pa-bat_addrs[b].phys); + return 0; +} + +static int find_free_bat(void) +{ + int b; + + if (cpu_has_feature(CPU_FTR_601)) { + for (b = 0; b < 4; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[0].batl & 0x40)) + return b; + } + } else { + int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + + for (b = 0; b < n; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[1].batu & 3)) + return b; + } + } + return -1; +} + +static unsigned int block_size(unsigned long base, unsigned long top) +{ + unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 
8 : 256) << 20; + unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int block_shift = (fls(top - base) - 1) & 31; + + return min3(max_size, 1U << base_shift, 1U << block_shift); +} + +/* + * Set up one of the IBAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + * Only for 603+ ... + */ +static void setibat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl = (size >> 17) - 1; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if (!cpu_has_feature(CPU_FTR_NEED_COHERENT)) + flags &= ~_PAGE_COHERENT; + + wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); + bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[0].batu |= 1; /* Vp = 1 */ +} + +static void clearibat(int index) +{ + struct ppc_bat *bat = BATS[index]; + + bat[0].batu = 0; + bat[0].batl = 0; +} + +static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int idx; + + while ((idx = find_free_bat()) != -1 && base != top) { + unsigned int size = block_size(base, top); + + if (size < 128 << 10) + break; + setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); + base += size; + } + + return base; +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int done; + unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; + + if (__map_without_bats) { + pr_debug("RAM mapped without BATs\n"); + return base; + } + + if (!strict_kernel_rwx_enabled() || base >= border || top <= border) + return __mmu_mapin_ram(base, top); + + done = __mmu_mapin_ram(base, border); + if (done != border - base) + return done; + + return done + __mmu_mapin_ram(border, top); +} + +void mmu_mark_initmem_nx(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + unsigned long base = (unsigned long)_stext - PAGE_OFFSET; + unsigned long top = (unsigned long)_etext - PAGE_OFFSET; + unsigned long size; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { + size = block_size(base, top); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; + } + if (base < top) { + size = block_size(base, top); + size = max(size, 128UL << 10); + if ((top - base) > size) { + if (strict_kernel_rwx_enabled()) + pr_warn("Kernel _etext not properly aligned\n"); + size <<= 1; + } + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; + } + for (; i < nb; i++) + clearibat(i); + + update_bats(); + + for (i = TASK_SIZE >> 28; i < 16; i++) { + /* Do not set NX on VM space for modules */ + if (IS_ENABLED(CONFIG_MODULES) && + (VMALLOC_START & 0xf0000000) == i << 28) + break; + mtsrin(mfsrin(i << 28) | 0x10000000, i << 28); + } +} + +void mmu_mark_rodata_ro(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb; i++) { + struct ppc_bat *bat = BATS[i]; + + if (bat_addrs[i].start < (unsigned long)__init_begin) + bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; + } + + update_bats(); +} + +/* + * Set up one of the I/D BAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. 
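As an aside on the encoding used just below: for a power-of-two size in that range, the BL field is simply (size >> 17) - 1. A minimal sketch, with a hypothetical helper name (the kernel computes this inline in setibat() and setbat()):

static unsigned int bat_bl(unsigned int size)
{
	return (size >> 17) - 1;	/* 128k -> 0x000, 256M -> 0x7ff */
}

Callers such as __mmu_mapin_ram() keep that precondition honest by going through block_size(), which only ever returns powers of two.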
+ * On 603+, only set IBAT when _PAGE_EXEC is set + */ +void __init setbat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if ((flags & _PAGE_NO_CACHE) || + (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) + flags &= ~_PAGE_COHERENT; + + bl = (size >> 17) - 1; + if (PVR_VER(mfspr(SPRN_PVR)) != 1) { + /* 603, 604, etc. */ + /* Do DBAT first */ + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT | _PAGE_GUARDED); + wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[1].batu |= 1; /* Vp = 1 */ + if (flags & _PAGE_GUARDED) { + /* G bit must be zero in IBATs */ + flags &= ~_PAGE_EXEC; + } + if (flags & _PAGE_EXEC) + bat[0] = bat[1]; + else + bat[0].batu = bat[0].batl = 0; + } else { + /* 601 cpu */ + if (bl > BL_8M) + bl = BL_8M; + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT); + wimgxpp |= (flags & _PAGE_RW)? + ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX; + bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ + bat->batl = phys | bl | 0x40; /* V=1 */ + } + + bat_addrs[index].start = virt; + bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; + bat_addrs[index].phys = phys; +} + +/* + * Preload a translation in the hash table + */ +void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) +{ + pmd_t *pmd; + + if (!Hash) + return; + pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); + if (!pmd_none(*pmd)) + add_hash_page(mm->context.id, ea, pmd_val(*pmd)); +} + +/* + * Initialize the hash table and patch the instructions in hashtable.S. + */ +void __init MMU_init_hw(void) +{ + unsigned int hmask, mb, mb2; + unsigned int n_hpteg, lg_n_hpteg; + + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; + + if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105); + +#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */ +#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10) +#define MIN_N_HPTEG 1024 /* min 64kB hash table */ + + /* + * Allow 1 HPTE (1/8 HPTEG) for each page of memory. + * This is less than the recommended amount, but then + * Linux ain't AIX. + */ + n_hpteg = total_memory / (PAGE_SIZE * 8); + if (n_hpteg < MIN_N_HPTEG) + n_hpteg = MIN_N_HPTEG; + lg_n_hpteg = __ilog2(n_hpteg); + if (n_hpteg & (n_hpteg - 1)) { + ++lg_n_hpteg; /* round up if not power of 2 */ + n_hpteg = 1 << lg_n_hpteg; + } + Hash_size = n_hpteg << LG_HPTEG_SIZE; + + /* + * Find some memory for the hash table. 
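A worked sizing example (illustrative figures, assuming 4k pages): with 256MB of RAM, n_hpteg = 0x10000000 / (4096 * 8) = 8192, already a power of two and well above MIN_N_HPTEG, so Hash_size = 8192 << LG_HPTEG_SIZE = 512kB. Allocating the table aligned to its own size is what allows its physical address and low mask bits to be combined into _SDR1 below.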
+ */ + if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); + Hash = memblock_alloc(Hash_size, Hash_size); + if (!Hash) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, Hash_size, Hash_size); + _SDR1 = __pa(Hash) | SDR1_LOW_BITS; + + Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); + + printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", + (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); + + + /* + * Patch up the instructions in hashtable.S:create_hpte + */ + if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); + Hash_mask = n_hpteg - 1; + hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + if (lg_n_hpteg > 16) + mb2 = 16 - LG_HPTEG_SIZE; + + modify_instruction_site(&patch__hash_page_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); + modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); + modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); + modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); + + /* + * Patch up the instructions in hashtable.S:flush_hash_page + */ + modify_instruction_site(&patch__flush_hash_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); + modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); + modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); + + if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 601 can only access 16MB at the moment */ + if (PVR_VER(mfspr(SPRN_PVR)) == 1) + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); + else /* Anything else has 256M mapped */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); +} + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + if (cpu_has_feature(CPU_FTR_601)) + pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); + + if (disabled) + pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c new file mode 100644 index 000000000000..921c1e33e941 --- /dev/null +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -0,0 +1,118 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * and 8260 implementations but excludes the 8xx and 4xx. 
+ * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include + +#include + +/* + * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs + * (virtual segment identifiers) for each context. Although the + * hardware supports 24-bit VSIDs, and thus >1 million contexts, + * we only use 32,768 of them. That is ample, since there can be + * at most around 30,000 tasks in the system anyway, and it means + * that we can use a bitmap to indicate which contexts are in use. + * Using a bitmap means that we entirely avoid all of the problems + * that we used to have when the context number overflowed, + * particularly on SMP systems. + * -- paulus. + */ +#define NO_CONTEXT ((unsigned long) -1) +#define LAST_CONTEXT 32767 +#define FIRST_CONTEXT 1 + +/* + * This function defines the mapping from contexts to VSIDs (virtual + * segment IDs). We use a skew on both the context and the high 4 bits + * of the 32-bit virtual address (the "effective segment ID") in order + * to spread out the entries in the MMU hash table. Note, if this + * function is changed then arch/ppc/mm/hashtable.S will have to be + * changed to correspond. + * + * + * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \ + * & 0xffffff) + */ + +static unsigned long next_mmu_context; +static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; + +unsigned long __init_new_context(void) +{ + unsigned long ctx = next_mmu_context; + + while (test_and_set_bit(ctx, context_map)) { + ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); + if (ctx > LAST_CONTEXT) + ctx = 0; + } + next_mmu_context = (ctx + 1) & LAST_CONTEXT; + + return ctx; +} +EXPORT_SYMBOL_GPL(__init_new_context); + +/* + * Set up the context for a new address space. + */ +int init_new_context(struct task_struct *t, struct mm_struct *mm) +{ + mm->context.id = __init_new_context(); + + return 0; +} + +/* + * Free a context ID. Make sure to call this with preempt disabled! + */ +void __destroy_context(unsigned long ctx) +{ + clear_bit(ctx, context_map); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +/* + * We're finished using the context for an address space. + */ +void destroy_context(struct mm_struct *mm) +{ + preempt_disable(); + if (mm->context.id != NO_CONTEXT) { + __destroy_context(mm->context.id); + mm->context.id = NO_CONTEXT; + } + preempt_enable(); +} + +/* + * Initialize the context management stuff. + */ +void __init mmu_context_init(void) +{ + /* Reserve context 0 for kernel use */ + context_map[0] = (1 << FIRST_CONTEXT) - 1; + next_mmu_context = FIRST_CONTEXT; +} diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c new file mode 100644 index 000000000000..8d56f0417f87 --- /dev/null +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -0,0 +1,173 @@ +/* + * This file contains the routines for TLB flushing. 
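The CTX_TO_VSID mapping quoted above is worth restating as plain C. A minimal sketch (the function name is hypothetical; the kernel performs this computation inline, and in assembly on the hash paths):

static unsigned long ctx_to_vsid(unsigned long ctx, unsigned long va)
{
	unsigned long esid = va >> 28;	/* effective segment ID, top 4 bits */

	/* 897*16 skews the context, 0x111 skews the segment */
	return (ctx * (897 * 16) + esid * 0x111) & 0xffffff;
}

The same two multipliers appear as the mulli instructions in add_hash_page and flush_hash_pages above.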
+ * On machines where the MMU uses a hash table to store virtual to + * physical translations, these routines flush entries from the + * hash table also. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* + * Called when unmapping pages to flush entries from the TLB/hash table. + */ +void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) +{ + unsigned long ptephys; + + if (Hash) { + ptephys = __pa(ptep) & PAGE_MASK; + flush_hash_pages(mm->context.id, addr, ptephys, 1); + } +} +EXPORT_SYMBOL(flush_hash_entry); + +/* + * Called at the end of a mmu_gather operation to make sure the + * TLB flush is completely done. + */ +void tlb_flush(struct mmu_gather *tlb) +{ + if (!Hash) { + /* + * 603 needs to flush the whole TLB here since + * it doesn't use a hash table. + */ + _tlbia(); + } +} + +/* + * TLB flushing: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * since the hardware hash table functions as an extension of the + * tlb as far as the linux tables are concerned, flush it too. + * -- Cort + */ + +static void flush_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + pmd_t *pmd; + unsigned long pmd_end; + int count; + unsigned int ctx = mm->context.id; + + if (!Hash) { + _tlbia(); + return; + } + start &= PAGE_MASK; + if (start >= end) + return; + end = (end - 1) | ~PAGE_MASK; + pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); + for (;;) { + pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; + if (pmd_end > end) + pmd_end = end; + if (!pmd_none(*pmd)) { + count = ((pmd_end - start) >> PAGE_SHIFT) + 1; + flush_hash_pages(ctx, start, pmd_val(*pmd), count); + } + if (pmd_end == end) + break; + start = pmd_end + 1; + ++pmd; + } +} + +/* + * Flush kernel TLB entries in the given range + */ +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + flush_range(&init_mm, start, end); +} +EXPORT_SYMBOL(flush_tlb_kernel_range); + +/* + * Flush all the (user) entries for the address space described by mm. + */ +void flush_tlb_mm(struct mm_struct *mm) +{ + struct vm_area_struct *mp; + + if (!Hash) { + _tlbia(); + return; + } + + /* + * It is safe to go down the mm's list of vmas when called + * from dup_mmap, holding mmap_sem. It would also be safe from + * unmap_region or exit_mmap, but not from vmtruncate on SMP - + * but it seems dup_mmap is the only SMP case which gets here. 
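To put a number on the batching (example figures, assuming 4k pages and the 4MB PGDIR_SIZE of the classic 32-bit two-level page tables): flush_range() above hands flush_hash_pages() at most one page-table page worth of PTEs per call, i.e. up to 4MB >> 12 = 1024 pages, so flushing a VMA costs a handful of calls rather than one per page.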
+ */ + for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); +} +EXPORT_SYMBOL(flush_tlb_mm); + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct mm_struct *mm; + pmd_t *pmd; + + if (!Hash) { + _tlbie(vmaddr); + return; + } + mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm; + pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); + if (!pmd_none(*pmd)) + flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); +} +EXPORT_SYMBOL(flush_tlb_page); + +/* + * For each address in the range, find the pte for the address + * and check _PAGE_HASHPTE bit; if it is set, find and destroy + * the corresponding HPTE. + */ +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + flush_range(vma->vm_mm, start, end); +} +EXPORT_SYMBOL(flush_tlb_range); + +void __init early_init_mmu(void) +{ +} diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S deleted file mode 100644 index e27792d0b744..000000000000 --- a/arch/powerpc/mm/hash_low_32.S +++ /dev/null @@ -1,705 +0,0 @@ -/* - * PowerPC version - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP - * Copyright (C) 1996 Cort Dougan - * Adapted for Power Macintosh by Paul Mackerras. - * Low-level exception handlers and MMU support - * rewritten by Paul Mackerras. - * Copyright (C) 1996 Paul Mackerras. - * - * This file contains low-level assembler routines for managing - * the PowerPC MMU hash table. (PPC 8xx processors don't use a - * hash table, so this file is not used on them.) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP - .section .bss - .align 2 -mmu_hash_lock: - .space 4 -#endif /* CONFIG_SMP */ - -/* - * Load a PTE into the hash table, if possible. - * The address is in r4, and r3 contains an access flag: - * _PAGE_RW (0x400) if a write. - * r9 contains the SRR1 value, from which we use the MSR_PR bit. - * SPRG_THREAD contains the physical address of the current task's thread. - * - * Returns to the caller if the access is illegal or there is no - * mapping for the address. Otherwise it places an appropriate PTE - * in the hash table and returns from the exception. - * Uses r0, r3 - r6, r8, r10, ctr, lr. - */ - .text -_GLOBAL(hash_page) -#ifdef CONFIG_SMP - lis r8, (mmu_hash_lock - PAGE_OFFSET)@h - ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l - lis r0,0x0fff - b 10f -11: lwz r6,0(r8) - cmpwi 0,r6,0 - bne 11b -10: lwarx r6,0,r8 - cmpwi 0,r6,0 - bne- 11b - stwcx. 
r0,0,r8 - bne- 10b - isync -#endif - /* Get PTE (linux-style) and check access */ - lis r0,KERNELBASE@h /* check if kernel address */ - cmplw 0,r4,r0 - ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ - mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */ - blt+ 112f /* assume user more likely */ - lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ - addi r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ - rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ -112: -#ifndef CONFIG_PTE_64BIT - rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ - lwz r8,0(r5) /* get pmd entry */ - rlwinm. r8,r8,0,0,19 /* extract address of pte page */ -#else - rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ - lwzx r8,r8,r5 /* Get L1 entry */ - rlwinm. r8,r8,0,0,20 /* extract pt base address */ -#endif -#ifdef CONFIG_SMP - beq- hash_page_out /* return if no mapping */ -#else - /* XXX it seems like the 601 will give a machine fault on the - rfi if its alignment is wrong (bottom 4 bits of address are - 8 or 0xc) and we have had a not-taken conditional branch - to the address following the rfi. */ - beqlr- -#endif -#ifndef CONFIG_PTE_64BIT - rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ -#else - rlwimi r8,r4,23,20,28 /* compute pte address */ -#endif - rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ - ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE - - /* - * Update the linux PTE atomically. We do the lwarx up-front - * because almost always, there won't be a permission violation - * and there won't already be an HPTE, and thus we will have - * to update the PTE to set _PAGE_HASHPTE. -- paulus. - * - * If PTE_64BIT is set, the low word is the flags word; use that - * word for locking since it contains all the interesting bits. - */ -#if (PTE_FLAGS_OFFSET != 0) - addi r8,r8,PTE_FLAGS_OFFSET -#endif -retry: - lwarx r6,0,r8 /* get linux-style pte, flag word */ - andc. r5,r3,r6 /* check access & ~permission */ -#ifdef CONFIG_SMP - bne- hash_page_out /* return if access not permitted */ -#else - bnelr- -#endif - or r5,r0,r6 /* set accessed/dirty bits */ -#ifdef CONFIG_PTE_64BIT -#ifdef CONFIG_SMP - subf r10,r6,r8 /* create false data dependency */ - subi r10,r10,PTE_FLAGS_OFFSET - lwzx r10,r6,r10 /* Get upper PTE word */ -#else - lwz r10,-PTE_FLAGS_OFFSET(r8) -#endif /* CONFIG_SMP */ -#endif /* CONFIG_PTE_64BIT */ - stwcx. r5,0,r8 /* attempt to update PTE */ - bne- retry /* retry if someone got there first */ - - mfsrin r3,r4 /* get segment reg for segment */ - mfctr r0 - stw r0,_CTR(r11) - bl create_hpte /* add the hash table entry */ - -#ifdef CONFIG_SMP - eieio - lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha - li r0,0 - stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) -#endif - - /* Return from the exception */ - lwz r5,_CTR(r11) - mtctr r5 - lwz r0,GPR0(r11) - lwz r8,GPR8(r11) - b fast_exception_return - -#ifdef CONFIG_SMP -hash_page_out: - eieio - lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha - li r0,0 - stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) - blr -#endif /* CONFIG_SMP */ - -/* - * Add an entry for a particular page to the hash table. - * - * add_hash_page(unsigned context, unsigned long va, unsigned long pmdval) - * - * We assume any necessary modifications to the pte (e.g. setting - * the accessed bit) have already been done and that there is actually - * a hash table in use (i.e. we're not on a 603). 
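For a concrete caller of this interface, see hash_preload() in book3s32/mmu.c above, which, once it finds a valid pmd, simply does add_hash_page(mm->context.id, ea, pmd_val(*pmd)), matching the (context, va, pmdval) signature described here.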
- */ -_GLOBAL(add_hash_page) - mflr r0 - stw r0,4(r1) - - /* Convert context and va to VSID */ - mulli r3,r3,897*16 /* multiply context by context skew */ - rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ - mulli r0,r0,0x111 /* multiply by ESID skew */ - add r3,r3,r0 /* note create_hpte trims to 24 bits */ - -#ifdef CONFIG_SMP - lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */ - oris r8,r8,12 -#endif /* CONFIG_SMP */ - - /* - * We disable interrupts here, even on UP, because we don't - * want to race with hash_page, and because we want the - * _PAGE_HASHPTE bit to be a reliable indication of whether - * the HPTE exists (or at least whether one did once). - * We also turn off the MMU for data accesses so that we - * we can't take a hash table miss (assuming the code is - * covered by a BAT). -- paulus - */ - mfmsr r9 - SYNC - rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear MSR_DR */ - mtmsr r0 - SYNC_601 - isync - -#ifdef CONFIG_SMP - lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha - addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l -10: lwarx r0,0,r6 /* take the mmu_hash_lock */ - cmpi 0,r0,0 - bne- 11f - stwcx. r8,0,r6 - beq+ 12f -11: lwz r0,0(r6) - cmpi 0,r0,0 - beq 10b - b 11b -12: isync -#endif - - /* - * Fetch the linux pte and test and set _PAGE_HASHPTE atomically. - * If _PAGE_HASHPTE was already set, we don't replace the existing - * HPTE, so we just unlock and return. - */ - mr r8,r5 -#ifndef CONFIG_PTE_64BIT - rlwimi r8,r4,22,20,29 -#else - rlwimi r8,r4,23,20,28 - addi r8,r8,PTE_FLAGS_OFFSET -#endif -1: lwarx r6,0,r8 - andi. r0,r6,_PAGE_HASHPTE - bne 9f /* if HASHPTE already set, done */ -#ifdef CONFIG_PTE_64BIT -#ifdef CONFIG_SMP - subf r10,r6,r8 /* create false data dependency */ - subi r10,r10,PTE_FLAGS_OFFSET - lwzx r10,r6,r10 /* Get upper PTE word */ -#else - lwz r10,-PTE_FLAGS_OFFSET(r8) -#endif /* CONFIG_SMP */ -#endif /* CONFIG_PTE_64BIT */ - ori r5,r6,_PAGE_HASHPTE - stwcx. r5,0,r8 - bne- 1b - - bl create_hpte - -9: -#ifdef CONFIG_SMP - lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha - addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l - eieio - li r0,0 - stw r0,0(r6) /* clear mmu_hash_lock */ -#endif - - /* reenable interrupts and DR */ - mtmsr r9 - SYNC_601 - isync - - lwz r0,4(r1) - mtlr r0 - blr - -/* - * This routine adds a hardware PTE to the hash table. - * It is designed to be called with the MMU either on or off. - * r3 contains the VSID, r4 contains the virtual address, - * r5 contains the linux PTE, r6 contains the old value of the - * linux PTE (before setting _PAGE_HASHPTE). r10 contains the - * upper half of the PTE if CONFIG_PTE_64BIT. - * On SMP, the caller should have the mmu_hash_lock held. - * We assume that the caller has (or will) set the _PAGE_HASHPTE - * bit in the linux PTE in memory. The value passed in r6 should - * be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set - * this routine will skip the search for an existing HPTE. - * This procedure modifies r0, r3 - r6, r8, cr0. - * -- paulus. - * - * For speed, 4 of the instructions get patched once the size and - * physical address of the hash table are known. These definitions - * of Hash_base and Hash_bits below are just an example. - */ -Hash_base = 0xc0180000 -Hash_bits = 12 /* e.g. 
256kB hash table */ -Hash_msk = (((1 << Hash_bits) - 1) * 64) - -/* defines for the PTE format for 32-bit PPCs */ -#define HPTE_SIZE 8 -#define PTEG_SIZE 64 -#define LG_PTEG_SIZE 6 -#define LDPTEu lwzu -#define LDPTE lwz -#define STPTE stw -#define CMPPTE cmpw -#define PTE_H 0x40 -#define PTE_V 0x80000000 -#define TST_V(r) rlwinm. r,r,0,0,0 -#define SET_V(r) oris r,r,PTE_V@h -#define CLR_V(r,t) rlwinm r,r,0,1,31 - -#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1) -#define HASH_RIGHT 31-LG_PTEG_SIZE - -_GLOBAL(create_hpte) - /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ - rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ - and r8,r8,r0 /* writable if _RW & _DIRTY */ - rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ - ori r8,r8,0xe04 /* clear out reserved bits */ - andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ -BEGIN_FTR_SECTION - rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ -END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) -#ifdef CONFIG_PTE_64BIT - /* Put the XPN bits into the PTE */ - rlwimi r8,r10,8,20,22 - rlwimi r8,r10,2,29,29 -#endif - - /* Construct the high word of the PPC-style PTE (r5) */ - rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ - rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */ - SET_V(r5) /* set V (valid) bit */ - - patch_site 0f, patch__hash_page_A0 - patch_site 1f, patch__hash_page_A1 - patch_site 2f, patch__hash_page_A2 - /* Get the address of the primary PTE group in the hash table (r3) */ -0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ -1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ -2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ - xor r3,r3,r0 /* make primary hash */ - li r0,8 /* PTEs/group */ - - /* - * Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search - * if it is clear, meaning that the HPTE isn't there already... - */ - andi. 
r6,r6,_PAGE_HASHPTE - beq+ 10f /* no PTE: go look for an empty slot */ - tlbie r4 - - lis r4, (htab_hash_searches - PAGE_OFFSET)@ha - lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) - addi r6,r6,1 /* count how many searches we do */ - stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) - - /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ - mtctr r0 - addi r4,r3,-HPTE_SIZE -1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ - CMPPTE 0,r6,r5 - bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ - beq+ found_slot - - patch_site 0f, patch__hash_page_B - /* Search the secondary PTEG for a matching PTE */ - ori r5,r5,PTE_H /* set H (secondary hash) bit */ -0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ - xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-HPTE_SIZE - mtctr r0 -2: LDPTEu r6,HPTE_SIZE(r4) - CMPPTE 0,r6,r5 - bdnzf 2,2b - beq+ found_slot - xori r5,r5,PTE_H /* clear H bit again */ - - /* Search the primary PTEG for an empty slot */ -10: mtctr r0 - addi r4,r3,-HPTE_SIZE /* search primary PTEG */ -1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ - TST_V(r6) /* test valid bit */ - bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ - beq+ found_empty - - /* update counter of times that the primary PTEG is full */ - lis r4, (primary_pteg_full - PAGE_OFFSET)@ha - lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) - addi r6,r6,1 - stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) - - patch_site 0f, patch__hash_page_C - /* Search the secondary PTEG for an empty slot */ - ori r5,r5,PTE_H /* set H (secondary hash) bit */ -0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ - xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-HPTE_SIZE - mtctr r0 -2: LDPTEu r6,HPTE_SIZE(r4) - TST_V(r6) - bdnzf 2,2b - beq+ found_empty - xori r5,r5,PTE_H /* clear H bit again */ - - /* - * Choose an arbitrary slot in the primary PTEG to overwrite. - * Since both the primary and secondary PTEGs are full, and we - * have no information that the PTEs in the primary PTEG are - * more important or useful than those in the secondary PTEG, - * and we know there is a definite (although small) speed - * advantage to putting the PTE in the primary PTEG, we always - * put the PTE in the primary PTEG. - * - * In addition, we skip any slot that is mapping kernel text in - * order to avoid a deadlock when not using BAT mappings if - * trying to hash in the kernel hash code itself after it has - * already taken the hash table lock. This works in conjunction - * with pre-faulting of the kernel text. - * - * If the hash table bucket is full of kernel text entries, we'll - * lockup here but that shouldn't happen - */ - -1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ - lwz r6, (next_slot - PAGE_OFFSET)@l(r4) - addi r6,r6,HPTE_SIZE /* search for candidate */ - andi. r6,r6,7*HPTE_SIZE - stw r6,next_slot@l(r4) - add r4,r3,r6 - LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ - clrrwi r0,r0,12 - lis r6,etext@h - ori r6,r6,etext@l /* get etext */ - tophys(r6,r6) - cmpl cr0,r0,r6 /* compare and try again */ - blt 1b - -#ifndef CONFIG_SMP - /* Store PTE in PTEG */ -found_empty: - STPTE r5,0(r4) -found_slot: - STPTE r8,HPTE_SIZE/2(r4) - -#else /* CONFIG_SMP */ -/* - * Between the tlbie above and updating the hash table entry below, - * another CPU could read the hash table entry and put it in its TLB. - * There are 3 cases: - * 1. using an empty slot - * 2. updating an earlier entry to change permissions (i.e. enable write) - * 3. 
taking over the PTE for an unrelated address - * - * In each case it doesn't really matter if the other CPUs have the old - * PTE in their TLB. So we don't need to bother with another tlbie here, - * which is convenient as we've overwritten the register that had the - * address. :-) The tlbie above is mainly to make sure that this CPU comes - * and gets the new PTE from the hash table. - * - * We do however have to make sure that the PTE is never in an invalid - * state with the V bit set. - */ -found_empty: -found_slot: - CLR_V(r5,r0) /* clear V (valid) bit in PTE */ - STPTE r5,0(r4) - sync - TLBSYNC - STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ - sync - SET_V(r5) - STPTE r5,0(r4) /* finally set V bit in PTE */ -#endif /* CONFIG_SMP */ - - sync /* make sure pte updates get to memory */ - blr - - .section .bss - .align 2 -next_slot: - .space 4 -primary_pteg_full: - .space 4 -htab_hash_searches: - .space 4 - .previous - -/* - * Flush the entry for a particular page from the hash table. - * - * flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval, - * int count) - * - * We assume that there is a hash table in use (Hash != 0). - */ -_GLOBAL(flush_hash_pages) - /* - * We disable interrupts here, even on UP, because we want - * the _PAGE_HASHPTE bit to be a reliable indication of - * whether the HPTE exists (or at least whether one did once). - * We also turn off the MMU for data accesses so that we - * we can't take a hash table miss (assuming the code is - * covered by a BAT). -- paulus - */ - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear MSR_DR */ - mtmsr r0 - SYNC_601 - isync - - /* First find a PTE in the range that has _PAGE_HASHPTE set */ -#ifndef CONFIG_PTE_64BIT - rlwimi r5,r4,22,20,29 -#else - rlwimi r5,r4,23,20,28 -#endif -1: lwz r0,PTE_FLAGS_OFFSET(r5) - cmpwi cr1,r6,1 - andi. r0,r0,_PAGE_HASHPTE - bne 2f - ble cr1,19f - addi r4,r4,0x1000 - addi r5,r5,PTE_SIZE - addi r6,r6,-1 - b 1b - - /* Convert context and va to VSID */ -2: mulli r3,r3,897*16 /* multiply context by context skew */ - rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ - mulli r0,r0,0x111 /* multiply by ESID skew */ - add r3,r3,r0 /* note code below trims to 24 bits */ - - /* Construct the high word of the PPC-style PTE (r11) */ - rlwinm r11,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ - rlwimi r11,r4,10,26,31 /* put in API (abbrev page index) */ - SET_V(r11) /* set V (valid) bit */ - -#ifdef CONFIG_SMP - lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha - addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l - lwz r8,TASK_CPU(r2) - oris r8,r8,9 -10: lwarx r0,0,r9 - cmpi 0,r0,0 - bne- 11f - stwcx. r8,0,r9 - beq+ 12f -11: lwz r0,0(r9) - cmpi 0,r0,0 - beq 10b - b 11b -12: isync -#endif - - /* - * Check the _PAGE_HASHPTE bit in the linux PTE. If it is - * already clear, we're done (for this pte). If not, - * clear it (atomically) and proceed. -- paulus. - */ -#if (PTE_FLAGS_OFFSET != 0) - addi r5,r5,PTE_FLAGS_OFFSET -#endif -33: lwarx r8,0,r5 /* fetch the pte flags word */ - andi. r0,r8,_PAGE_HASHPTE - beq 8f /* done if HASHPTE is already clear */ - rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ - stwcx. 
r8,0,r5 /* update the pte */ - bne- 33b - - patch_site 0f, patch__flush_hash_A0 - patch_site 1f, patch__flush_hash_A1 - patch_site 2f, patch__flush_hash_A2 - /* Get the address of the primary PTE group in the hash table (r3) */ -0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ -1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ -2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ - xor r8,r0,r8 /* make primary hash */ - - /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ - li r0,8 /* PTEs/group */ - mtctr r0 - addi r12,r8,-HPTE_SIZE -1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ - CMPPTE 0,r0,r11 - bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ - beq+ 3f - - patch_site 0f, patch__flush_hash_B - /* Search the secondary PTEG for a matching PTE */ - ori r11,r11,PTE_H /* set H (secondary hash) bit */ - li r0,8 /* PTEs/group */ -0: xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ - xori r12,r12,(-PTEG_SIZE & 0xffff) - addi r12,r12,-HPTE_SIZE - mtctr r0 -2: LDPTEu r0,HPTE_SIZE(r12) - CMPPTE 0,r0,r11 - bdnzf 2,2b - xori r11,r11,PTE_H /* clear H again */ - bne- 4f /* should rarely fail to find it */ - -3: li r0,0 - STPTE r0,0(r12) /* invalidate entry */ -4: sync - tlbie r4 /* in hw tlb too */ - sync - -8: ble cr1,9f /* if all ptes checked */ -81: addi r6,r6,-1 - addi r5,r5,PTE_SIZE - addi r4,r4,0x1000 - lwz r0,0(r5) /* check next pte */ - cmpwi cr1,r6,1 - andi. r0,r0,_PAGE_HASHPTE - bne 33b - bgt cr1,81b - -9: -#ifdef CONFIG_SMP - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ -#endif - -19: mtmsr r10 - SYNC_601 - isync - blr -EXPORT_SYMBOL(flush_hash_pages) - -/* - * Flush an entry from the TLB - */ -_GLOBAL(_tlbie) -#ifdef CONFIG_SMP - lwz r8,TASK_CPU(r2) - oris r8,r8,11 - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear DR */ - mtmsr r0 - SYNC_601 - isync - lis r9,mmu_hash_lock@h - ori r9,r9,mmu_hash_lock@l - tophys(r9,r9) -10: lwarx r7,0,r9 - cmpwi 0,r7,0 - bne- 10b - stwcx. r8,0,r9 - bne- 10b - eieio - tlbie r3 - sync - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ - mtmsr r10 - SYNC_601 - isync -#else /* CONFIG_SMP */ - tlbie r3 - sync -#endif /* CONFIG_SMP */ - blr - -/* - * Flush the entire TLB. 603/603e only - */ -_GLOBAL(_tlbia) -#if defined(CONFIG_SMP) - lwz r8,TASK_CPU(r2) - oris r8,r8,10 - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear DR */ - mtmsr r0 - SYNC_601 - isync - lis r9,mmu_hash_lock@h - ori r9,r9,mmu_hash_lock@l - tophys(r9,r9) -10: lwarx r7,0,r9 - cmpwi 0,r7,0 - bne- 10b - stwcx. r8,0,r9 - bne- 10b - sync - tlbia - sync - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ - mtmsr r10 - SYNC_601 - isync -#else /* CONFIG_SMP */ - sync - tlbia - sync -#endif /* CONFIG_SMP */ - blr diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c deleted file mode 100644 index 921c1e33e941..000000000000 --- a/arch/powerpc/mm/mmu_context_hash32.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * This file contains the routines for handling the MMU on those - * PowerPC implementations where the MMU substantially follows the - * architecture specification. This includes the 6xx, 7xx, 7xxx, - * and 8260 implementations but excludes the 8xx and 4xx. 
- * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include - -#include - -/* - * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs - * (virtual segment identifiers) for each context. Although the - * hardware supports 24-bit VSIDs, and thus >1 million contexts, - * we only use 32,768 of them. That is ample, since there can be - * at most around 30,000 tasks in the system anyway, and it means - * that we can use a bitmap to indicate which contexts are in use. - * Using a bitmap means that we entirely avoid all of the problems - * that we used to have when the context number overflowed, - * particularly on SMP systems. - * -- paulus. - */ -#define NO_CONTEXT ((unsigned long) -1) -#define LAST_CONTEXT 32767 -#define FIRST_CONTEXT 1 - -/* - * This function defines the mapping from contexts to VSIDs (virtual - * segment IDs). We use a skew on both the context and the high 4 bits - * of the 32-bit virtual address (the "effective segment ID") in order - * to spread out the entries in the MMU hash table. Note, if this - * function is changed then arch/ppc/mm/hashtable.S will have to be - * changed to correspond. - * - * - * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \ - * & 0xffffff) - */ - -static unsigned long next_mmu_context; -static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; - -unsigned long __init_new_context(void) -{ - unsigned long ctx = next_mmu_context; - - while (test_and_set_bit(ctx, context_map)) { - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); - if (ctx > LAST_CONTEXT) - ctx = 0; - } - next_mmu_context = (ctx + 1) & LAST_CONTEXT; - - return ctx; -} -EXPORT_SYMBOL_GPL(__init_new_context); - -/* - * Set up the context for a new address space. - */ -int init_new_context(struct task_struct *t, struct mm_struct *mm) -{ - mm->context.id = __init_new_context(); - - return 0; -} - -/* - * Free a context ID. Make sure to call this with preempt disabled! - */ -void __destroy_context(unsigned long ctx) -{ - clear_bit(ctx, context_map); -} -EXPORT_SYMBOL_GPL(__destroy_context); - -/* - * We're finished using the context for an address space. - */ -void destroy_context(struct mm_struct *mm) -{ - preempt_disable(); - if (mm->context.id != NO_CONTEXT) { - __destroy_context(mm->context.id); - mm->context.id = NO_CONTEXT; - } - preempt_enable(); -} - -/* - * Initialize the context management stuff. - */ -void __init mmu_context_init(void) -{ - /* Reserve context 0 for kernel use */ - context_map[0] = (1 << FIRST_CONTEXT) - 1; - next_mmu_context = FIRST_CONTEXT; -} diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c deleted file mode 100644 index 1db55159031c..000000000000 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ /dev/null @@ -1,419 +0,0 @@ -/* - * This file contains the routines for handling the MMU on those - * PowerPC implementations where the MMU substantially follows the - * architecture specification. 
This includes the 6xx, 7xx, 7xxx, - * and 8260 implementations but excludes the 8xx and 4xx. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -struct hash_pte *Hash, *Hash_end; -unsigned long Hash_size, Hash_mask; -unsigned long _SDR1; - -struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ - -struct batrange { /* stores address ranges mapped by BATs */ - unsigned long start; - unsigned long limit; - phys_addr_t phys; -} bat_addrs[8]; - -/* - * Return PA for this VA if it is mapped by a BAT, or 0 - */ -phys_addr_t v_block_mapped(unsigned long va) -{ - int b; - for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) - if (va >= bat_addrs[b].start && va < bat_addrs[b].limit) - return bat_addrs[b].phys + (va - bat_addrs[b].start); - return 0; -} - -/* - * Return VA for a given PA or 0 if not mapped - */ -unsigned long p_block_mapped(phys_addr_t pa) -{ - int b; - for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b) - if (pa >= bat_addrs[b].phys - && pa < (bat_addrs[b].limit-bat_addrs[b].start) - +bat_addrs[b].phys) - return bat_addrs[b].start+(pa-bat_addrs[b].phys); - return 0; -} - -static int find_free_bat(void) -{ - int b; - - if (cpu_has_feature(CPU_FTR_601)) { - for (b = 0; b < 4; b++) { - struct ppc_bat *bat = BATS[b]; - - if (!(bat[0].batl & 0x40)) - return b; - } - } else { - int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; - - for (b = 0; b < n; b++) { - struct ppc_bat *bat = BATS[b]; - - if (!(bat[1].batu & 3)) - return b; - } - } - return -1; -} - -static unsigned int block_size(unsigned long base, unsigned long top) -{ - unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; - unsigned int base_shift = (fls(base) - 1) & 31; - unsigned int block_shift = (fls(top - base) - 1) & 31; - - return min3(max_size, 1U << base_shift, 1U << block_shift); -} - -/* - * Set up one of the IBAT (block address translation) register pairs. - * The parameters are not checked; in particular size must be a power - * of 2 between 128k and 256M. - * Only for 603+ ... - */ -static void setibat(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, pgprot_t prot) -{ - unsigned int bl = (size >> 17) - 1; - int wimgxpp; - struct ppc_bat *bat = BATS[index]; - unsigned long flags = pgprot_val(prot); - - if (!cpu_has_feature(CPU_FTR_NEED_COHERENT)) - flags &= ~_PAGE_COHERENT; - - wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? 
BPP_RX : BPP_XX); - bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ - bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) - bat[0].batu |= 1; /* Vp = 1 */ -} - -static void clearibat(int index) -{ - struct ppc_bat *bat = BATS[index]; - - bat[0].batu = 0; - bat[0].batl = 0; -} - -static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top) -{ - int idx; - - while ((idx = find_free_bat()) != -1 && base != top) { - unsigned int size = block_size(base, top); - - if (size < 128 << 10) - break; - setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); - base += size; - } - - return base; -} - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - int done; - unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; - - if (__map_without_bats) { - pr_debug("RAM mapped without BATs\n"); - return base; - } - - if (!strict_kernel_rwx_enabled() || base >= border || top <= border) - return __mmu_mapin_ram(base, top); - - done = __mmu_mapin_ram(base, border); - if (done != border - base) - return done; - - return done + __mmu_mapin_ram(border, top); -} - -void mmu_mark_initmem_nx(void) -{ - int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; - int i; - unsigned long base = (unsigned long)_stext - PAGE_OFFSET; - unsigned long top = (unsigned long)_etext - PAGE_OFFSET; - unsigned long size; - - if (cpu_has_feature(CPU_FTR_601)) - return; - - for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { - size = block_size(base, top); - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); - base += size; - } - if (base < top) { - size = block_size(base, top); - size = max(size, 128UL << 10); - if ((top - base) > size) { - if (strict_kernel_rwx_enabled()) - pr_warn("Kernel _etext not properly aligned\n"); - size <<= 1; - } - setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); - base += size; - } - for (; i < nb; i++) - clearibat(i); - - update_bats(); - - for (i = TASK_SIZE >> 28; i < 16; i++) { - /* Do not set NX on VM space for modules */ - if (IS_ENABLED(CONFIG_MODULES) && - (VMALLOC_START & 0xf0000000) == i << 28) - break; - mtsrin(mfsrin(i << 28) | 0x10000000, i << 28); - } -} - -void mmu_mark_rodata_ro(void) -{ - int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; - int i; - - if (cpu_has_feature(CPU_FTR_601)) - return; - - for (i = 0; i < nb; i++) { - struct ppc_bat *bat = BATS[i]; - - if (bat_addrs[i].start < (unsigned long)__init_begin) - bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; - } - - update_bats(); -} - -/* - * Set up one of the I/D BAT (block address translation) register pairs. - * The parameters are not checked; in particular size must be a power - * of 2 between 128k and 256M. - * On 603+, only set IBAT when _PAGE_EXEC is set - */ -void __init setbat(int index, unsigned long virt, phys_addr_t phys, - unsigned int size, pgprot_t prot) -{ - unsigned int bl; - int wimgxpp; - struct ppc_bat *bat = BATS[index]; - unsigned long flags = pgprot_val(prot); - - if ((flags & _PAGE_NO_CACHE) || - (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) - flags &= ~_PAGE_COHERENT; - - bl = (size >> 17) - 1; - if (PVR_VER(mfspr(SPRN_PVR)) != 1) { - /* 603, 604, etc. */ - /* Do DBAT first */ - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT | _PAGE_GUARDED); - wimgxpp |= (flags & _PAGE_RW)? 
BPP_RW: BPP_RX; - bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ - bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) - bat[1].batu |= 1; /* Vp = 1 */ - if (flags & _PAGE_GUARDED) { - /* G bit must be zero in IBATs */ - flags &= ~_PAGE_EXEC; - } - if (flags & _PAGE_EXEC) - bat[0] = bat[1]; - else - bat[0].batu = bat[0].batl = 0; - } else { - /* 601 cpu */ - if (bl > BL_8M) - bl = BL_8M; - wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE - | _PAGE_COHERENT); - wimgxpp |= (flags & _PAGE_RW)? - ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX; - bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ - bat->batl = phys | bl | 0x40; /* V=1 */ - } - - bat_addrs[index].start = virt; - bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; - bat_addrs[index].phys = phys; -} - -/* - * Preload a translation in the hash table - */ -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) -{ - pmd_t *pmd; - - if (!Hash) - return; - pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); - if (!pmd_none(*pmd)) - add_hash_page(mm->context.id, ea, pmd_val(*pmd)); -} - -/* - * Initialize the hash table and patch the instructions in hashtable.S. - */ -void __init MMU_init_hw(void) -{ - unsigned int hmask, mb, mb2; - unsigned int n_hpteg, lg_n_hpteg; - - if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return; - - if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105); - -#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */ -#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10) -#define MIN_N_HPTEG 1024 /* min 64kB hash table */ - - /* - * Allow 1 HPTE (1/8 HPTEG) for each page of memory. - * This is less than the recommended amount, but then - * Linux ain't AIX. - */ - n_hpteg = total_memory / (PAGE_SIZE * 8); - if (n_hpteg < MIN_N_HPTEG) - n_hpteg = MIN_N_HPTEG; - lg_n_hpteg = __ilog2(n_hpteg); - if (n_hpteg & (n_hpteg - 1)) { - ++lg_n_hpteg; /* round up if not power of 2 */ - n_hpteg = 1 << lg_n_hpteg; - } - Hash_size = n_hpteg << LG_HPTEG_SIZE; - - /* - * Find some memory for the hash table. 
- */ - if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = memblock_alloc(Hash_size, Hash_size); - if (!Hash) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, Hash_size, Hash_size); - _SDR1 = __pa(Hash) | SDR1_LOW_BITS; - - Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); - - printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", - (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); - - - /* - * Patch up the instructions in hashtable.S:create_hpte - */ - if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); - Hash_mask = n_hpteg - 1; - hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); - mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; - if (lg_n_hpteg > 16) - mb2 = 16 - LG_HPTEG_SIZE; - - modify_instruction_site(&patch__hash_page_A0, 0xffff, - ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); - modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); - modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); - - /* - * Patch up the instructions in hashtable.S:flush_hash_page - */ - modify_instruction_site(&patch__flush_hash_A0, 0xffff, - ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); - modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); - - if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* 601 can only access 16MB at the moment */ - if (PVR_VER(mfspr(SPRN_PVR)) == 1) - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); - else /* Anything else has 256M mapped */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); -} - -#ifdef CONFIG_PPC_KUEP -void __init setup_kuep(bool disabled) -{ - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - if (cpu_has_feature(CPU_FTR_601)) - pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); - - if (disabled) - pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); -} -#endif - -#ifdef CONFIG_PPC_KUAP -void __init setup_kuap(bool disabled) -{ - pr_info("Activating Kernel Userspace Access Protection\n"); - - if (disabled) - pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n"); -} -#endif diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c deleted file mode 100644 index 8d56f0417f87..000000000000 --- a/arch/powerpc/mm/tlb_hash32.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * This file contains the routines for TLB flushing. - * On machines where the MMU uses a hash table to store virtual to - * physical translations, these routines flush entries from the - * hash table also. 
- * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* - * Called when unmapping pages to flush entries from the TLB/hash table. - */ -void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) -{ - unsigned long ptephys; - - if (Hash) { - ptephys = __pa(ptep) & PAGE_MASK; - flush_hash_pages(mm->context.id, addr, ptephys, 1); - } -} -EXPORT_SYMBOL(flush_hash_entry); - -/* - * Called at the end of a mmu_gather operation to make sure the - * TLB flush is completely done. - */ -void tlb_flush(struct mmu_gather *tlb) -{ - if (!Hash) { - /* - * 603 needs to flush the whole TLB here since - * it doesn't use a hash table. - */ - _tlbia(); - } -} - -/* - * TLB flushing: - * - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes kernel pages - * - * since the hardware hash table functions as an extension of the - * tlb as far as the linux tables are concerned, flush it too. - * -- Cort - */ - -static void flush_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - pmd_t *pmd; - unsigned long pmd_end; - int count; - unsigned int ctx = mm->context.id; - - if (!Hash) { - _tlbia(); - return; - } - start &= PAGE_MASK; - if (start >= end) - return; - end = (end - 1) | ~PAGE_MASK; - pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); - for (;;) { - pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; - if (pmd_end > end) - pmd_end = end; - if (!pmd_none(*pmd)) { - count = ((pmd_end - start) >> PAGE_SHIFT) + 1; - flush_hash_pages(ctx, start, pmd_val(*pmd), count); - } - if (pmd_end == end) - break; - start = pmd_end + 1; - ++pmd; - } -} - -/* - * Flush kernel TLB entries in the given range - */ -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_range(&init_mm, start, end); -} -EXPORT_SYMBOL(flush_tlb_kernel_range); - -/* - * Flush all the (user) entries for the address space described by mm. - */ -void flush_tlb_mm(struct mm_struct *mm) -{ - struct vm_area_struct *mp; - - if (!Hash) { - _tlbia(); - return; - } - - /* - * It is safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_sem. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. - */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) - flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); -} -EXPORT_SYMBOL(flush_tlb_mm); - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - struct mm_struct *mm; - pmd_t *pmd; - - if (!Hash) { - _tlbie(vmaddr); - return; - } - mm = (vmaddr < TASK_SIZE)? 
vma->vm_mm: &init_mm; - pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); - if (!pmd_none(*pmd)) - flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); -} -EXPORT_SYMBOL(flush_tlb_page); - -/* - * For each address in the range, find the pte for the address - * and check _PAGE_HASHPTE bit; if it is set, find and destroy - * the corresponding HPTE. - */ -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - flush_range(vma->vm_mm, start, end); -} -EXPORT_SYMBOL(flush_tlb_range); - -void __init early_init_mmu(void) -{ -} -- cgit v1.2.3-59-g8ed1b From 27e23b5f5f6f22292347901303aab2a1d458bcb5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 10:00:02 +0000 Subject: powerpc/mm: Move nohash specifics in subdirectory mm/nohash Many files in arch/powerpc/mm are only for nohash. This patch creates a subdirectory for them. Signed-off-by: Christophe Leroy [mpe: Shorten new filenames] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/40x_mmu.c | 159 ---- arch/powerpc/mm/44x_mmu.c | 246 ----- arch/powerpc/mm/8xx_mmu.c | 239 ----- arch/powerpc/mm/Makefile | 17 +- arch/powerpc/mm/fsl_booke_mmu.c | 326 ------- arch/powerpc/mm/hugetlbpage-book3e.c | 206 ----- arch/powerpc/mm/mmu_context_nohash.c | 497 ----------- arch/powerpc/mm/nohash/40x.c | 159 ++++ arch/powerpc/mm/nohash/44x.c | 246 +++++ arch/powerpc/mm/nohash/8xx.c | 239 +++++ arch/powerpc/mm/nohash/Makefile | 18 + arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 206 +++++ arch/powerpc/mm/nohash/book3e_pgtable.c | 120 +++ arch/powerpc/mm/nohash/fsl_booke.c | 326 +++++++ arch/powerpc/mm/nohash/mmu_context.c | 497 +++++++++++ arch/powerpc/mm/nohash/tlb.c | 810 +++++++++++++++++ arch/powerpc/mm/nohash/tlb_low.S | 491 ++++++++++ arch/powerpc/mm/nohash/tlb_low_64e.S | 1280 +++++++++++++++++++++++++++ arch/powerpc/mm/pgtable-book3e.c | 120 --- arch/powerpc/mm/tlb_low_64e.S | 1280 --------------------------- arch/powerpc/mm/tlb_nohash.c | 810 ----------------- arch/powerpc/mm/tlb_nohash_low.S | 491 ---------- 22 files changed, 4393 insertions(+), 4390 deletions(-) delete mode 100644 arch/powerpc/mm/40x_mmu.c delete mode 100644 arch/powerpc/mm/44x_mmu.c delete mode 100644 arch/powerpc/mm/8xx_mmu.c delete mode 100644 arch/powerpc/mm/fsl_booke_mmu.c delete mode 100644 arch/powerpc/mm/hugetlbpage-book3e.c delete mode 100644 arch/powerpc/mm/mmu_context_nohash.c create mode 100644 arch/powerpc/mm/nohash/40x.c create mode 100644 arch/powerpc/mm/nohash/44x.c create mode 100644 arch/powerpc/mm/nohash/8xx.c create mode 100644 arch/powerpc/mm/nohash/Makefile create mode 100644 arch/powerpc/mm/nohash/book3e_hugetlbpage.c create mode 100644 arch/powerpc/mm/nohash/book3e_pgtable.c create mode 100644 arch/powerpc/mm/nohash/fsl_booke.c create mode 100644 arch/powerpc/mm/nohash/mmu_context.c create mode 100644 arch/powerpc/mm/nohash/tlb.c create mode 100644 arch/powerpc/mm/nohash/tlb_low.S create mode 100644 arch/powerpc/mm/nohash/tlb_low_64e.S delete mode 100644 arch/powerpc/mm/pgtable-book3e.c delete mode 100644 arch/powerpc/mm/tlb_low_64e.S delete mode 100644 arch/powerpc/mm/tlb_nohash.c delete mode 100644 arch/powerpc/mm/tlb_nohash_low.S (limited to 'arch') diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c deleted file mode 100644 index 460459b6f53e..000000000000 --- a/arch/powerpc/mm/40x_mmu.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * This file contains the routines for initializing the MMU - * on the 4xx series of chips. 
- * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -extern int __map_without_ltlbs; -/* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. - */ -void __init MMU_init_hw(void) -{ - /* - * The Zone Protection Register (ZPR) defines how protection will - * be applied to every page which is a member of a given zone. At - * present, we utilize only two of the 4xx's zones. - * The zone index bits (of ZSEL) in the PTE are used for software - * indicators, except the LSB. For user access, zone 1 is used, - * for kernel access, zone 0 is used. We set all but zone 1 - * to zero, allowing only kernel access as indicated in the PTE. - * For zone 1, we set a 01 binary (a value of 10 will not work) - * to allow user access as indicated in the PTE. This also allows - * kernel access as indicated in the PTE. - */ - - mtspr(SPRN_ZPR, 0x10000000); - - flush_instruction_cache(); - - /* - * Set up the real-mode cache parameters for the exception vector - * handlers (which are run in real-mode). - */ - - mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */ - - /* - * Cache instruction and data space where the exception - * vectors and the kernel live in real-mode. - */ - - mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */ - mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */ -} - -#define LARGE_PAGE_SIZE_16M (1<<24) -#define LARGE_PAGE_SIZE_4M (1<<22) - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - unsigned long v, s, mapped; - phys_addr_t p; - - v = KERNELBASE; - p = 0; - s = total_lowmem; - - if (__map_without_ltlbs) - return 0; - - while (s >= LARGE_PAGE_SIZE_16M) { - pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; - - pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - *pmdp++ = __pmd(val); - - v += LARGE_PAGE_SIZE_16M; - p += LARGE_PAGE_SIZE_16M; - s -= LARGE_PAGE_SIZE_16M; - } - - while (s >= LARGE_PAGE_SIZE_4M) { - pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; - - pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); - *pmdp = __pmd(val); - - v += LARGE_PAGE_SIZE_4M; - p += LARGE_PAGE_SIZE_4M; - s -= LARGE_PAGE_SIZE_4M; - } - - mapped = total_lowmem - s; - - /* If the size of RAM is not an exact power of two, we may not - * have covered RAM in its entirety with 16 and 4 MiB - * pages. 
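- * For example (figures illustrative): with 41 MiB of lowmem the - * loops above pin two 16 MiB and two 4 MiB pages, leaving a 1 MiB - * tail to be covered with normal pages.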
Consequently, restrict the top end of RAM currently - * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" - * coverage with normal-sized pages (or other reasons) do not - * attempt to allocate outside the allowed range. - */ - memblock_set_current_limit(mapped); - - return mapped; -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* 40x can only access 16MB at the moment (see head_40x.S) */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); -} diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c deleted file mode 100644 index c07983ebc02e..000000000000 --- a/arch/powerpc/mm/44x_mmu.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Modifications by Matt Porter (mporter@mvista.com) to support - * PPC44x Book E processors. - * - * This file contains the routines for initializing the MMU - * on the 4xx series of chips. - * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include - -#include -#include -#include -#include - -#include - -/* Used by the 44x TLB replacement exception handler. - * Just needed it declared someplace. - */ -unsigned int tlb_44x_index; /* = 0 */ -unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS; -int icache_44x_need_flush; - -unsigned long tlb_47x_boltmap[1024/8]; - -static void ppc44x_update_tlb_hwater(void) -{ - /* The TLB miss handlers hard codes the watermark in a cmpli - * instruction to improve performances rather than loading it - * from the global variable. 
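- * (Only the 16-bit immediate field of each cmpli is rewritten, hence - * the 0xffff mask passed to modify_instruction_site() below.)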
Thus, we patch the instructions - * in the 2 TLB miss handlers when updating the value - */ - modify_instruction_site(&patch__tlb_44x_hwater_D, 0xffff, tlb_44x_hwater); - modify_instruction_site(&patch__tlb_44x_hwater_I, 0xffff, tlb_44x_hwater); -} - -/* - * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU - */ -static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys) -{ - unsigned int entry = tlb_44x_hwater--; - - ppc44x_update_tlb_hwater(); - - mtspr(SPRN_MMUCR, 0); - - __asm__ __volatile__( - "tlbwe %2,%3,%4\n" - "tlbwe %1,%3,%5\n" - "tlbwe %0,%3,%6\n" - : - : "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G), - "r" (phys), - "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M), - "r" (entry), - "i" (PPC44x_TLB_PAGEID), - "i" (PPC44x_TLB_XLAT), - "i" (PPC44x_TLB_ATTRIB)); -} - -static int __init ppc47x_find_free_bolted(void) -{ - unsigned int mmube0 = mfspr(SPRN_MMUBE0); - unsigned int mmube1 = mfspr(SPRN_MMUBE1); - - if (!(mmube0 & MMUBE0_VBE0)) - return 0; - if (!(mmube0 & MMUBE0_VBE1)) - return 1; - if (!(mmube0 & MMUBE0_VBE2)) - return 2; - if (!(mmube1 & MMUBE1_VBE3)) - return 3; - if (!(mmube1 & MMUBE1_VBE4)) - return 4; - if (!(mmube1 & MMUBE1_VBE5)) - return 5; - return -1; -} - -static void __init ppc47x_update_boltmap(void) -{ - unsigned int mmube0 = mfspr(SPRN_MMUBE0); - unsigned int mmube1 = mfspr(SPRN_MMUBE1); - - if (mmube0 & MMUBE0_VBE0) - __set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube0 & MMUBE0_VBE1) - __set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube0 & MMUBE0_VBE2) - __set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube1 & MMUBE1_VBE3) - __set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube1 & MMUBE1_VBE4) - __set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff, - tlb_47x_boltmap); - if (mmube1 & MMUBE1_VBE5) - __set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff, - tlb_47x_boltmap); -} - -/* - * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU - */ -static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) -{ - unsigned int rA; - int bolted; - - /* Base rA is HW way select, way 0, bolted bit set */ - rA = 0x88000000; - - /* Look for a bolted entry slot */ - bolted = ppc47x_find_free_bolted(); - BUG_ON(bolted < 0); - - /* Insert bolted slot number */ - rA |= bolted << 24; - - pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n", - virt, phys, bolted); - - mtspr(SPRN_MMUCR, 0); - - __asm__ __volatile__( - "tlbwe %2,%3,0\n" - "tlbwe %1,%3,1\n" - "tlbwe %0,%3,2\n" - : - : "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR | - PPC47x_TLB2_SX -#ifdef CONFIG_SMP - | PPC47x_TLB2_M -#endif - ), - "r" (phys), - "r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M), - "r" (rA)); -} - -void __init MMU_init_hw(void) -{ - /* This is not useful on 47x but won't hurt either */ - ppc44x_update_tlb_hwater(); - - flush_instruction_cache(); -} - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - unsigned long addr; - unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); - - /* Pin in enough TLBs to cover any lowmem not covered by the - * initial 256M mapping established in head_44x.S */ - for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; - addr += PPC_PIN_SIZE) { - if (mmu_has_feature(MMU_FTR_TYPE_47x)) - ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); - else - ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); - } - if (mmu_has_feature(MMU_FTR_TYPE_47x)) { - ppc47x_update_boltmap(); - -#ifdef 
DEBUG - { - int i; - - printk(KERN_DEBUG "bolted entries: "); - for (i = 0; i < 255; i++) { - if (test_bit(i, tlb_47x_boltmap)) - printk("%d ", i); - } - printk("\n"); - } -#endif /* DEBUG */ - } - return total_lowmem; -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - u64 size; - -#ifndef CONFIG_NONSTATIC_KERNEL - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); -#endif - - /* 44x has a 256M TLB entry pinned at boot */ - size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE)); - memblock_set_current_limit(first_memblock_base + size); -} - -#ifdef CONFIG_SMP -void __init mmu_init_secondary(int cpu) -{ - unsigned long addr; - unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); - - /* Pin in enough TLBs to cover any lowmem not covered by the - * initial 256M mapping established in head_44x.S - * - * WARNING: This is called with only the first 256M of the - * linear mapping in the TLB and we can't take faults yet - * so beware of what this code uses. It runs off a temporary - * stack. current (r2) isn't initialized, smp_processor_id() - * will not work, current thread info isn't accessible, ... - */ - for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; - addr += PPC_PIN_SIZE) { - if (mmu_has_feature(MMU_FTR_TYPE_47x)) - ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); - else - ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); - } -} -#endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c deleted file mode 100644 index 70d55b615b62..000000000000 --- a/arch/powerpc/mm/8xx_mmu.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * This file contains the routines for initializing the MMU - * on the 8xx series of chips. - * -- christophe - * - * Derived from arch/powerpc/mm/40x_mmu.c: - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include - -#include - -#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) - -extern int __map_without_ltlbs; - -static unsigned long block_mapped_ram; - -/* - * Return PA for this VA if it is in an area mapped with LTLBs. - * Otherwise, returns 0 - */ -phys_addr_t v_block_mapped(unsigned long va) -{ - unsigned long p = PHYS_IMMR_BASE; - - if (__map_without_ltlbs) - return 0; - if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) - return p + va - VIRT_IMMR_BASE; - if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) - return __pa(va); - return 0; -} - -/* - * Return VA for a given PA mapped with LTLBs or 0 if not mapped - */ -unsigned long p_block_mapped(phys_addr_t pa) -{ - unsigned long p = PHYS_IMMR_BASE; - - if (__map_without_ltlbs) - return 0; - if (pa >= p && pa < p + IMMR_SIZE) - return VIRT_IMMR_BASE + pa - p; - if (pa < block_mapped_ram) - return (unsigned long)__va(pa); - return 0; -} - -#define LARGE_PAGE_SIZE_8M (1<<23) - -/* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. - */ -void __init MMU_init_hw(void) -{ - /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ - if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) { - unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; - unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; - int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 
29 : 28; - unsigned long addr = 0; - unsigned long mem = total_lowmem; - - for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { - mtspr(SPRN_MD_CTR, ctr | (i << 8)); - mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); - mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); - mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); - addr += LARGE_PAGE_SIZE_8M; - mem -= LARGE_PAGE_SIZE_8M; - } - } -} - -static void __init mmu_mapin_immr(void) -{ - unsigned long p = PHYS_IMMR_BASE; - unsigned long v = VIRT_IMMR_BASE; - int offset; - - for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) - map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); -} - -static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped) -{ - modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); -} - -static void mmu_patch_addis(s32 *site, long simm) -{ - unsigned int instr = *(unsigned int *)patch_site_addr(site); - - instr &= 0xffff0000; - instr |= ((unsigned long)simm) >> 16; - patch_instruction_site(site, instr); -} - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - unsigned long mapped; - - if (__map_without_ltlbs) { - mapped = 0; - mmu_mapin_immr(); - if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR)) - patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); - if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); - } else { - mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); - if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, - _ALIGN(__pa(_einittext), 8 << 20)); - } - - mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); - mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped); - - /* If the size of RAM is not an exact power of two, we may not - * have covered RAM in its entirety with 8 MiB - * pages. Consequently, restrict the top end of RAM currently - * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" - * coverage with normal-sized pages (or other reasons) do not - * attempt to allocate outside the allowed range. - */ - if (mapped) - memblock_set_current_limit(mapped); - - block_mapped_ram = mapped; - - return mapped; -} - -void mmu_mark_initmem_nx(void) -{ - if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) - mmu_patch_addis(&patch__itlbmiss_linmem_top8, - -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); - if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); -} - -#ifdef CONFIG_STRICT_KERNEL_RWX -void mmu_mark_rodata_ro(void) -{ - if (CONFIG_DATA_SHIFT < 23) - mmu_patch_addis(&patch__dtlbmiss_romem_top8, - -__pa(((unsigned long)_sinittext) & - ~(LARGE_PAGE_SIZE_8M - 1))); - mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); -} -#endif - -void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* 8xx can only access 32MB at the moment */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000)); -} - -/* - * Set up to use a given MMU context. - * id is context number, pgd is PGD pointer. - * - * We place the physical address of the new task page directory loaded - * into the MMU base register, and set the ASID compare register with - * the new "context." 
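- * - * As a worked example (values illustrative): switching to context 5 - * loads __pa(pgd) minus the lower part of __pa(swapper_pg_dir) into - * SPRN_M_TWB, and writes 4 (i.e. id - 1) into SPRN_M_CASID.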
- */ -void set_context(unsigned long id, pgd_t *pgd) -{ - s16 offset = (s16)(__pa(swapper_pg_dir)); - - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is passed as second argument. - */ - if (IS_ENABLED(CONFIG_BDI_SWITCH)) - abatron_pteptrs[1] = pgd; - - /* Register M_TWB will contain base address of level 1 table minus the - * lower part of the kernel PGDIR base address, so that all accesses to - * level 1 table are done relative to lower part of kernel PGDIR base - * address. - */ - mtspr(SPRN_M_TWB, __pa(pgd) - offset); - - /* Update context */ - mtspr(SPRN_M_CASID, id - 1); - /* sync */ - mb(); -} - -void flush_instruction_cache(void) -{ - isync(); - mtspr(SPRN_IC_CST, IDC_INVALL); - isync(); -} - -#ifdef CONFIG_PPC_KUEP -void __init setup_kuep(bool disabled) -{ - if (disabled) - return; - - pr_info("Activating Kernel Userspace Execution Prevention\n"); - - mtspr(SPRN_MI_AP, MI_APG_KUEP); -} -#endif - -#ifdef CONFIG_PPC_KUAP -void __init setup_kuap(bool disabled) -{ - pr_info("Activating Kernel Userspace Access Protection\n"); - - if (disabled) - pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); - - mtspr(SPRN_MD_AP, MD_APG_KUAP); -} -#endif diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 68cb1e840b5e..08557bae6fa1 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -8,30 +8,15 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o -obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ - tlb_nohash_low.o -obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o -obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o +obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_40x) += 40x_mmu.o -obj-$(CONFIG_44x) += 44x_mmu.o -obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o -ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o -endif obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ - -# Disable kcov instrumentation on sensitive code -# This is necessary for booting with kcov enabled on book3e machines -KCOV_INSTRUMENT_tlb_nohash.o := n -KCOV_INSTRUMENT_fsl_booke_mmu.o := n diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c deleted file mode 100644 index 71a1a36751dd..000000000000 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Modifications by Kumar Gala (galak@kernel.crashing.org) to support - * E500 Book E processors. - * - * Copyright 2004,2010 Freescale Semiconductor, Inc. - * - * This file contains the routines for initializing the MMU - * on the 4xx series of chips. 
- * -- paulus - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -unsigned int tlbcam_index; - -#define NUM_TLBCAMS (64) -struct tlbcam TLBCAM[NUM_TLBCAMS]; - -struct tlbcamrange { - unsigned long start; - unsigned long limit; - phys_addr_t phys; -} tlbcam_addrs[NUM_TLBCAMS]; - -unsigned long tlbcam_sz(int idx) -{ - return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; -} - -#ifdef CONFIG_FSL_BOOKE -/* - * Return PA for this VA if it is mapped by a CAM, or 0 - */ -phys_addr_t v_block_mapped(unsigned long va) -{ - int b; - for (b = 0; b < tlbcam_index; ++b) - if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) - return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); - return 0; -} - -/* - * Return VA for a given PA or 0 if not mapped - */ -unsigned long p_block_mapped(phys_addr_t pa) -{ - int b; - for (b = 0; b < tlbcam_index; ++b) - if (pa >= tlbcam_addrs[b].phys - && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) - +tlbcam_addrs[b].phys) - return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); - return 0; -} -#endif - -/* - * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; - * in particular size must be a power of 4 between 4k and the max supported by - * an implementation; max may further be limited by what can be represented in - * an unsigned long (for example, 32-bit implementations cannot support a 4GB - * size). - */ -static void settlbcam(int index, unsigned long virt, phys_addr_t phys, - unsigned long size, unsigned long flags, unsigned int pid) -{ - unsigned int tsize; - - tsize = __ilog2(size) - 10; - -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) - if ((flags & _PAGE_NO_CACHE) == 0) - flags |= _PAGE_COHERENT; -#endif - - TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); - TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); - TLBCAM[index].MAS2 = virt & PAGE_MASK; - - TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; - TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; - - TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - TLBCAM[index].MAS7 = (u64)phys >> 32; - - /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(__pte(flags))) { - TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? 
MAS3_UW : 0); - } - - tlbcam_addrs[index].start = virt; - tlbcam_addrs[index].limit = virt + size - 1; - tlbcam_addrs[index].phys = phys; -} - -unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, - phys_addr_t phys) -{ - unsigned int camsize = __ilog2(ram); - unsigned int align = __ffs(virt | phys); - unsigned long max_cam; - - if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - /* Convert (4^max) kB to (2^max) bytes */ - max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; - camsize &= ~1U; - align &= ~1U; - } else { - /* Convert (2^max) kB to (2^max) bytes */ - max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; - } - - if (camsize > align) - camsize = align; - if (camsize > max_cam) - camsize = max_cam; - - return 1UL << camsize; -} - -static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, - unsigned long ram, int max_cam_idx, - bool dryrun) -{ - int i; - unsigned long amount_mapped = 0; - - /* Calculate CAM values */ - for (i = 0; ram && i < max_cam_idx; i++) { - unsigned long cam_sz; - - cam_sz = calc_cam_sz(ram, virt, phys); - if (!dryrun) - settlbcam(i, virt, phys, cam_sz, - pgprot_val(PAGE_KERNEL_X), 0); - - ram -= cam_sz; - amount_mapped += cam_sz; - virt += cam_sz; - phys += cam_sz; - } - - if (dryrun) - return amount_mapped; - - loadcam_multi(0, i, max_cam_idx); - tlbcam_index = i; - -#ifdef CONFIG_PPC64 - get_paca()->tcd.esel_next = i; - get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - get_paca()->tcd.esel_first = i; -#endif - - return amount_mapped; -} - -unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) -{ - unsigned long virt = PAGE_OFFSET; - phys_addr_t phys = memstart_addr; - - return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun); -} - -#ifdef CONFIG_PPC32 - -#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) -#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" -#endif - -unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) -{ - return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; -} - -/* - * MMU_init_hw does the chip-specific initialization of the MMU hardware. 
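- * On FSL Book3E the CAM entries are set up separately (see - * adjust_total_lowmem() below), so only the instruction cache needs - * flushing here.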
- */ -void __init MMU_init_hw(void) -{ - flush_instruction_cache(); -} - -void __init adjust_total_lowmem(void) -{ - unsigned long ram; - int i; - - /* adjust lowmem size to __max_low_memory */ - ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); - - i = switch_to_as1(); - __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false); - restore_to_as0(i, 0, 0, 1); - - pr_info("Memory CAM mapping: "); - for (i = 0; i < tlbcam_index - 1; i++) - pr_cont("%lu/", tlbcam_sz(i) >> 20); - pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, - (unsigned int)((total_lowmem - __max_low_memory) >> 20)); - - memblock_set_current_limit(memstart_addr + __max_low_memory); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - phys_addr_t limit = first_memblock_base + first_memblock_size; - - /* 64M mapped initially according to head_fsl_booke.S */ - memblock_set_current_limit(min_t(u64, limit, 0x04000000)); -} - -#ifdef CONFIG_RELOCATABLE -int __initdata is_second_reloc; -notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) -{ - unsigned long base = KERNELBASE; - - kernstart_addr = start; - if (is_second_reloc) { - virt_phys_offset = PAGE_OFFSET - memstart_addr; - return; - } - - /* - * Relocatable kernel support based on processing of dynamic - * relocation entries. Before we get the real memstart_addr, - * We will compute the virt_phys_offset like this: - * virt_phys_offset = stext.run - kernstart_addr - * - * stext.run = (KERNELBASE & ~0x3ffffff) + - * (kernstart_addr & 0x3ffffff) - * When we relocate, we have : - * - * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) - * - * hence: - * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - - * (kernstart_addr & ~0x3ffffff) - * - */ - start &= ~0x3ffffff; - base &= ~0x3ffffff; - virt_phys_offset = base - start; - early_get_first_memblock_info(__va(dt_ptr), NULL); - /* - * We now get the memstart_addr, then we should check if this - * address is the same as what the PAGE_OFFSET map to now. If - * not we have to change the map of PAGE_OFFSET to memstart_addr - * and do a second relocation. - */ - if (start != memstart_addr) { - int n; - long offset = start - memstart_addr; - - is_second_reloc = 1; - n = switch_to_as1(); - /* map a 64M area for the second relocation */ - if (memstart_addr > start) - map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, - false); - else - map_mem_in_cams_addr(start, PAGE_OFFSET + offset, - 0x4000000, CONFIG_LOWMEM_CAM_NUM, - false); - restore_to_as0(n, offset, __va(dt_ptr), 1); - /* We should never reach here */ - panic("Relocation error"); - } -} -#endif -#endif diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c deleted file mode 100644 index f84ec46cdb26..000000000000 --- a/arch/powerpc/mm/hugetlbpage-book3e.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PPC Huge TLB Page Support for Book3E MMU - * - * Copyright (C) 2009 David Gibson, IBM Corporation. 
- * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor - * - */ -#include -#include - -#include - -#ifdef CONFIG_PPC_FSL_BOOK3E -#ifdef CONFIG_PPC64 -static inline int tlb1_next(void) -{ - struct paca_struct *paca = get_paca(); - struct tlb_core_data *tcd; - int this, next; - - tcd = paca->tcd_ptr; - this = tcd->esel_next; - - next = this + 1; - if (next >= tcd->esel_max) - next = tcd->esel_first; - - tcd->esel_next = next; - return this; -} -#else -static inline int tlb1_next(void) -{ - int index, ncams; - - ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - index = this_cpu_read(next_tlbcam_idx); - - /* Just round-robin the entries and wrap when we hit the end */ - if (unlikely(index == ncams - 1)) - __this_cpu_write(next_tlbcam_idx, tlbcam_index); - else - __this_cpu_inc(next_tlbcam_idx); - - return index; -} -#endif /* !PPC64 */ -#endif /* FSL */ - -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} - -#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64) -#include - -static inline void book3e_tlb_lock(void) -{ - struct paca_struct *paca = get_paca(); - unsigned long tmp; - int token = smp_processor_id() + 1; - - /* - * Besides being unnecessary in the absence of SMT, this - * check prevents trying to do lbarx/stbcx. on e5500 which - * doesn't implement either feature. - */ - if (!cpu_has_feature(CPU_FTR_SMT)) - return; - - asm volatile("1: lbarx %0, 0, %1;" - "cmpwi %0, 0;" - "bne 2f;" - "stbcx. %2, 0, %1;" - "bne 1b;" - "b 3f;" - "2: lbzx %0, 0, %1;" - "cmpwi %0, 0;" - "bne 2b;" - "b 1b;" - "3:" - : "=&r" (tmp) - : "r" (&paca->tcd_ptr->lock), "r" (token) - : "memory"); -} - -static inline void book3e_tlb_unlock(void) -{ - struct paca_struct *paca = get_paca(); - - if (!cpu_has_feature(CPU_FTR_SMT)) - return; - - isync(); - paca->tcd_ptr->lock = 0; -} -#else -static inline void book3e_tlb_lock(void) -{ -} - -static inline void book3e_tlb_unlock(void) -{ -} -#endif - -static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) -{ - int found = 0; - - mtspr(SPRN_MAS6, pid << 16); - if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { - asm volatile( - "li %0,0\n" - "tlbsx. 0,%1\n" - "bne 1f\n" - "li %0,1\n" - "1:\n" - : "=&r"(found) : "r"(ea)); - } else { - asm volatile( - "tlbsx 0,%1\n" - "mfspr %0,0x271\n" - "srwi %0,%0,31\n" - : "=&r"(found) : "r"(ea)); - } - - return found; -} - -void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, - pte_t pte) -{ - unsigned long mas1, mas2; - u64 mas7_3; - unsigned long psize, tsize, shift; - unsigned long flags; - struct mm_struct *mm; - -#ifdef CONFIG_PPC_FSL_BOOK3E - int index; -#endif - - if (unlikely(is_kernel_addr(ea))) - return; - - mm = vma->vm_mm; - - psize = vma_mmu_pagesize(vma); - shift = __ilog2(psize); - tsize = shift - 10; - /* - * We can't be interrupted while we're setting up the MAS - * regusters or after we've confirmed that no tlb exists. 
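- * (Hence the local_irq_save()/local_irq_restore() pair around the - * MAS setup and tlbwe below.)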
- */ - local_irq_save(flags); - - book3e_tlb_lock(); - - if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { - book3e_tlb_unlock(); - local_irq_restore(flags); - return; - } - -#ifdef CONFIG_PPC_FSL_BOOK3E - /* We have to use the CAM(TLB1) on FSL parts for hugepages */ - index = tlb1_next(); - mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); -#endif - - mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); - mas2 = ea & ~((1UL << shift) - 1); - mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; - mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; - if (!pte_dirty(pte)) - mas7_3 &= ~(MAS3_SW|MAS3_UW); - - mtspr(SPRN_MAS1, mas1); - mtspr(SPRN_MAS2, mas2); - - if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { - mtspr(SPRN_MAS7_MAS3, mas7_3); - } else { - if (mmu_has_feature(MMU_FTR_BIG_PHYS)) - mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); - mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); - } - - asm volatile ("tlbwe"); - - book3e_tlb_unlock(); - local_irq_restore(flags); -} - -void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - struct hstate *hstate = hstate_file(vma->vm_file); - unsigned long tsize = huge_page_shift(hstate) - 10; - - __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); -} diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c deleted file mode 100644 index ae4505d5b4b8..000000000000 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ /dev/null @@ -1,497 +0,0 @@ -/* - * This file contains the routines for handling the MMU on those - * PowerPC implementations where the MMU is not using the hash - * table, such as 8xx, 4xx, BookE's etc... - * - * Copyright 2008 Ben Herrenschmidt - * IBM Corp. - * - * Derived from previous arch/powerpc/mm/mmu_context.c - * and arch/powerpc/include/asm/mmu_context.h - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * TODO: - * - * - The global context lock will not scale very well - * - The maps should be dynamically allocated to allow for processors - * that support more PID bits at runtime - * - Implement flush_tlb_mm() by making the context stale and picking - * a new one - * - More aggressively clear stale map bits and maybe find some way to - * also clear mm->cpu_vm_mask bits when processes are migrated - */ - -//#define DEBUG_MAP_CONSISTENCY -//#define DEBUG_CLAMP_LAST_CONTEXT 31 -//#define DEBUG_HARDER - -/* We don't use DEBUG because it tends to be compiled in always nowadays - * and this would generate way too much output - */ -#ifdef DEBUG_HARDER -#define pr_hard(args...) printk(KERN_DEBUG args) -#define pr_hardcont(args...) printk(KERN_CONT args) -#else -#define pr_hard(args...) do { } while(0) -#define pr_hardcont(args...) do { } while(0) -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* - * The MPC8xx has only 16 contexts. We rotate through them on each task switch. - * A better way would be to keep track of tasks that own contexts, and implement - * an LRU usage. That way very active tasks don't always have to pay the TLB - * reload overhead. The kernel pages are mapped shared, so the kernel can run on - * behalf of any task that makes a kernel entry. 
Shared does not mean they are - * not protected, just that the ASID comparison is not performed. -- Dan - * - * The IBM4xx has 256 contexts, so we can just rotate through these as a way of - * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison - * is disabled, so we can use a TID of zero to represent all kernel pages as - * shared among all contexts. -- Dan - * - * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should - * normally never have to steal though the facility is present if needed. - * -- BenH - */ -#define FIRST_CONTEXT 1 -#ifdef DEBUG_CLAMP_LAST_CONTEXT -#define LAST_CONTEXT DEBUG_CLAMP_LAST_CONTEXT -#elif defined(CONFIG_PPC_8xx) -#define LAST_CONTEXT 16 -#elif defined(CONFIG_PPC_47x) -#define LAST_CONTEXT 65535 -#else -#define LAST_CONTEXT 255 -#endif - -static unsigned int next_context, nr_free_contexts; -static unsigned long *context_map; -#ifdef CONFIG_SMP -static unsigned long *stale_map[NR_CPUS]; -#endif -static struct mm_struct **context_mm; -static DEFINE_RAW_SPINLOCK(context_lock); - -#define CTX_MAP_SIZE \ - (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1)) - - -/* Steal a context from a task that has one at the moment. - * - * This is used when we are running out of available PID numbers - * on the processors. - * - * This isn't an LRU system, it just frees up each context in - * turn (sort-of pseudo-random replacement :). This would be the - * place to implement an LRU scheme if anyone was motivated to do it. - * -- paulus - * - * For context stealing, we use a slightly different approach for - * SMP and UP. Basically, the UP one is simpler and doesn't use - * the stale map as we can just flush the local CPU - * -- benh - */ -#ifdef CONFIG_SMP -static unsigned int steal_context_smp(unsigned int id) -{ - struct mm_struct *mm; - unsigned int cpu, max, i; - - max = LAST_CONTEXT - FIRST_CONTEXT; - - /* Attempt to free next_context first and then loop until we manage */ - while (max--) { - /* Pick up the victim mm */ - mm = context_mm[id]; - - /* We have a candidate victim, check if it's active, on SMP - * we cannot steal active contexts - */ - if (mm->context.active) { - id++; - if (id > LAST_CONTEXT) - id = FIRST_CONTEXT; - continue; - } - pr_hardcont(" | steal %d from 0x%p", id, mm); - - /* Mark this mm has having no context anymore */ - mm->context.id = MMU_NO_CONTEXT; - - /* Mark it stale on all CPUs that used this mm. For threaded - * implementations, we set it on all threads on each core - * represented in the mask. A future implementation will use - * a core map instead but this will do for now. 
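- * (The nested loop below therefore visits every hardware thread of - * each core present in mm_cpumask(), and the "cpu = i - 1" step skips - * the remaining siblings so each core is walked once.)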
- */ - for_each_cpu(cpu, mm_cpumask(mm)) { - for (i = cpu_first_thread_sibling(cpu); - i <= cpu_last_thread_sibling(cpu); i++) { - if (stale_map[i]) - __set_bit(id, stale_map[i]); - } - cpu = i - 1; - } - return id; - } - - /* This will happen if you have more CPUs than available contexts, - * all we can do here is wait a bit and try again - */ - raw_spin_unlock(&context_lock); - cpu_relax(); - raw_spin_lock(&context_lock); - - /* This will cause the caller to try again */ - return MMU_NO_CONTEXT; -} -#endif /* CONFIG_SMP */ - -static unsigned int steal_all_contexts(void) -{ - struct mm_struct *mm; -#ifdef CONFIG_SMP - int cpu = smp_processor_id(); -#endif - unsigned int id; - - for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) { - /* Pick up the victim mm */ - mm = context_mm[id]; - - pr_hardcont(" | steal %d from 0x%p", id, mm); - - /* Mark this mm as having no context anymore */ - mm->context.id = MMU_NO_CONTEXT; - if (id != FIRST_CONTEXT) { - context_mm[id] = NULL; - __clear_bit(id, context_map); -#ifdef DEBUG_MAP_CONSISTENCY - mm->context.active = 0; -#endif - } -#ifdef CONFIG_SMP - __clear_bit(id, stale_map[cpu]); -#endif - } - - /* Flush the TLB for all contexts (not to be used on SMP) */ - _tlbil_all(); - - nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT; - - return FIRST_CONTEXT; -} - -/* Note that this will also be called on SMP if all other CPUs are - * offlined, which means that it may be called for cpu != 0. For - * this to work, we somewhat assume that CPUs that are onlined - * come up with a fully clean TLB (or are cleaned when offlined) - */ -static unsigned int steal_context_up(unsigned int id) -{ - struct mm_struct *mm; -#ifdef CONFIG_SMP - int cpu = smp_processor_id(); -#endif - - /* Pick up the victim mm */ - mm = context_mm[id]; - - pr_hardcont(" | steal %d from 0x%p", id, mm); - - /* Flush the TLB for that context */ - local_flush_tlb_mm(mm); - - /* Mark this mm has having no context anymore */ - mm->context.id = MMU_NO_CONTEXT; - - /* XXX This clear should ultimately be part of local_flush_tlb_mm */ -#ifdef CONFIG_SMP - __clear_bit(id, stale_map[cpu]); -#endif - - return id; -} - -#ifdef DEBUG_MAP_CONSISTENCY -static void context_check_map(void) -{ - unsigned int id, nrf, nact; - - nrf = nact = 0; - for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) { - int used = test_bit(id, context_map); - if (!used) - nrf++; - if (used != (context_mm[id] != NULL)) - pr_err("MMU: Context %d is %s and MM is %p !\n", - id, used ? "used" : "free", context_mm[id]); - if (context_mm[id] != NULL) - nact += context_mm[id]->context.active; - } - if (nrf != nr_free_contexts) { - pr_err("MMU: Free context count out of sync ! (%d vs %d)\n", - nr_free_contexts, nrf); - nr_free_contexts = nrf; - } - if (nact > num_online_cpus()) - pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n", - nact, num_online_cpus()); - if (FIRST_CONTEXT > 0 && !test_bit(0, context_map)) - pr_err("MMU: Context 0 has been freed !!!\n"); -} -#else -static void context_check_map(void) { } -#endif - -void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned int id; -#ifdef CONFIG_SMP - unsigned int i, cpu = smp_processor_id(); -#endif - unsigned long *map; - - /* No lockless fast path .. 
yet */ - raw_spin_lock(&context_lock); - - pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", - cpu, next, next->context.active, next->context.id); - -#ifdef CONFIG_SMP - /* Mark us active and the previous one not anymore */ - next->context.active++; - if (prev) { - pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); - WARN_ON(prev->context.active < 1); - prev->context.active--; - } - - again: -#endif /* CONFIG_SMP */ - - /* If we already have a valid assigned context, skip all that */ - id = next->context.id; - if (likely(id != MMU_NO_CONTEXT)) { -#ifdef DEBUG_MAP_CONSISTENCY - if (context_mm[id] != next) - pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", - next, id, id, context_mm[id]); -#endif - goto ctxt_ok; - } - - /* We really don't have a context, let's try to acquire one */ - id = next_context; - if (id > LAST_CONTEXT) - id = FIRST_CONTEXT; - map = context_map; - - /* No more free contexts, let's try to steal one */ - if (nr_free_contexts == 0) { -#ifdef CONFIG_SMP - if (num_online_cpus() > 1) { - id = steal_context_smp(id); - if (id == MMU_NO_CONTEXT) - goto again; - goto stolen; - } -#endif /* CONFIG_SMP */ - if (IS_ENABLED(CONFIG_PPC_8xx)) - id = steal_all_contexts(); - else - id = steal_context_up(id); - goto stolen; - } - nr_free_contexts--; - - /* We know there's at least one free context, try to find it */ - while (__test_and_set_bit(id, map)) { - id = find_next_zero_bit(map, LAST_CONTEXT+1, id); - if (id > LAST_CONTEXT) - id = FIRST_CONTEXT; - } - stolen: - next_context = id + 1; - context_mm[id] = next; - next->context.id = id; - pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); - - context_check_map(); - ctxt_ok: - - /* If that context got marked stale on this CPU, then flush the - * local TLB for it and unmark it before we use it - */ -#ifdef CONFIG_SMP - if (test_bit(id, stale_map[cpu])) { - pr_hardcont(" | stale flush %d [%d..%d]", - id, cpu_first_thread_sibling(cpu), - cpu_last_thread_sibling(cpu)); - - local_flush_tlb_mm(next); - - /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - for (i = cpu_first_thread_sibling(cpu); - i <= cpu_last_thread_sibling(cpu); i++) { - if (stale_map[i]) - __clear_bit(id, stale_map[i]); - } - } -#endif - - /* Flick the MMU and release lock */ - pr_hardcont(" -> %d\n", id); - set_context(id, next->pgd); - raw_spin_unlock(&context_lock); -} - -/* - * Set up the context for a new address space. - */ -int init_new_context(struct task_struct *t, struct mm_struct *mm) -{ - pr_hard("initing context for mm @%p\n", mm); - - /* - * We have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we properly - * initialize context slice details for newly allocated mm's (which will - * have id == 0) and don't alter context slice inherited via fork (which - * will have id != 0). - */ - if (mm->context.id == 0) - slice_init_new_context_exec(mm); - mm->context.id = MMU_NO_CONTEXT; - mm->context.active = 0; - pte_frag_set(&mm->context, NULL); - return 0; -} - -/* - * We're finished using the context for an address space. 
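- * The context id is returned to context_map and nr_free_contexts, so a - * later switch_mmu_context() can reuse it.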
- */ -void destroy_context(struct mm_struct *mm) -{ - unsigned long flags; - unsigned int id; - - if (mm->context.id == MMU_NO_CONTEXT) - return; - - WARN_ON(mm->context.active != 0); - - raw_spin_lock_irqsave(&context_lock, flags); - id = mm->context.id; - if (id != MMU_NO_CONTEXT) { - __clear_bit(id, context_map); - mm->context.id = MMU_NO_CONTEXT; -#ifdef DEBUG_MAP_CONSISTENCY - mm->context.active = 0; -#endif - context_mm[id] = NULL; - nr_free_contexts++; - } - raw_spin_unlock_irqrestore(&context_lock, flags); -} - -#ifdef CONFIG_SMP -static int mmu_ctx_cpu_prepare(unsigned int cpu) -{ - /* We don't touch CPU 0 map, it's allocated at aboot and kept - * around forever - */ - if (cpu == boot_cpuid) - return 0; - - pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); - stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); - return 0; -} - -static int mmu_ctx_cpu_dead(unsigned int cpu) -{ -#ifdef CONFIG_HOTPLUG_CPU - if (cpu == boot_cpuid) - return 0; - - pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); - kfree(stale_map[cpu]); - stale_map[cpu] = NULL; - - /* We also clear the cpu_vm_mask bits of CPUs going away */ - clear_tasks_mm_cpumask(cpu); -#endif - return 0; -} - -#endif /* CONFIG_SMP */ - -/* - * Initialize the context management stuff. - */ -void __init mmu_context_init(void) -{ - /* Mark init_mm as being active on all possible CPUs since - * we'll get called with prev == init_mm the first time - * we schedule on a given CPU - */ - init_mm.context.active = NR_CPUS; - - /* - * Allocate the maps used by context management - */ - context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!context_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), - SMP_CACHE_BYTES); - if (!context_mm) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(void *) * (LAST_CONTEXT + 1)); -#ifdef CONFIG_SMP - stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!stale_map[boot_cpuid]) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - - cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, - "powerpc/mmu/ctx:prepare", - mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead); -#endif - - printk(KERN_INFO - "MMU: Allocated %zu bytes of context maps for %d contexts\n", - 2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)), - LAST_CONTEXT - FIRST_CONTEXT + 1); - - /* - * Some processors have too few contexts to reserve one for - * init_mm, and require using context 0 for a normal task. - * Other processors reserve the use of context zero for the kernel. - * This code assumes FIRST_CONTEXT < 32. - */ - context_map[0] = (1 << FIRST_CONTEXT) - 1; - next_context = FIRST_CONTEXT; - nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1; -} diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c new file mode 100644 index 000000000000..460459b6f53e --- /dev/null +++ b/arch/powerpc/mm/nohash/40x.c @@ -0,0 +1,159 @@ +/* + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. 
+ * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +extern int __map_without_ltlbs; +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + /* + * The Zone Protection Register (ZPR) defines how protection will + * be applied to every page which is a member of a given zone. At + * present, we utilize only two of the 4xx's zones. + * The zone index bits (of ZSEL) in the PTE are used for software + * indicators, except the LSB. For user access, zone 1 is used, + * for kernel access, zone 0 is used. We set all but zone 1 + * to zero, allowing only kernel access as indicated in the PTE. + * For zone 1, we set a 01 binary (a value of 10 will not work) + * to allow user access as indicated in the PTE. This also allows + * kernel access as indicated in the PTE. + */ + + mtspr(SPRN_ZPR, 0x10000000); + + flush_instruction_cache(); + + /* + * Set up the real-mode cache parameters for the exception vector + * handlers (which are run in real-mode). + */ + + mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */ + + /* + * Cache instruction and data space where the exception + * vectors and the kernel live in real-mode. + */ + + mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */ + mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */ +} + +#define LARGE_PAGE_SIZE_16M (1<<24) +#define LARGE_PAGE_SIZE_4M (1<<22) + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + unsigned long v, s, mapped; + phys_addr_t p; + + v = KERNELBASE; + p = 0; + s = total_lowmem; + + if (__map_without_ltlbs) + return 0; + + while (s >= LARGE_PAGE_SIZE_16M) { + pmd_t *pmdp; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; + + pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + *pmdp++ = __pmd(val); + *pmdp++ = __pmd(val); + *pmdp++ = __pmd(val); + *pmdp++ = __pmd(val); + + v += LARGE_PAGE_SIZE_16M; + p += LARGE_PAGE_SIZE_16M; + s -= LARGE_PAGE_SIZE_16M; + } + + while (s >= LARGE_PAGE_SIZE_4M) { + pmd_t *pmdp; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; + + pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + *pmdp = __pmd(val); + + v += LARGE_PAGE_SIZE_4M; + p += LARGE_PAGE_SIZE_4M; + s -= LARGE_PAGE_SIZE_4M; + } + + mapped = total_lowmem - s; + + /* If the size of RAM is not an exact power of two, we may not + * have covered RAM in its entirety with 16 and 4 MiB + * pages. 
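+ * For example (figures illustrative): with 41 MiB of lowmem the + * loops above pin two 16 MiB and two 4 MiB pages, leaving a 1 MiB + * tail to be covered with normal pages.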
Consequently, restrict the top end of RAM currently + * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" + * coverage with normal-sized pages (or other reasons) do not + * attempt to allocate outside the allowed range. + */ + memblock_set_current_limit(mapped); + + return mapped; +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 40x can only access 16MB at the moment (see head_40x.S) */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); +} diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c new file mode 100644 index 000000000000..c07983ebc02e --- /dev/null +++ b/arch/powerpc/mm/nohash/44x.c @@ -0,0 +1,246 @@ +/* + * Modifications by Matt Porter (mporter@mvista.com) to support + * PPC44x Book E processors. + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include + +#include +#include +#include +#include + +#include + +/* Used by the 44x TLB replacement exception handler. + * Just needed it declared someplace. + */ +unsigned int tlb_44x_index; /* = 0 */ +unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS; +int icache_44x_need_flush; + +unsigned long tlb_47x_boltmap[1024/8]; + +static void ppc44x_update_tlb_hwater(void) +{ + /* The TLB miss handlers hard codes the watermark in a cmpli + * instruction to improve performances rather than loading it + * from the global variable. 
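+ * (Only the 16-bit immediate field of each cmpli is rewritten, hence + * the 0xffff mask passed to modify_instruction_site() below.)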
Thus, we patch the instructions + * in the 2 TLB miss handlers when updating the value + */ + modify_instruction_site(&patch__tlb_44x_hwater_D, 0xffff, tlb_44x_hwater); + modify_instruction_site(&patch__tlb_44x_hwater_I, 0xffff, tlb_44x_hwater); +} + +/* + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU + */ +static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys) +{ + unsigned int entry = tlb_44x_hwater--; + + ppc44x_update_tlb_hwater(); + + mtspr(SPRN_MMUCR, 0); + + __asm__ __volatile__( + "tlbwe %2,%3,%4\n" + "tlbwe %1,%3,%5\n" + "tlbwe %0,%3,%6\n" + : + : "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G), + "r" (phys), + "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M), + "r" (entry), + "i" (PPC44x_TLB_PAGEID), + "i" (PPC44x_TLB_XLAT), + "i" (PPC44x_TLB_ATTRIB)); +} + +static int __init ppc47x_find_free_bolted(void) +{ + unsigned int mmube0 = mfspr(SPRN_MMUBE0); + unsigned int mmube1 = mfspr(SPRN_MMUBE1); + + if (!(mmube0 & MMUBE0_VBE0)) + return 0; + if (!(mmube0 & MMUBE0_VBE1)) + return 1; + if (!(mmube0 & MMUBE0_VBE2)) + return 2; + if (!(mmube1 & MMUBE1_VBE3)) + return 3; + if (!(mmube1 & MMUBE1_VBE4)) + return 4; + if (!(mmube1 & MMUBE1_VBE5)) + return 5; + return -1; +} + +static void __init ppc47x_update_boltmap(void) +{ + unsigned int mmube0 = mfspr(SPRN_MMUBE0); + unsigned int mmube1 = mfspr(SPRN_MMUBE1); + + if (mmube0 & MMUBE0_VBE0) + __set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube0 & MMUBE0_VBE1) + __set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube0 & MMUBE0_VBE2) + __set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE3) + __set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE4) + __set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE5) + __set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff, + tlb_47x_boltmap); +} + +/* + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU + */ +static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +{ + unsigned int rA; + int bolted; + + /* Base rA is HW way select, way 0, bolted bit set */ + rA = 0x88000000; + + /* Look for a bolted entry slot */ + bolted = ppc47x_find_free_bolted(); + BUG_ON(bolted < 0); + + /* Insert bolted slot number */ + rA |= bolted << 24; + + pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n", + virt, phys, bolted); + + mtspr(SPRN_MMUCR, 0); + + __asm__ __volatile__( + "tlbwe %2,%3,0\n" + "tlbwe %1,%3,1\n" + "tlbwe %0,%3,2\n" + : + : "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR | + PPC47x_TLB2_SX +#ifdef CONFIG_SMP + | PPC47x_TLB2_M +#endif + ), + "r" (phys), + "r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M), + "r" (rA)); +} + +void __init MMU_init_hw(void) +{ + /* This is not useful on 47x but won't hurt either */ + ppc44x_update_tlb_hwater(); + + flush_instruction_cache(); +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + unsigned long addr; + unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); + + /* Pin in enough TLBs to cover any lowmem not covered by the + * initial 256M mapping established in head_44x.S */ + for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) { + if (mmu_has_feature(MMU_FTR_TYPE_47x)) + ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); + else + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + } + if (mmu_has_feature(MMU_FTR_TYPE_47x)) { + ppc47x_update_boltmap(); + +#ifdef 
DEBUG + { + int i; + + printk(KERN_DEBUG "bolted entries: "); + for (i = 0; i < 255; i++) { + if (test_bit(i, tlb_47x_boltmap)) + printk("%d ", i); + } + printk("\n"); + } +#endif /* DEBUG */ + } + return total_lowmem; +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + u64 size; + +#ifndef CONFIG_NONSTATIC_KERNEL + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); +#endif + + /* 44x has a 256M TLB entry pinned at boot */ + size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE)); + memblock_set_current_limit(first_memblock_base + size); +} + +#ifdef CONFIG_SMP +void __init mmu_init_secondary(int cpu) +{ + unsigned long addr; + unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); + + /* Pin in enough TLBs to cover any lowmem not covered by the + * initial 256M mapping established in head_44x.S + * + * WARNING: This is called with only the first 256M of the + * linear mapping in the TLB and we can't take faults yet + * so beware of what this code uses. It runs off a temporary + * stack. current (r2) isn't initialized, smp_processor_id() + * will not work, current thread info isn't accessible, ... + */ + for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) { + if (mmu_has_feature(MMU_FTR_TYPE_47x)) + ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); + else + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + } +} +#endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c new file mode 100644 index 000000000000..70d55b615b62 --- /dev/null +++ b/arch/powerpc/mm/nohash/8xx.c @@ -0,0 +1,239 @@ +/* + * This file contains the routines for initializing the MMU + * on the 8xx series of chips. + * -- christophe + * + * Derived from arch/powerpc/mm/40x_mmu.c: + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include + +#include + +#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) + +extern int __map_without_ltlbs; + +static unsigned long block_mapped_ram; + +/* + * Return PA for this VA if it is in an area mapped with LTLBs. + * Otherwise, returns 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + unsigned long p = PHYS_IMMR_BASE; + + if (__map_without_ltlbs) + return 0; + if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) + return p + va - VIRT_IMMR_BASE; + if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) + return __pa(va); + return 0; +} + +/* + * Return VA for a given PA mapped with LTLBs or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + unsigned long p = PHYS_IMMR_BASE; + + if (__map_without_ltlbs) + return 0; + if (pa >= p && pa < p + IMMR_SIZE) + return VIRT_IMMR_BASE + pa - p; + if (pa < block_mapped_ram) + return (unsigned long)__va(pa); + return 0; +} + +#define LARGE_PAGE_SIZE_8M (1<<23) + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ + if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) { + unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; + int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 
29 : 28; + unsigned long addr = 0; + unsigned long mem = total_lowmem; + + for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { + mtspr(SPRN_MD_CTR, ctr | (i << 8)); + mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); + mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); + mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); + addr += LARGE_PAGE_SIZE_8M; + mem -= LARGE_PAGE_SIZE_8M; + } + } +} + +static void __init mmu_mapin_immr(void) +{ + unsigned long p = PHYS_IMMR_BASE; + unsigned long v = VIRT_IMMR_BASE; + int offset; + + for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) + map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); +} + +static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped) +{ + modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); +} + +static void mmu_patch_addis(s32 *site, long simm) +{ + unsigned int instr = *(unsigned int *)patch_site_addr(site); + + instr &= 0xffff0000; + instr |= ((unsigned long)simm) >> 16; + patch_instruction_site(site, instr); +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + unsigned long mapped; + + if (__map_without_ltlbs) { + mapped = 0; + mmu_mapin_immr(); + if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR)) + patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); + } else { + mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, + _ALIGN(__pa(_einittext), 8 << 20)); + } + + mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); + mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped); + + /* If the size of RAM is not an exact power of two, we may not + * have covered RAM in its entirety with 8 MiB + * pages. Consequently, restrict the top end of RAM currently + * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" + * coverage with normal-sized pages (or other reasons) do not + * attempt to allocate outside the allowed range. + */ + if (mapped) + memblock_set_current_limit(mapped); + + block_mapped_ram = mapped; + + return mapped; +} + +void mmu_mark_initmem_nx(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) + mmu_patch_addis(&patch__itlbmiss_linmem_top8, + -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mmu_mark_rodata_ro(void) +{ + if (CONFIG_DATA_SHIFT < 23) + mmu_patch_addis(&patch__dtlbmiss_romem_top8, + -__pa(((unsigned long)_sinittext) & + ~(LARGE_PAGE_SIZE_8M - 1))); + mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); +} +#endif + +void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 8xx can only access 32MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000)); +} + +/* + * Set up to use a given MMU context. + * id is context number, pgd is PGD pointer. + * + * We place the physical address of the new task page directory loaded + * into the MMU base register, and set the ASID compare register with + * the new "context." 
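
An aside on the CASID write below (a hypothetical helper, not part of the patch, assuming the 8xx's 16 hardware contexts and the software convention of starting at id 1):

	/* software context ids 1..16 map onto hardware CASID 0..15 */
	static inline unsigned long id_to_casid(unsigned long id)
	{
		return id - 1;	/* id 1 -> CASID 0, id 16 -> CASID 15 */
	}
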
+ */ +void set_context(unsigned long id, pgd_t *pgd) +{ + s16 offset = (s16)(__pa(swapper_pg_dir)); + + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is passed as second argument. + */ + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + abatron_pteptrs[1] = pgd; + + /* Register M_TWB will contain base address of level 1 table minus the + * lower part of the kernel PGDIR base address, so that all accesses to + * level 1 table are done relative to lower part of kernel PGDIR base + * address. + */ + mtspr(SPRN_M_TWB, __pa(pgd) - offset); + + /* Update context */ + mtspr(SPRN_M_CASID, id - 1); + /* sync */ + mb(); +} + +void flush_instruction_cache(void) +{ + isync(); + mtspr(SPRN_IC_CST, IDC_INVALL); + isync(); +} + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + if (disabled) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + mtspr(SPRN_MI_AP, MI_APG_KUEP); +} +#endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); + + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} +#endif diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile new file mode 100644 index 000000000000..b2228ff81b8a --- /dev/null +++ b/arch/powerpc/mm/nohash/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) + +obj-y += mmu_context.o tlb.o tlb_low.o +obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o +obj-$(CONFIG_40x) += 40x.o +obj-$(CONFIG_44x) += 44x.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_BOOK3E_MMU) += book3e_hugetlbpage.o +endif + +# Disable kcov instrumentation on sensitive code +# This is necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_fsl_booke.o := n diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c new file mode 100644 index 000000000000..f84ec46cdb26 --- /dev/null +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. 
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
+ *
+ */
+#include 
+#include 
+
+#include 
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC64
+static inline int tlb1_next(void)
+{
+	struct paca_struct *paca = get_paca();
+	struct tlb_core_data *tcd;
+	int this, next;
+
+	tcd = paca->tcd_ptr;
+	this = tcd->esel_next;
+
+	next = this + 1;
+	if (next >= tcd->esel_max)
+		next = tcd->esel_first;
+
+	tcd->esel_next = next;
+	return this;
+}
+#else
+static inline int tlb1_next(void)
+{
+	int index, ncams;
+
+	ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+	index = this_cpu_read(next_tlbcam_idx);
+
+	/* Just round-robin the entries and wrap when we hit the end */
+	if (unlikely(index == ncams - 1))
+		__this_cpu_write(next_tlbcam_idx, tlbcam_index);
+	else
+		__this_cpu_inc(next_tlbcam_idx);
+
+	return index;
+}
+#endif /* !PPC64 */
+#endif /* FSL */
+
+static inline int mmu_get_tsize(int psize)
+{
+	return mmu_psize_defs[psize].enc;
+}
+
+#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64)
+#include 
+
+static inline void book3e_tlb_lock(void)
+{
+	struct paca_struct *paca = get_paca();
+	unsigned long tmp;
+	int token = smp_processor_id() + 1;
+
+	/*
+	 * Besides being unnecessary in the absence of SMT, this
+	 * check prevents trying to do lbarx/stbcx. on e5500 which
+	 * doesn't implement either feature.
+	 */
+	if (!cpu_has_feature(CPU_FTR_SMT))
+		return;
+
+	asm volatile("1: lbarx %0, 0, %1;"
+		     "cmpwi %0, 0;"
+		     "bne 2f;"
+		     "stbcx. %2, 0, %1;"
+		     "bne 1b;"
+		     "b 3f;"
+		     "2: lbzx %0, 0, %1;"
+		     "cmpwi %0, 0;"
+		     "bne 2b;"
+		     "b 1b;"
+		     "3:"
+		     : "=&r" (tmp)
+		     : "r" (&paca->tcd_ptr->lock), "r" (token)
+		     : "memory");
+}
+
+static inline void book3e_tlb_unlock(void)
+{
+	struct paca_struct *paca = get_paca();
+
+	if (!cpu_has_feature(CPU_FTR_SMT))
+		return;
+
+	isync();
+	paca->tcd_ptr->lock = 0;
+}
+#else
+static inline void book3e_tlb_lock(void)
+{
+}
+
+static inline void book3e_tlb_unlock(void)
+{
+}
+#endif
+
+static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
+{
+	int found = 0;
+
+	mtspr(SPRN_MAS6, pid << 16);
+	if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
+		asm volatile(
+			"li	%0,0\n"
+			"tlbsx.	0,%1\n"
+			"bne	1f\n"
+			"li	%0,1\n"
+			"1:\n"
+			: "=&r"(found) : "r"(ea));
+	} else {
+		asm volatile(
+			"tlbsx	0,%1\n"
+			"mfspr	%0,0x271\n"
+			"srwi	%0,%0,31\n"
+			: "=&r"(found) : "r"(ea));
+	}
+
+	return found;
+}
+
+void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
+			    pte_t pte)
+{
+	unsigned long mas1, mas2;
+	u64 mas7_3;
+	unsigned long psize, tsize, shift;
+	unsigned long flags;
+	struct mm_struct *mm;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	int index;
+#endif
+
+	if (unlikely(is_kernel_addr(ea)))
+		return;
+
+	mm = vma->vm_mm;
+
+	psize = vma_mmu_pagesize(vma);
+	shift = __ilog2(psize);
+	tsize = shift - 10;
+	/*
+	 * We can't be interrupted while we're setting up the MAS
+	 * registers or after we've confirmed that no TLB entry exists. 
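
A quick illustration of the tsize value computed a few lines above (the helper name is hypothetical; only the convention is taken from the patch): tsize is log2 of the page size in KB, so a 4 MB huge page (shift 22) encodes as 12 and a 16 MB page (shift 24) as 14:

	static inline unsigned long shift_to_tsize(unsigned long shift)
	{
		return shift - 10;	/* 2^shift bytes -> log2(size in KB) */
	}
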
+ */ + local_irq_save(flags); + + book3e_tlb_lock(); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + book3e_tlb_unlock(); + local_irq_restore(flags); + return; + } + +#ifdef CONFIG_PPC_FSL_BOOK3E + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = tlb1_next(); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); +#endif + + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { + mtspr(SPRN_MAS7_MAS3, mas7_3); + } else { + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + } + + asm volatile ("tlbwe"); + + book3e_tlb_unlock(); + local_irq_restore(flags); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); +} diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c new file mode 100644 index 000000000000..f296c2e88b09 --- /dev/null +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -0,0 +1,120 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ +int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. 
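
One behaviour from the preload path above is worth spelling out (a restatement with a hypothetical helper name, not code from the patch): clearing MAS3_SW and MAS3_UW on clean PTEs makes the first store fault, which is how the generic mm code gets to mark the page dirty before write permission materializes in the TLB:

	static u64 mas3_strip_write_if_clean(pte_t pte, u64 mas7_3)
	{
		if (!pte_dirty(pte))
			mas7_3 &= ~(MAS3_SW | MAS3_UW);	/* fault on first store */
		return mas7_3;
	}
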
Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags))); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *ptr; + + ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT, + __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE); + + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n", + __func__, size, size, __pa(MAX_DMA_ADDRESS)); + + return ptr; +} + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + } else { + pgdp = pgd_offset_k(ea); +#ifndef __PAGETABLE_PUD_FOLDED + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* !__PAGETABLE_PUD_FOLDED */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + } + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); + + smp_wmb(); + return 0; +} diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_booke.c new file mode 100644 index 000000000000..71a1a36751dd --- /dev/null +++ b/arch/powerpc/mm/nohash/fsl_booke.c @@ -0,0 +1,326 @@ +/* + * Modifications by Kumar Gala (galak@kernel.crashing.org) to support + * E500 Book E processors. + * + * Copyright 2004,2010 Freescale Semiconductor, Inc. + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +unsigned int tlbcam_index; + +#define NUM_TLBCAMS (64) +struct tlbcam TLBCAM[NUM_TLBCAMS]; + +struct tlbcamrange { + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} tlbcam_addrs[NUM_TLBCAMS]; + +unsigned long tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + +#ifdef CONFIG_FSL_BOOKE +/* + * Return PA for this VA if it is mapped by a CAM, or 0 + */ +phys_addr_t v_block_mapped(unsigned long va) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) + return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_block_mapped(phys_addr_t pa) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (pa >= tlbcam_addrs[b].phys + && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) + +tlbcam_addrs[b].phys) + return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); + return 0; +} +#endif + +/* + * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; + * in particular size must be a power of 4 between 4k and the max supported by + * an implementation; max may further be limited by what can be represented in + * an unsigned long (for example, 32-bit implementations cannot support a 4GB + * size). + */ +static void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned long size, unsigned long flags, unsigned int pid) +{ + unsigned int tsize; + + tsize = __ilog2(size) - 10; + +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) + if ((flags & _PAGE_NO_CACHE) == 0) + flags |= _PAGE_COHERENT; +#endif + + TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); + TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); + TLBCAM[index].MAS2 = virt & PAGE_MASK; + + TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; + + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + TLBCAM[index].MAS7 = (u64)phys >> 32; + + /* Below is unlikely -- only for large user pages or similar */ + if (pte_user(__pte(flags))) { + TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? 
MAS3_UW : 0); + } + + tlbcam_addrs[index].start = virt; + tlbcam_addrs[index].limit = virt + size - 1; + tlbcam_addrs[index].phys = phys; +} + +unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) +{ + unsigned int camsize = __ilog2(ram); + unsigned int align = __ffs(virt | phys); + unsigned long max_cam; + + if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + /* Convert (4^max) kB to (2^max) bytes */ + max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; + camsize &= ~1U; + align &= ~1U; + } else { + /* Convert (2^max) kB to (2^max) bytes */ + max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; + } + + if (camsize > align) + camsize = align; + if (camsize > max_cam) + camsize = max_cam; + + return 1UL << camsize; +} + +static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, + unsigned long ram, int max_cam_idx, + bool dryrun) +{ + int i; + unsigned long amount_mapped = 0; + + /* Calculate CAM values */ + for (i = 0; ram && i < max_cam_idx; i++) { + unsigned long cam_sz; + + cam_sz = calc_cam_sz(ram, virt, phys); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, + pgprot_val(PAGE_KERNEL_X), 0); + + ram -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + + if (dryrun) + return amount_mapped; + + loadcam_multi(0, i, max_cam_idx); + tlbcam_index = i; + +#ifdef CONFIG_PPC64 + get_paca()->tcd.esel_next = i; + get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + get_paca()->tcd.esel_first = i; +#endif + + return amount_mapped; +} + +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) +{ + unsigned long virt = PAGE_OFFSET; + phys_addr_t phys = memstart_addr; + + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun); +} + +#ifdef CONFIG_PPC32 + +#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) +#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" +#endif + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; +} + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. 
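
A worked example of the clamping in calc_cam_sz() above may help (a sketch; the numbers are hypothetical): mapping 768 MB of RAM at virt 0xc0000000 / phys 0x10000000 gives __ilog2(ram) = 29 but __ffs(virt | phys) = 28, so the entry is capped at 1 << 28 = 256 MB and the loop in map_mem_in_cams_addr() carries on with the remaining 512 MB:

	unsigned long sz = calc_cam_sz(0x30000000, 0xc0000000, 0x10000000);
	/* sz == 0x10000000 (256 MB), assuming the hardware maximum allows it */
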
+ */ +void __init MMU_init_hw(void) +{ + flush_instruction_cache(); +} + +void __init adjust_total_lowmem(void) +{ + unsigned long ram; + int i; + + /* adjust lowmem size to __max_low_memory */ + ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); + + i = switch_to_as1(); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false); + restore_to_as0(i, 0, 0, 1); + + pr_info("Memory CAM mapping: "); + for (i = 0; i < tlbcam_index - 1; i++) + pr_cont("%lu/", tlbcam_sz(i) >> 20); + pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, + (unsigned int)((total_lowmem - __max_low_memory) >> 20)); + + memblock_set_current_limit(memstart_addr + __max_low_memory); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + phys_addr_t limit = first_memblock_base + first_memblock_size; + + /* 64M mapped initially according to head_fsl_booke.S */ + memblock_set_current_limit(min_t(u64, limit, 0x04000000)); +} + +#ifdef CONFIG_RELOCATABLE +int __initdata is_second_reloc; +notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) +{ + unsigned long base = KERNELBASE; + + kernstart_addr = start; + if (is_second_reloc) { + virt_phys_offset = PAGE_OFFSET - memstart_addr; + return; + } + + /* + * Relocatable kernel support based on processing of dynamic + * relocation entries. Before we get the real memstart_addr, + * We will compute the virt_phys_offset like this: + * virt_phys_offset = stext.run - kernstart_addr + * + * stext.run = (KERNELBASE & ~0x3ffffff) + + * (kernstart_addr & 0x3ffffff) + * When we relocate, we have : + * + * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) + * + * hence: + * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - + * (kernstart_addr & ~0x3ffffff) + * + */ + start &= ~0x3ffffff; + base &= ~0x3ffffff; + virt_phys_offset = base - start; + early_get_first_memblock_info(__va(dt_ptr), NULL); + /* + * We now get the memstart_addr, then we should check if this + * address is the same as what the PAGE_OFFSET map to now. If + * not we have to change the map of PAGE_OFFSET to memstart_addr + * and do a second relocation. + */ + if (start != memstart_addr) { + int n; + long offset = start - memstart_addr; + + is_second_reloc = 1; + n = switch_to_as1(); + /* map a 64M area for the second relocation */ + if (memstart_addr > start) + map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, + false); + else + map_mem_in_cams_addr(start, PAGE_OFFSET + offset, + 0x4000000, CONFIG_LOWMEM_CAM_NUM, + false); + restore_to_as0(n, offset, __va(dt_ptr), 1); + /* We should never reach here */ + panic("Relocation error"); + } +} +#endif +#endif diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c new file mode 100644 index 000000000000..ae4505d5b4b8 --- /dev/null +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -0,0 +1,497 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU is not using the hash + * table, such as 8xx, 4xx, BookE's etc... + * + * Copyright 2008 Ben Herrenschmidt + * IBM Corp. + * + * Derived from previous arch/powerpc/mm/mmu_context.c + * and arch/powerpc/include/asm/mmu_context.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * TODO: + * + * - The global context lock will not scale very well + * - The maps should be dynamically allocated to allow for processors + * that support more PID bits at runtime + * - Implement flush_tlb_mm() by making the context stale and picking + * a new one + * - More aggressively clear stale map bits and maybe find some way to + * also clear mm->cpu_vm_mask bits when processes are migrated + */ + +//#define DEBUG_MAP_CONSISTENCY +//#define DEBUG_CLAMP_LAST_CONTEXT 31 +//#define DEBUG_HARDER + +/* We don't use DEBUG because it tends to be compiled in always nowadays + * and this would generate way too much output + */ +#ifdef DEBUG_HARDER +#define pr_hard(args...) printk(KERN_DEBUG args) +#define pr_hardcont(args...) printk(KERN_CONT args) +#else +#define pr_hard(args...) do { } while(0) +#define pr_hardcont(args...) do { } while(0) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* + * The MPC8xx has only 16 contexts. We rotate through them on each task switch. + * A better way would be to keep track of tasks that own contexts, and implement + * an LRU usage. That way very active tasks don't always have to pay the TLB + * reload overhead. The kernel pages are mapped shared, so the kernel can run on + * behalf of any task that makes a kernel entry. Shared does not mean they are + * not protected, just that the ASID comparison is not performed. -- Dan + * + * The IBM4xx has 256 contexts, so we can just rotate through these as a way of + * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison + * is disabled, so we can use a TID of zero to represent all kernel pages as + * shared among all contexts. -- Dan + * + * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should + * normally never have to steal though the facility is present if needed. + * -- BenH + */ +#define FIRST_CONTEXT 1 +#ifdef DEBUG_CLAMP_LAST_CONTEXT +#define LAST_CONTEXT DEBUG_CLAMP_LAST_CONTEXT +#elif defined(CONFIG_PPC_8xx) +#define LAST_CONTEXT 16 +#elif defined(CONFIG_PPC_47x) +#define LAST_CONTEXT 65535 +#else +#define LAST_CONTEXT 255 +#endif + +static unsigned int next_context, nr_free_contexts; +static unsigned long *context_map; +#ifdef CONFIG_SMP +static unsigned long *stale_map[NR_CPUS]; +#endif +static struct mm_struct **context_mm; +static DEFINE_RAW_SPINLOCK(context_lock); + +#define CTX_MAP_SIZE \ + (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1)) + + +/* Steal a context from a task that has one at the moment. + * + * This is used when we are running out of available PID numbers + * on the processors. + * + * This isn't an LRU system, it just frees up each context in + * turn (sort-of pseudo-random replacement :). This would be the + * place to implement an LRU scheme if anyone was motivated to do it. + * -- paulus + * + * For context stealing, we use a slightly different approach for + * SMP and UP. 
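
An aside on CTX_MAP_SIZE above, shown as worked arithmetic (assuming 32-bit longs; the figures follow directly from the LAST_CONTEXT values defined in this file):

	/* 8xx  (LAST_CONTEXT 16):    4 * (16/32 + 1)    =    4 bytes */
	/* 44x  (LAST_CONTEXT 255):   4 * (255/32 + 1)   =   32 bytes */
	/* 47x  (LAST_CONTEXT 65535): 4 * (65535/32 + 1) = 8192 bytes */
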
Basically, the UP one is simpler and doesn't use
+ * the stale map as we can just flush the local CPU
+ *  -- benh
+ */
+#ifdef CONFIG_SMP
+static unsigned int steal_context_smp(unsigned int id)
+{
+	struct mm_struct *mm;
+	unsigned int cpu, max, i;
+
+	max = LAST_CONTEXT - FIRST_CONTEXT;
+
+	/* Attempt to free next_context first and then loop until we manage */
+	while (max--) {
+		/* Pick up the victim mm */
+		mm = context_mm[id];
+
+		/* We have a candidate victim, check if it's active, on SMP
+		 * we cannot steal active contexts
+		 */
+		if (mm->context.active) {
+			id++;
+			if (id > LAST_CONTEXT)
+				id = FIRST_CONTEXT;
+			continue;
+		}
+		pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+		/* Mark this mm as having no context anymore */
+		mm->context.id = MMU_NO_CONTEXT;
+
+		/* Mark it stale on all CPUs that used this mm. For threaded
+		 * implementations, we set it on all threads on each core
+		 * represented in the mask. A future implementation will use
+		 * a core map instead but this will do for now.
+		 */
+		for_each_cpu(cpu, mm_cpumask(mm)) {
+			for (i = cpu_first_thread_sibling(cpu);
+			     i <= cpu_last_thread_sibling(cpu); i++) {
+				if (stale_map[i])
+					__set_bit(id, stale_map[i]);
+			}
+			cpu = i - 1;
+		}
+		return id;
+	}
+
+	/* This will happen if you have more CPUs than available contexts,
+	 * all we can do here is wait a bit and try again
+	 */
+	raw_spin_unlock(&context_lock);
+	cpu_relax();
+	raw_spin_lock(&context_lock);
+
+	/* This will cause the caller to try again */
+	return MMU_NO_CONTEXT;
+}
+#endif  /* CONFIG_SMP */
+
+static unsigned int steal_all_contexts(void)
+{
+	struct mm_struct *mm;
+#ifdef CONFIG_SMP
+	int cpu = smp_processor_id();
+#endif
+	unsigned int id;
+
+	for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+		/* Pick up the victim mm */
+		mm = context_mm[id];
+
+		pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+		/* Mark this mm as having no context anymore */
+		mm->context.id = MMU_NO_CONTEXT;
+		if (id != FIRST_CONTEXT) {
+			context_mm[id] = NULL;
+			__clear_bit(id, context_map);
+#ifdef DEBUG_MAP_CONSISTENCY
+			mm->context.active = 0;
+#endif
+		}
+#ifdef CONFIG_SMP
+		__clear_bit(id, stale_map[cpu]);
+#endif
+	}
+
+	/* Flush the TLB for all contexts (not to be used on SMP) */
+	_tlbil_all();
+
+	nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT;
+
+	return FIRST_CONTEXT;
+}
+
+/* Note that this will also be called on SMP if all other CPUs are
+ * offlined, which means that it may be called for cpu != 0. For
+ * this to work, we somewhat assume that CPUs that are onlined
+ * come up with a fully clean TLB (or are cleaned when offlined)
+ */
+static unsigned int steal_context_up(unsigned int id)
+{
+	struct mm_struct *mm;
+#ifdef CONFIG_SMP
+	int cpu = smp_processor_id();
+#endif
+
+	/* Pick up the victim mm */
+	mm = context_mm[id];
+
+	pr_hardcont(" | steal %d from 0x%p", id, mm);
+
+	/* Flush the TLB for that context */
+	local_flush_tlb_mm(mm);
+
+	/* Mark this mm as having no context anymore */
+	mm->context.id = MMU_NO_CONTEXT;
+
+	/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+#ifdef CONFIG_SMP
+	__clear_bit(id, stale_map[cpu]);
+#endif
+
+	return id;
+}
+
+#ifdef DEBUG_MAP_CONSISTENCY
+static void context_check_map(void)
+{
+	unsigned int id, nrf, nact;
+
+	nrf = nact = 0;
+	for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+		int used = test_bit(id, context_map);
+		if (!used)
+			nrf++;
+		if (used != (context_mm[id] != NULL))
+			pr_err("MMU: Context %d is %s and MM is %p !\n",
+			       id, used ? 
"used" : "free", context_mm[id]); + if (context_mm[id] != NULL) + nact += context_mm[id]->context.active; + } + if (nrf != nr_free_contexts) { + pr_err("MMU: Free context count out of sync ! (%d vs %d)\n", + nr_free_contexts, nrf); + nr_free_contexts = nrf; + } + if (nact > num_online_cpus()) + pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n", + nact, num_online_cpus()); + if (FIRST_CONTEXT > 0 && !test_bit(0, context_map)) + pr_err("MMU: Context 0 has been freed !!!\n"); +} +#else +static void context_check_map(void) { } +#endif + +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned int id; +#ifdef CONFIG_SMP + unsigned int i, cpu = smp_processor_id(); +#endif + unsigned long *map; + + /* No lockless fast path .. yet */ + raw_spin_lock(&context_lock); + + pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", + cpu, next, next->context.active, next->context.id); + +#ifdef CONFIG_SMP + /* Mark us active and the previous one not anymore */ + next->context.active++; + if (prev) { + pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); + WARN_ON(prev->context.active < 1); + prev->context.active--; + } + + again: +#endif /* CONFIG_SMP */ + + /* If we already have a valid assigned context, skip all that */ + id = next->context.id; + if (likely(id != MMU_NO_CONTEXT)) { +#ifdef DEBUG_MAP_CONSISTENCY + if (context_mm[id] != next) + pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", + next, id, id, context_mm[id]); +#endif + goto ctxt_ok; + } + + /* We really don't have a context, let's try to acquire one */ + id = next_context; + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; + map = context_map; + + /* No more free contexts, let's try to steal one */ + if (nr_free_contexts == 0) { +#ifdef CONFIG_SMP + if (num_online_cpus() > 1) { + id = steal_context_smp(id); + if (id == MMU_NO_CONTEXT) + goto again; + goto stolen; + } +#endif /* CONFIG_SMP */ + if (IS_ENABLED(CONFIG_PPC_8xx)) + id = steal_all_contexts(); + else + id = steal_context_up(id); + goto stolen; + } + nr_free_contexts--; + + /* We know there's at least one free context, try to find it */ + while (__test_and_set_bit(id, map)) { + id = find_next_zero_bit(map, LAST_CONTEXT+1, id); + if (id > LAST_CONTEXT) + id = FIRST_CONTEXT; + } + stolen: + next_context = id + 1; + context_mm[id] = next; + next->context.id = id; + pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); + + context_check_map(); + ctxt_ok: + + /* If that context got marked stale on this CPU, then flush the + * local TLB for it and unmark it before we use it + */ +#ifdef CONFIG_SMP + if (test_bit(id, stale_map[cpu])) { + pr_hardcont(" | stale flush %d [%d..%d]", + id, cpu_first_thread_sibling(cpu), + cpu_last_thread_sibling(cpu)); + + local_flush_tlb_mm(next); + + /* XXX This clear should ultimately be part of local_flush_tlb_mm */ + for (i = cpu_first_thread_sibling(cpu); + i <= cpu_last_thread_sibling(cpu); i++) { + if (stale_map[i]) + __clear_bit(id, stale_map[i]); + } + } +#endif + + /* Flick the MMU and release lock */ + pr_hardcont(" -> %d\n", id); + set_context(id, next->pgd); + raw_spin_unlock(&context_lock); +} + +/* + * Set up the context for a new address space. + */ +int init_new_context(struct task_struct *t, struct mm_struct *mm) +{ + pr_hard("initing context for mm @%p\n", mm); + + /* + * We have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. 
This ensures that we properly
+ * initialize context slice details for newly allocated mm's (which will
+ * have id == 0) and don't alter context slice inherited via fork (which
+ * will have id != 0).
+ */
+	if (mm->context.id == 0)
+		slice_init_new_context_exec(mm);
+	mm->context.id = MMU_NO_CONTEXT;
+	mm->context.active = 0;
+	pte_frag_set(&mm->context, NULL);
+	return 0;
+}
+
+/*
+ * We're finished using the context for an address space.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	unsigned long flags;
+	unsigned int id;
+
+	if (mm->context.id == MMU_NO_CONTEXT)
+		return;
+
+	WARN_ON(mm->context.active != 0);
+
+	raw_spin_lock_irqsave(&context_lock, flags);
+	id = mm->context.id;
+	if (id != MMU_NO_CONTEXT) {
+		__clear_bit(id, context_map);
+		mm->context.id = MMU_NO_CONTEXT;
+#ifdef DEBUG_MAP_CONSISTENCY
+		mm->context.active = 0;
+#endif
+		context_mm[id] = NULL;
+		nr_free_contexts++;
+	}
+	raw_spin_unlock_irqrestore(&context_lock, flags);
+}
+
+#ifdef CONFIG_SMP
+static int mmu_ctx_cpu_prepare(unsigned int cpu)
+{
+	/* We don't touch CPU 0 map, it's allocated at boot and kept
+	 * around forever
+	 */
+	if (cpu == boot_cpuid)
+		return 0;
+
+	pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
+	stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+	return 0;
+}
+
+static int mmu_ctx_cpu_dead(unsigned int cpu)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+	if (cpu == boot_cpuid)
+		return 0;
+
+	pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
+	kfree(stale_map[cpu]);
+	stale_map[cpu] = NULL;
+
+	/* We also clear the cpu_vm_mask bits of CPUs going away */
+	clear_tasks_mm_cpumask(cpu);
+#endif
+	return 0;
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init mmu_context_init(void)
+{
+	/* Mark init_mm as being active on all possible CPUs since
+	 * we'll get called with prev == init_mm the first time
+	 * we schedule on a given CPU
+	 */
+	init_mm.context.active = NR_CPUS;
+
+	/*
+	 * Allocate the maps used by context management
+	 */
+	context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+	if (!context_map)
+		panic("%s: Failed to allocate %zu bytes\n", __func__,
+		      CTX_MAP_SIZE);
+	context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1),
+				    SMP_CACHE_BYTES);
+	if (!context_mm)
+		panic("%s: Failed to allocate %zu bytes\n", __func__,
+		      sizeof(void *) * (LAST_CONTEXT + 1));
+#ifdef CONFIG_SMP
+	stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+	if (!stale_map[boot_cpuid])
+		panic("%s: Failed to allocate %zu bytes\n", __func__,
+		      CTX_MAP_SIZE);
+
+	cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
+				  "powerpc/mmu/ctx:prepare",
+				  mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead);
+#endif
+
+	printk(KERN_INFO
+	       "MMU: Allocated %zu bytes of context maps for %d contexts\n",
+	       2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)),
+	       LAST_CONTEXT - FIRST_CONTEXT + 1);
+
+	/*
+	 * Some processors have too few contexts to reserve one for
+	 * init_mm, and require using context 0 for a normal task.
+	 * Other processors reserve the use of context zero for the kernel.
+	 * This code assumes FIRST_CONTEXT < 32.
+	 */
+	context_map[0] = (1 << FIRST_CONTEXT) - 1;
+	next_context = FIRST_CONTEXT;
+	nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
+}
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
new file mode 100644
index 000000000000..704e613a0b14
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -0,0 +1,810 @@
+/*
+ * This file contains the routines for TLB flushing. 
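
A note on the reservation a few lines back, as worked arithmetic grounded in this file's defines: with FIRST_CONTEXT == 1, (1 << 1) - 1 == 0x1 permanently marks id 0 as allocated, the "context zero reserved for the kernel" case; a port that needed context 0 for ordinary tasks would define FIRST_CONTEXT as 0 and the same expression would reserve nothing:

	context_map[0] = (1 << FIRST_CONTEXT) - 1;	/* 1 -> 0x1, 0 -> 0x0 */
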
+ * On machines where the MMU does not use a hash table to store virtual to
+ * physical translations (i.e., SW-loaded TLBs or Book3E-compliant
+ * processors; this does -not- include the 603, which shares the
+ * implementation with hash-based processors)
+ *
+ *  -- BenH
+ *
+ * Copyright 2008,2009 Ben Herrenschmidt
+ *                     IBM Corp.
+ *
+ * Derived from arch/ppc/mm/init.c:
+ *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *   Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ *   Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/*
+ * This struct lists the SW-supported page sizes. The hardware MMU may support
+ * other sizes not listed here. The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx)
+#ifdef CONFIG_PPC_FSL_BOOK3E
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.enc	= BOOK3E_PAGESZ_4K,
+	},
+	[MMU_PAGE_2M] = {
+		.shift	= 21,
+		.enc	= BOOK3E_PAGESZ_2M,
+	},
+	[MMU_PAGE_4M] = {
+		.shift	= 22,
+		.enc	= BOOK3E_PAGESZ_4M,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.enc	= BOOK3E_PAGESZ_16M,
+	},
+	[MMU_PAGE_64M] = {
+		.shift	= 26,
+		.enc	= BOOK3E_PAGESZ_64M,
+	},
+	[MMU_PAGE_256M] = {
+		.shift	= 28,
+		.enc	= BOOK3E_PAGESZ_256M,
+	},
+	[MMU_PAGE_1G] = {
+		.shift	= 30,
+		.enc	= BOOK3E_PAGESZ_1GB,
+	},
+};
+#elif defined(CONFIG_PPC_8xx)
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	/* we only manage 4k and 16k pages as normal pages */
+#ifdef CONFIG_PPC_4K_PAGES
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+	},
+#else
+	[MMU_PAGE_16K] = {
+		.shift	= 14,
+	},
+#endif
+	[MMU_PAGE_512K] = {
+		.shift	= 19,
+	},
+	[MMU_PAGE_8M] = {
+		.shift	= 23,
+	},
+};
+#else
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.ind	= 20,
+		.enc	= BOOK3E_PAGESZ_4K,
+	},
+	[MMU_PAGE_16K] = {
+		.shift	= 14,
+		.enc	= BOOK3E_PAGESZ_16K,
+	},
+	[MMU_PAGE_64K] = {
+		.shift	= 16,
+		.ind	= 28,
+		.enc	= BOOK3E_PAGESZ_64K,
+	},
+	[MMU_PAGE_1M] = {
+		.shift	= 20,
+		.enc	= BOOK3E_PAGESZ_1M,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.ind	= 36,
+		.enc	= BOOK3E_PAGESZ_16M,
+	},
+	[MMU_PAGE_256M] = {
+		.shift	= 28,
+		.enc	= BOOK3E_PAGESZ_256M,
+	},
+	[MMU_PAGE_1G] = {
+		.shift	= 30,
+		.enc	= BOOK3E_PAGESZ_1GB,
+	},
+};
+#endif /* CONFIG_PPC_FSL_BOOK3E */
+
+static inline int mmu_get_tsize(int psize)
+{
+	return mmu_psize_defs[psize].enc;
+}
+#else
+static inline int mmu_get_tsize(int psize)
+{
+	/* This isn't used on !Book3E for now */
+	return 0;
+}
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+/* The variables below are currently only used on 64-bit Book3E
+ * though this will probably be made common with other nohash
+ * implementations at some point
+ */
+#ifdef CONFIG_PPC64
+
+int mmu_linear_psize;	/* Page size used for the linear mapping */
+int mmu_pte_psize;	/* Page size used for PTE pages */
+int mmu_vmemmap_psize;	/* Page size used for the virtual mem map */
+int book3e_htw_mode;	/* HW tablewalk? Value is PPC_HTW_* */
+unsigned long linear_map_top;	/* Top of linear mapping */
+
+
+/*
+ * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug
+ * exceptions. This is used for bolted and e6500 TLB miss handlers which
+ * do not modify this SPRG in the TLB miss code; for other TLB miss handlers,
+ * this is set to zero.
+ */
+int extlb_level_exc;
+
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */
+DEFINE_PER_CPU(int, next_tlbcam_idx);
+EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx);
+#endif
+
+/*
+ * Base TLB flushing operations:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context's TLBs
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ *  - local_* variants of page and mm only apply to the current
+ *    processor
+ */
+
+/*
+ * These are the base non-SMP variants of page and mm flushing
+ */
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_pid(pid);
+	preempt_enable();
+}
+EXPORT_SYMBOL(local_flush_tlb_mm);
+
+void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
+			    int tsize, int ind)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm ? mm->context.id : 0;
+	if (pid != MMU_NO_CONTEXT)
+		_tlbil_va(vmaddr, pid, tsize, ind);
+	preempt_enable();
+}
+
+void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	__local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
+			       mmu_get_tsize(mmu_virtual_psize), 0);
+}
+EXPORT_SYMBOL(local_flush_tlb_page);
+
+/*
+ * And here are the SMP non-local implementations
+ */
+#ifdef CONFIG_SMP
+
+static DEFINE_RAW_SPINLOCK(tlbivax_lock);
+
+struct tlb_flush_param {
+	unsigned long addr;
+	unsigned int pid;
+	unsigned int tsize;
+	unsigned int ind;
+};
+
+static void do_flush_tlb_mm_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_pid(p ? p->pid : 0);
+}
+
+static void do_flush_tlb_page_ipi(void *param)
+{
+	struct tlb_flush_param *p = param;
+
+	_tlbil_va(p->addr, p->pid, p->tsize, p->ind);
+}
+
+
+/* Note on invalidations and PID:
+ *
+ * We snapshot the PID with preempt disabled. At this point, it can still
+ * change either because:
+ * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
+ * - we are invalidating some target that isn't currently running here
+ *   and is concurrently acquiring a new PID on another CPU
+ * - some other CPU is re-acquiring a lost PID for this mm
+ * etc...
+ *
+ * However, this shouldn't be a problem as we only guarantee
+ * invalidation of TLB entries present prior to this call, so we
+ * don't care about the PID changing, and invalidating a stale PID
+ * is generally harmless.
+ */
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	unsigned int pid;
+
+	preempt_disable();
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		goto no_context;
+	if (!mm_is_core_local(mm)) {
+		struct tlb_flush_param p = { .pid = pid };
+		/* Ignores smp_processor_id() even if set. 
*/ + smp_call_function_many(mm_cpumask(mm), + do_flush_tlb_mm_ipi, &p, 1); + } + _tlbil_pid(pid); + no_context: + preempt_enable(); +} +EXPORT_SYMBOL(flush_tlb_mm); + +void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) +{ + struct cpumask *cpu_mask; + unsigned int pid; + + /* + * This function as well as __local_flush_tlb_page() must only be called + * for user contexts. + */ + if (WARN_ON(!mm)) + return; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto bail; + cpu_mask = mm_cpumask(mm); + if (!mm_is_core_local(mm)) { + /* If broadcast tlbivax is supported, use it */ + if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { + int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); + if (lock) + raw_spin_lock(&tlbivax_lock); + _tlbivax_bcast(vmaddr, pid, tsize, ind); + if (lock) + raw_spin_unlock(&tlbivax_lock); + goto bail; + } else { + struct tlb_flush_param p = { + .pid = pid, + .addr = vmaddr, + .tsize = tsize, + .ind = ind, + }; + /* Ignores smp_processor_id() even if set in cpu_mask */ + smp_call_function_many(cpu_mask, + do_flush_tlb_page_ipi, &p, 1); + } + } + _tlbil_va(vmaddr, pid, tsize, ind); + bail: + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) + flush_hugetlb_page(vma, vmaddr); +#endif + + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_tsize(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(flush_tlb_page); + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PPC_47x +void __init early_init_mmu_47x(void) +{ +#ifdef CONFIG_SMP + unsigned long root = of_get_flat_dt_root(); + if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) + mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); +#endif /* CONFIG_SMP */ +} +#endif /* CONFIG_PPC_47x */ + +/* + * Flush kernel TLB entries in the given range + */ +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ +#ifdef CONFIG_SMP + preempt_disable(); + smp_call_function(do_flush_tlb_mm_ipi, NULL, 1); + _tlbil_pid(0); + preempt_enable(); +#else + _tlbil_pid(0); +#endif +} +EXPORT_SYMBOL(flush_tlb_kernel_range); + +/* + * Currently, for range flushing, we just do a full mm flush. 
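
As a usage illustration (a sketch; vma, uaddr and kaddr are hypothetical variables): user mappings take the targeted per-page or per-mm paths above, while kernel mappings always go through the global PID 0 flush:

	flush_tlb_page(vma, uaddr);		/* one user page, IPI or tlbivax */
	flush_tlb_mm(vma->vm_mm);		/* whole user context */
	flush_tlb_kernel_range(kaddr, kaddr + PAGE_SIZE); /* kernel, PID 0 */
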
This should
+ * be optimized based on a threshold on the size of the range, since
+ * some implementations can stack multiple tlbivax before a tlbsync but
+ * for now, we keep it that way
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+
+{
+	if (end - start == PAGE_SIZE && !(start & ~PAGE_MASK))
+		flush_tlb_page(vma, start);
+	else
+		flush_tlb_mm(vma->vm_mm);
+}
+EXPORT_SYMBOL(flush_tlb_range);
+
+void tlb_flush(struct mmu_gather *tlb)
+{
+	flush_tlb_mm(tlb->mm);
+}
+
+/*
+ * Below are functions specific to the 64-bit variant of Book3E though that
+ * may change in the future
+ */
+
+#ifdef CONFIG_PPC64
+
+/*
+ * Handling of virtual linear page tables or indirect TLB entries
+ * flushing when PTE pages are freed
+ */
+void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
+{
+	int tsize = mmu_psize_defs[mmu_pte_psize].enc;
+
+	if (book3e_htw_mode != PPC_HTW_NONE) {
+		unsigned long start = address & PMD_MASK;
+		unsigned long end = address + PMD_SIZE;
+		unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
+
+		/* This isn't the most optimal, ideally we would factor out the
+		 * whole preempt & CPU mask mucking around, or even the IPI but
+		 * it will do for now
+		 */
+		while (start < end) {
+			__flush_tlb_page(tlb->mm, start, tsize, 1);
+			start += size;
+		}
+	} else {
+		unsigned long rmask = 0xf000000000000000ul;
+		unsigned long rid = (address & rmask) | 0x1000000000000000ul;
+		unsigned long vpte = address & ~rmask;
+
+#ifdef CONFIG_PPC_64K_PAGES
+		vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
+#else
+		vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
+#endif
+		vpte |= rid;
+		__flush_tlb_page(tlb->mm, vpte, tsize, 0);
+	}
+}
+
+static void setup_page_sizes(void)
+{
+	unsigned int tlb0cfg;
+	unsigned int tlb0ps;
+	unsigned int eptcfg;
+	int i, psize;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	unsigned int mmucfg = mfspr(SPRN_MMUCFG);
+	int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E);
+
+	if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) {
+		unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG);
+		unsigned int min_pg, max_pg;
+
+		min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT;
+		max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT;
+
+		for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+			struct mmu_psize_def *def;
+			unsigned int shift;
+
+			def = &mmu_psize_defs[psize];
+			shift = def->shift;
+
+			if (shift == 0 || shift & 1)
+				continue;
+
+			/* adjust to be in terms of 4^shift Kb */
+			shift = (shift - 10) >> 1;
+
+			if ((shift >= min_pg) && (shift <= max_pg))
+				def->flags |= MMU_PAGE_SIZE_DIRECT;
+		}
+
+		goto out;
+	}
+
+	if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
+		u32 tlb1cfg, tlb1ps;
+
+		tlb0cfg = mfspr(SPRN_TLB0CFG);
+		tlb1cfg = mfspr(SPRN_TLB1CFG);
+		tlb1ps = mfspr(SPRN_TLB1PS);
+		eptcfg = mfspr(SPRN_EPTCFG);
+
+		if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+			book3e_htw_mode = PPC_HTW_E6500;
+
+		/*
+		 * We expect 4K subpage size and unrestricted indirect size.
+		 * The lack of a restriction on indirect size is a Freescale
+		 * extension, indicated by PSn = 0 but SPSn != 0. 
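
A worked example for the MAV 1.0 conversion above: a 4 MB page has shift 22, (22 - 10) >> 1 = 6, and 4^6 KB = 4 MB, so the size is direct-capable whenever 6 falls within [min_pg, max_pg]; odd shifts are skipped because they have no power-of-4 representation. As a sketch (hypothetical helper name):

	static unsigned int shift_to_mav1_size(unsigned int shift)
	{
		return (shift - 10) >> 1;	/* 22 -> 6, since 4^6 KB = 4 MB */
	}
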
+ */ + if (eptcfg != 2) + book3e_htw_mode = PPC_HTW_NONE; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (!def->shift) + continue; + + if (tlb1ps & (1U << (def->shift - 10))) { + def->flags |= MMU_PAGE_SIZE_DIRECT; + + if (book3e_htw_mode && psize == MMU_PAGE_2M) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + } + } + + goto out; + } +#endif + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb0ps = mfspr(SPRN_TLB0PS); + eptcfg = mfspr(SPRN_EPTCFG); + + /* Look for supported direct sizes */ + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (tlb0ps & (1U << (def->shift - 10))) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + /* Indirect page sizes supported ? */ + if ((tlb0cfg & TLBnCFG_IND) == 0 || + (tlb0cfg & TLBnCFG_PT) == 0) + goto out; + + book3e_htw_mode = PPC_HTW_IBM; + + /* Now, we only deal with one IND page size for each + * direct size. Hopefully all implementations today are + * unambiguous, but we might want to be careful in the + * future. + */ + for (i = 0; i < 3; i++) { + unsigned int ps, sps; + + sps = eptcfg & 0x1f; + eptcfg >>= 5; + ps = eptcfg & 0x1f; + eptcfg >>= 5; + if (!ps || !sps) + continue; + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (ps == (def->shift - 10)) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + if (sps == (def->shift - 10)) + def->ind = ps + 10; + } + } + +out: + /* Cleanup array and print summary */ + pr_info("MMU: Supported page sizes\n"); + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + const char *__page_type_names[] = { + "unsupported", + "direct", + "indirect", + "direct & indirect" + }; + if (def->flags == 0) { + def->shift = 0; + continue; + } + pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), + __page_type_names[def->flags & 0x3]); + } +} + +static void setup_mmu_htw(void) +{ + /* + * If we want to use HW tablewalk, enable it by patching the TLB miss + * handlers to branch to the one dedicated to it. + */ + + switch (book3e_htw_mode) { + case PPC_HTW_IBM: + patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); + break; +#ifdef CONFIG_PPC_FSL_BOOK3E + case PPC_HTW_E6500: + extlb_level_exc = EX_TLB_SIZE; + patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); + break; +#endif + } + pr_info("MMU: Book3E HW tablewalk %s\n", + book3e_htw_mode != PPC_HTW_NONE ? 
"enabled" : "not supported"); +} + +/* + * Early initialization of the MMU TLB code + */ +static void early_init_this_mmu(void) +{ + unsigned int mas4; + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + switch (book3e_htw_mode) { + case PPC_HTW_E6500: + mas4 |= MAS4_INDD; + mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; + mas4 |= MAS4_TLBSELD(1); + mmu_pte_psize = MMU_PAGE_2M; + break; + + case PPC_HTW_IBM: + mas4 |= MAS4_INDD; +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_256M; +#else + mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_1M; +#endif + break; + + case PPC_HTW_NONE: +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; +#else + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; +#endif + mmu_pte_psize = mmu_virtual_psize; + break; + } + mtspr(SPRN_MAS4, mas4); + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + unsigned int num_cams; + int __maybe_unused cpu = smp_processor_id(); + bool map = true; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + /* + * Only do the mapping once per core, or else the + * transient mapping would cause problems. + */ +#ifdef CONFIG_SMP + if (hweight32(get_tensr()) > 1) + map = false; +#endif + + if (map) + linear_map_top = map_mem_in_cams(linear_map_top, + num_cams, false); + } +#endif + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} + +static void __init early_init_mmu_global(void) +{ + /* XXX This will have to be decided at runtime, but right + * now our boot and TLB miss code hard wires it. Ideally + * we should find out a suitable page size and patch the + * TLB miss code (either that or use the PACA to store + * the value we want) + */ + mmu_linear_psize = MMU_PAGE_1G; + + /* XXX This should be decided at runtime based on supported + * page sizes in the TLB, but for now let's assume 16M is + * always there and a good fit (which it probably is) + * + * Freescale booke only supports 4K pages in TLB0, so use that. + */ + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) + mmu_vmemmap_psize = MMU_PAGE_4K; + else + mmu_vmemmap_psize = MMU_PAGE_16M; + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + /* Look for supported page sizes */ + setup_page_sizes(); + + /* Look for HW tablewalk support */ + setup_mmu_htw(); + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + if (book3e_htw_mode == PPC_HTW_NONE) { + extlb_level_exc = EX_TLB_SIZE; + patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); + patch_exception(0x1e0, + exc_instruction_tlb_miss_bolted_book3e); + } + } +#endif + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = memblock_end_of_DRAM(); +} + +static void __init early_mmu_set_memory_limit(void) +{ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + /* + * Limit memory so we dont have linear faults. + * Unlike memblock_set_current_limit, which limits + * memory available during early boot, this permanently + * reduces the memory available to Linux. We need to + * do this because highmem is not supported on 64-bit. 
+
+/* boot cpu only */
+void __init early_init_mmu(void)
+{
+	early_init_mmu_global();
+	early_init_this_mmu();
+	early_mmu_set_memory_limit();
+}
+
+void early_init_mmu_secondary(void)
+{
+	early_init_this_mmu();
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				phys_addr_t first_memblock_size)
+{
+	/* On non-FSL Embedded 64-bit, we adjust the RMA size to match
+	 * the bolted TLB entry. We know for now that only 1G
+	 * entries are supported, though that may eventually change.
+	 *
+	 * On FSL Embedded 64-bit, usually all RAM is bolted, but with
+	 * unusual memory sizes it's possible for some RAM to not be mapped
+	 * (such RAM is not used at all by Linux, since we don't support
+	 * highmem on 64-bit). We limit ppc64_rma_size to what would be
+	 * mappable if this memblock is the only one. Additional memblocks
+	 * can only increase, not decrease, the amount that ends up getting
+	 * mapped. We still limit max to 1G even if we'll eventually map
+	 * more. This is due to what the early init code is set up to do.
+	 *
+	 * We crop it to the size of the first MEMBLOCK to
+	 * avoid going over total available memory just in case...
+	 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+		unsigned long linear_sz;
+		unsigned int num_cams;
+
+		/* use a quarter of the TLBCAM for bolted linear map */
+		num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+		linear_sz = map_mem_in_cams(first_memblock_size, num_cams,
+					    true);
+
+		ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
+	} else
+#endif
+		ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
+
+	/* Finally limit subsequent allocations */
+	memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
+}
+#else /* ! CONFIG_PPC64 */
+void __init early_init_mmu(void)
+{
+#ifdef CONFIG_PPC_47x
+	early_init_mmu_47x();
+#endif
+
+#ifdef CONFIG_PPC_MM_SLICES
+#if defined(CONFIG_PPC_8xx)
+	init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW;
+#endif
+#endif
+}
+#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S
new file mode 100644
index 000000000000..e066a658acac
--- /dev/null
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -0,0 +1,491 @@
+/*
+ * This file contains low-level functions for performing various
+ * types of TLB invalidations on various processors with no hash
+ * table.
+ *
+ * This file implements the following functions for all no-hash
+ * processors. Some aren't implemented for some variants. Some
+ * are inline in tlbflush.h
+ *
+ *	- tlbil_va
+ *	- tlbil_pid
+ *	- tlbil_all
+ *	- tlbivax_bcast
+ *
+ * Code mostly moved over from misc_32.S
+ *
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if defined(CONFIG_40x)
+
+/*
+ * 40x implementation needs only tlbil_va
+ */
+_GLOBAL(__tlbil_va)
+	/* We run the search with interrupts disabled because we have to change
+	 * the PID and I don't want to preempt when that happens.
+	 */
+	mfmsr	r5
+	mfspr	r6,SPRN_PID
+	wrteei	0
+	mtspr	SPRN_PID,r4
+	tlbsx.	r3, 0, r3
+	mtspr	SPRN_PID,r6
+	wrtee	r5
+	bne	1f
+	sync
+	/* There are only 64 TLB entries, so r3 < 64, which means bit 25 is
+	 * clear. Since 25 is the V bit in the TLB_TAG, loading this value
+	 * will invalidate the TLB entry. */
+	tlbwe	r3, r3, TLB_TAG
+	isync
+1:	blr
+
+#elif defined(CONFIG_PPC_8xx)
+
+/*
+ * Nothing to do for 8xx, everything is inline
+ */
+
+#elif defined(CONFIG_44x) /* Includes 47x */
+
+/*
+ * 440 implementation uses tlbsx/we for tlbil_va and a full sweep
+ * of the TLB for everything else.
+ */
+_GLOBAL(__tlbil_va)
+	mfspr	r5,SPRN_MMUCR
+	mfmsr	r10
+
+	/*
+	 * We write 16 bits of STID since 47x supports that much, we
+	 * will never be passed out-of-bounds values on 440 (hopefully)
+	 */
+	rlwimi	r5,r4,0,16,31
+
+	/* We have to run the search with interrupts disabled, otherwise
+	 * an interrupt which causes a TLB miss can clobber the MMUCR
+	 * between the mtspr and the tlbsx.
+	 *
+	 * Critical and Machine Check interrupts take care of saving
+	 * and restoring MMUCR, so only normal interrupts have to be
+	 * taken care of.
+	 */
+	wrteei	0
+	mtspr	SPRN_MMUCR,r5
+	tlbsx.	r6,0,r3
+	bne	10f
+	sync
+BEGIN_MMU_FTR_SECTION
+	b	2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+	/* On 440 there are only 64 TLB entries, so r3 < 64, which means bit
+	 * 22 is clear. Since 22 is the V bit in the TLB_PAGEID, loading this
+	 * value will invalidate the TLB entry.
+	 */
+	tlbwe	r6,r6,PPC44x_TLB_PAGEID
+	isync
+10:	wrtee	r10
+	blr
+2:
+#ifdef CONFIG_PPC_47x
+	oris	r7,r6,0x8000	/* specify way explicitly */
+	clrrwi	r4,r3,12	/* get an EPN for the hashing with V = 0 */
+	ori	r4,r4,PPC47x_TLBE_SIZE
+	tlbwe	r4,r7,0		/* write it */
+	isync
+	wrtee	r10
+	blr
+#else /* CONFIG_PPC_47x */
+1:	trap
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+#endif /* !CONFIG_PPC_47x */
+
+_GLOBAL(_tlbil_all)
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+	b	2f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
+	li	r3,0
+	sync
+
+	/* Load high watermark */
+	lis	r4,tlb_44x_hwater@ha
+	lwz	r5,tlb_44x_hwater@l(r4)
+
+1:	tlbwe	r3,r3,PPC44x_TLB_PAGEID
+	addi	r3,r3,1
+	cmpw	0,r3,r5
+	ble	1b
+
+	isync
+	blr
+2:
+#ifdef CONFIG_PPC_47x
+	/* 476 variant. There's no simple way to do this, hopefully we'll
+	 * try to limit the amount of such full invalidates
+	 */
+	mfmsr	r11		/* Interrupts off */
+	wrteei	0
+	li	r3,-1		/* Current set */
+	lis	r10,tlb_47x_boltmap@h
+	ori	r10,r10,tlb_47x_boltmap@l
+	lis	r7,0x8000	/* Specify way explicitly */
+
+	b	9f		/* For each set */
+
+1:	li	r9,4		/* Number of ways */
+	li	r4,0		/* Current way */
+	li	r6,0		/* Default entry value 0 */
+	andi.	r0,r8,1		/* Check if way 0 is bolted */
+	mtctr	r9		/* Load way counter */
+	bne-	3f		/* Bolted, skip loading it */
+
+2:	/* For each way */
+	or	r5,r3,r4	/* Make way|index for tlbre */
+	rlwimi	r5,r5,16,8,15	/* Copy index into position */
+	tlbre	r6,r5,0		/* Read entry */
+3:	addis	r4,r4,0x2000	/* Next way */
+	andi.	r0,r6,PPC47x_TLB0_VALID /* Valid entry ? */
+	beq	4f		/* Nope, skip it */
+	rlwimi	r7,r5,0,1,2	/* Insert way number */
+	rlwinm	r6,r6,0,21,19	/* Clear V */
+	tlbwe	r6,r7,0		/* Write it */
+4:	bdnz	2b		/* Loop for each way */
+	srwi	r8,r8,1		/* Next boltmap bit */
+9:	cmpwi	cr1,r3,255	/* Last set done ? */
+	addi	r3,r3,1		/* Next set */
+	beq	cr1,1f		/* End of loop */
+	andi.	r0,r3,0x1f	/* Need to load a new boltmap word ? */
+	bne	1b		/* No, loop */
+	lwz	r8,0(r10)	/* Load boltmap entry */
+	addi	r10,r10,4	/* Next word */
+	b	1b		/* Then loop */
+1:	isync			/* Sync shadows */
+	wrtee	r11
+#else /* CONFIG_PPC_47x */
+1:	trap
+	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0;
+#endif /* !CONFIG_PPC_47x */
+	blr
+
+#ifdef CONFIG_PPC_47x
+
+/*
+ * _tlbivax_bcast is only on 47x. We don't bother doing a runtime
+ * check though, it will blow up soon enough if we mistakenly try
+ * to use it on a 440.
+ */
+_GLOBAL(_tlbivax_bcast)
+	mfspr	r5,SPRN_MMUCR
+	mfmsr	r10
+	rlwimi	r5,r4,0,16,31
+	wrteei	0
+	mtspr	SPRN_MMUCR,r5
+	isync
+	PPC_TLBIVAX(0, R3)
+	isync
+	eieio
+	tlbsync
+BEGIN_FTR_SECTION
+	b	1f
+END_FTR_SECTION_IFSET(CPU_FTR_476_DD2)
+	sync
+	wrtee	r10
+	blr
+/*
+ * DD2 HW could hang if an instruction fetch happens before msync completes.
+ * Touch enough instruction cache lines to ensure cache hits
+ */
+1:	mflr	r9
+	bl	2f
+2:	mflr	r6
+	li	r7,32
+	PPC_ICBT(0,R6,R7)	/* touch next cache line */
+	add	r6,r6,r7
+	PPC_ICBT(0,R6,R7)	/* touch next cache line */
+	add	r6,r6,r7
+	PPC_ICBT(0,R6,R7)	/* touch next cache line */
+	sync
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	mtlr	r9
+	wrtee	r10
+	blr
+#endif /* CONFIG_PPC_47x */
+
+#elif defined(CONFIG_FSL_BOOKE)
+/*
+ * FSL BookE implementations.
+ *
+ * Since feature sections are using _SECTION_ELSE we need
+ * to have the larger code path before the _SECTION_ELSE
+ */
+
+/*
+ * Flush MMU TLB on the local processor
+ */
+_GLOBAL(_tlbil_all)
+BEGIN_MMU_FTR_SECTION
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+MMU_FTR_SECTION_ELSE
+	PPC_TLBILX_ALL(0,R0)
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX)
+	msync
+	isync
+	blr
+
+_GLOBAL(_tlbil_pid)
+BEGIN_MMU_FTR_SECTION
+	slwi	r3,r3,16
+	mfmsr	r10
+	wrteei	0
+	mfspr	r4,SPRN_MAS6	/* save MAS6 */
+	mtspr	SPRN_MAS6,r3
+	PPC_TLBILX_PID(0,R0)
+	mtspr	SPRN_MAS6,r4	/* restore MAS6 */
+	wrtee	r10
+MMU_FTR_SECTION_ELSE
+	li	r3,(MMUCSR0_TLBFI)@l
+	mtspr	SPRN_MMUCSR0, r3
+1:
+	mfspr	r3,SPRN_MMUCSR0
+	andi.	r3,r3,MMUCSR0_TLBFI@l
+	bne	1b
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX)
+	msync
+	isync
+	blr
+
+/*
+ * Flush MMU TLB for a particular address, but only on the local processor
+ * (no broadcast)
+ */
+_GLOBAL(__tlbil_va)
+	mfmsr	r10
+	wrteei	0
+	slwi	r4,r4,16
+	ori	r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l
+	mtspr	SPRN_MAS6,r4		/* assume AS=0 for now */
+BEGIN_MMU_FTR_SECTION
+	tlbsx	0,r3
+	mfspr	r4,SPRN_MAS1		/* check valid */
+	andis.
r3,r4,MAS1_VALID@h + beq 1f + rlwinm r4,r4,0,1,31 + mtspr SPRN_MAS1,r4 + tlbwe +MMU_FTR_SECTION_ELSE + PPC_TLBILX_VA(0,R3) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) + msync + isync +1: wrtee r10 + blr +#elif defined(CONFIG_PPC_BOOK3E) +/* + * New Book3E (>= 2.06) implementation + * + * Note: We may be able to get away without the interrupt masking stuff + * if we save/restore MAS6 on exceptions that might modify it + */ +_GLOBAL(_tlbil_pid) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,R0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_pid_noind) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + ori r4,r4,MAS6_SIND + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,R0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_all) + PPC_TLBILX_ALL(0,R0) + msync + isync + blr + +_GLOBAL(_tlbil_va) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBILX_VA(0,R3) + msync + isync + wrtee r10 + blr + +_GLOBAL(_tlbivax_bcast) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBIVAX(0,R3) + eieio + tlbsync + sync + wrtee r10 + blr + +_GLOBAL(set_context) +#ifdef CONFIG_BDI_SWITCH + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is the second parameter. + */ + lis r5, abatron_pteptrs@h + ori r5, r5, abatron_pteptrs@l + stw r4, 0x4(r5) +#endif + mtspr SPRN_PID,r3 + isync /* Force context change */ + blr +#else +#error Unsupported processor type ! +#endif + +#if defined(CONFIG_PPC_FSL_BOOK3E) +/* + * extern void loadcam_entry(unsigned int index) + * + * Load TLBCAM[index] entry in to the L2 CAM MMU + * Must preserve r7, r8, r9, and r10 + */ +_GLOBAL(loadcam_entry) + mflr r5 + LOAD_REG_ADDR_PIC(r4, TLBCAM) + mtlr r5 + mulli r5,r3,TLBCAM_SIZE + add r3,r5,r4 + lwz r4,TLBCAM_MAS0(r3) + mtspr SPRN_MAS0,r4 + lwz r4,TLBCAM_MAS1(r3) + mtspr SPRN_MAS1,r4 + PPC_LL r4,TLBCAM_MAS2(r3) + mtspr SPRN_MAS2,r4 + lwz r4,TLBCAM_MAS3(r3) + mtspr SPRN_MAS3,r4 +BEGIN_MMU_FTR_SECTION + lwz r4,TLBCAM_MAS7(r3) + mtspr SPRN_MAS7,r4 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) + isync + tlbwe + isync + blr + +/* + * Load multiple TLB entries at once, using an alternate-space + * trampoline so that we don't have to care about whether the same + * TLB entry maps us before and after. + * + * r3 = first entry to write + * r4 = number of entries to write + * r5 = temporary tlb entry + */ +_GLOBAL(loadcam_multi) + mflr r8 + + /* + * Set up temporary TLB entry that is the same as what we're + * running from, but in AS=1. + */ + bl 1f +1: mflr r6 + tlbsx 0,r8 + mfspr r6,SPRN_MAS1 + ori r6,r6,MAS1_TS + mtspr SPRN_MAS1,r6 + mfspr r6,SPRN_MAS0 + rlwimi r6,r5,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK + mr r7,r5 + mtspr SPRN_MAS0,r6 + isync + tlbwe + isync + + /* Switch to AS=1 */ + mfmsr r6 + ori r6,r6,MSR_IS|MSR_DS + mtmsr r6 + isync + + mr r9,r3 + add r10,r3,r4 +2: bl loadcam_entry + addi r9,r9,1 + cmpw r9,r10 + mr r3,r9 + blt 2b + + /* Return to AS=0 and clear the temporary entry */ + mfmsr r6 + rlwinm. 
r6,r6,0,~(MSR_IS|MSR_DS) + mtmsr r6 + isync + + li r6,0 + mtspr SPRN_MAS1,r6 + rlwinm r6,r7,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK + oris r6,r6,MAS0_TLBSEL(1)@h + mtspr SPRN_MAS0,r6 + isync + tlbwe + isync + + mtlr r8 + blr +#endif diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S new file mode 100644 index 000000000000..9ed90064f542 --- /dev/null +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -0,0 +1,1280 @@ +/* + * Low level TLB miss handlers for Book3E + * + * Copyright (C) 2008-2009 + * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PPC_64K_PAGES +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) +#else +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) +#endif +#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) +#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) +#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) + +/********************************************************************** + * * + * TLB miss handling for Book3E with a bolted linear mapping * + * No virtual page table, no nested TLB misses * + * * + **********************************************************************/ + +/* + * Note that, unlike non-bolted handlers, TLB_EXFRAME is not + * modified by the TLB miss handlers themselves, since the TLB miss + * handler code will not itself cause a recursive TLB miss. + * + * TLB_EXFRAME will be modified when crit/mc/debug exceptions are + * entered/exited. + */ +.macro tlb_prolog_bolted intnum addr + mtspr SPRN_SPRG_GEN_SCRATCH,r12 + mfspr r12,SPRN_SPRG_TLB_EXFRAME + std r13,EX_TLB_R13(r12) + std r10,EX_TLB_R10(r12) + mfspr r13,SPRN_SPRG_PACA + + mfcr r10 + std r11,EX_TLB_R11(r12) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif + DO_KVM \intnum, SPRN_SRR1 + std r16,EX_TLB_R16(r12) + mfspr r16,\addr /* get faulting address */ + std r14,EX_TLB_R14(r12) + ld r14,PACAPGD(r13) + std r15,EX_TLB_R15(r12) + std r10,EX_TLB_CR(r12) +#ifdef CONFIG_PPC_FSL_BOOK3E +START_BTB_FLUSH_SECTION + mfspr r11, SPRN_SRR1 + andi. r10,r11,MSR_PR + beq 1f + BTB_FLUSH(r10) +1: +END_BTB_FLUSH_SECTION + std r7,EX_TLB_R7(r12) +#endif + TLB_MISS_PROLOG_STATS +.endm + +.macro tlb_epilog_bolted + ld r14,EX_TLB_CR(r12) +#ifdef CONFIG_PPC_FSL_BOOK3E + ld r7,EX_TLB_R7(r12) +#endif + ld r10,EX_TLB_R10(r12) + ld r11,EX_TLB_R11(r12) + ld r13,EX_TLB_R13(r12) + mtcr r14 + ld r14,EX_TLB_R14(r12) + ld r15,EX_TLB_R15(r12) + TLB_MISS_RESTORE_STATS + ld r16,EX_TLB_R16(r12) + mfspr r12,SPRN_SPRG_GEN_SCRATCH +.endm + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. 
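
As a quick host-side check of the shift counts quoted in this comment (constants copied from the comment itself, not from kernel headers):

	#include <assert.h>

	#define ESR_ST		0x00800000u	/* ESR store bit */
	#define PAGE_BAP_SW	0x00000010u	/* _PAGE_BAP_SW */
	#define PAGE_DIRTY	0x00001000u	/* _PAGE_DIRTY */

	int main(void)
	{
		assert((ESR_ST >> 19) == PAGE_BAP_SW);	/* ST lands on the SW bit */
		assert((ESR_ST >> 11) == PAGE_DIRTY);	/* ST lands on the DIRTY bit */
		return 0;
	}
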
+ * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + + mfspr r11,SPRN_ESR + + srdi r15,r16,60 /* get region */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */ + + rlwinm r10,r11,32-19,27,27 + rlwimi r10,r11,32-16,19,19 + cmpwi r15,0 /* user vs kernel check */ + ori r10,r10,_PAGE_PRESENT + oris r11,r10,_PAGE_ACCESSED@h + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_bolted + +tlb_miss_common_bolted: +/* + * This is the guts of the TLB miss handler for bolted-linear. + * We are entered with: + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq tlb_miss_fault_bolted /* No PGDIR, bail */ + +BEGIN_MMU_FTR_SECTION + /* Set the TLB reservation and search for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,R16) + ldx r14,r14,r15 /* grab pgd entry */ + beq tlb_miss_done_bolted /* tlb exists already, bail */ +MMU_FTR_SECTION_ELSE + ldx r14,r14,r15 /* grab pgd entry */ +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) + +#ifndef CONFIG_PPC_64K_PAGES + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ +#endif /* CONFIG_PPC_64K_PAGES */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab pmd entry */ + + rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */ + + /* Check if required permissions are met */ + andc. r15,r11,r14 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + bne- tlb_miss_fault_bolted + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + clrldi r15,r15,12 /* Clear crap at the top */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r11 + andi. r11,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) 
*/ + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: + mtspr SPRN_MAS7_MAS3,r15 + tlbwe + +tlb_miss_done_bolted: + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + tlb_epilog_bolted + rfi + +itlb_miss_kernel_bolted: + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h +tlb_miss_kernel_bolted: + mfspr r10,SPRN_MAS1 + ld r14,PACA_KERNELPGD(r13) + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ tlb_miss_common_bolted + +tlb_miss_fault_bolted: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX + bne itlb_miss_fault_bolted +dtlb_miss_fault_bolted: + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_bolted: + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_instruction_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + srdi r15,r16,60 /* get region */ + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne- itlb_miss_fault_bolted + + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + + cmpldi cr0,r15,0 /* Check for user region */ + oris r11,r11,_PAGE_ACCESSED@h + beq tlb_miss_common_bolted + b itlb_miss_kernel_bolted + +#ifdef CONFIG_PPC_FSL_BOOK3E +/* + * TLB miss handling for e6500 and derivatives, using hardware tablewalk. + * + * Linear mapping is bolted: no virtual page table or nested TLB misses + * Indirect entries in TLB1, hardware loads resulting direct entries + * into TLB0 + * No HES or NV hint on TLB1, so we need to do software round-robin + * No tlbsrx. so we need a spinlock, and we have to deal + * with MAS-damage caused by tlbsx + * 4K pages only + */ + + START_EXCEPTION(instruction_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + ori r16,r16,1 + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_e6500 /* user/kernel test */ + + b tlb_miss_common_e6500 + + START_EXCEPTION(data_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + rldicr r16,r16,0,62 + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_e6500 /* user vs kernel check */ + +/* + * This is the guts of the TLB miss handler for e6500 and derivatives. + * We are entered with: + * + * r16 = page of faulting address (low bit 0 if data, 1 if instruction) + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = tlb_per_core ptr + * r10 = crap (free to use) + * r7 = esel_next + */ +tlb_miss_common_e6500: + crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ + +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ + /* + * Search if we already have an indirect entry for that virtual + * address, and if we do, bail out. + * + * MAS6:IND should be already set based on MAS4 + */ + lhz r10,PACAPACAINDEX(r13) + addi r10,r10,1 + crclr cr1*4+eq /* set cr1.eq = 0 for non-recursive */ +1: lbarx r15,0,r11 + cmpdi r15,0 + bne 2f + stbcx. r10,0,r11 + bne 1b +3: + .subsection 1 +2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? 
*/ + beq cr1,3b /* unlock will happen if cr1.eq = 0 */ +10: lbz r15,0(r11) + cmpdi r15,0 + bne 10b + b 1b + .previous +END_FTR_SECTION_IFSET(CPU_FTR_SMT) + + lbz r7,TCD_ESEL_NEXT(r11) + +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ + /* + * Erratum A-008139 says that we can't use tlbwe to change + * an indirect entry in any way (including replacing or + * invalidating) if the other thread could be in the process + * of a lookup. The workaround is to invalidate the entry + * with tlbilx before overwriting. + */ + + rlwinm r10,r7,16,0xff0000 + oris r10,r10,MAS0_TLBSEL(1)@h + mtspr SPRN_MAS0,r10 + isync + tlbre + mfspr r15,SPRN_MAS1 + andis. r15,r15,MAS1_VALID@h + beq 5f + +BEGIN_FTR_SECTION_NESTED(532) + mfspr r10,SPRN_MAS8 + rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */ + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + mfspr r10,SPRN_MAS1 + rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */ + rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */ + mfspr r10,SPRN_MAS6 + mtspr SPRN_MAS6,r15 + + mfspr r15,SPRN_MAS2 + isync + tlbilxva 0,r15 + isync + + mtspr SPRN_MAS6,r10 + +5: +BEGIN_FTR_SECTION_NESTED(532) + li r10,0 + mtspr SPRN_MAS8,r10 + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + tlbsx 0,r16 + mfspr r10,SPRN_MAS1 + andis. r15,r10,MAS1_VALID@h + bne tlb_miss_done_e6500 +FTR_SECTION_ELSE + mfspr r10,SPRN_MAS1 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) + + oris r10,r10,MAS1_VALID@h + beq cr2,4f + rlwinm r10,r10,0,16,1 /* Clear TID */ +4: mtspr SPRN_MAS1,r10 + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- tlb_miss_fault_e6500 + + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq- tlb_miss_fault_e6500 /* No PGDIR, bail */ + ldx r14,r14,r15 /* grab pgd entry */ + + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 + ldx r14,r14,r15 /* Grab pmd entry */ + + mfspr r10,SPRN_MAS0 + cmpdi cr0,r14,0 + bge tlb_miss_huge_e6500 + + /* Now we build the MAS for a 2M indirect page: + * + * MAS 0 : ESEL needs to be filled by software round-robin + * MAS 1 : Fully set up + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * - TID already cleared if necessary + * MAS 2 : Default not 2M-aligned, need to be redone + * MAS 3+7 : Needs to be done + */ + + ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) + mtspr SPRN_MAS7_MAS3,r14 + + clrrdi r15,r16,21 /* make EA 2M-aligned */ + mtspr SPRN_MAS2,r15 + +tlb_miss_huge_done_e6500: + lbz r16,TCD_ESEL_MAX(r11) + lbz r14,TCD_ESEL_FIRST(r11) + rlwimi r10,r7,16,0x00ff0000 /* insert esel_next into MAS0 */ + addi r7,r7,1 /* increment esel_next */ + mtspr SPRN_MAS0,r10 + cmpw r7,r16 + iseleq r7,r14,r7 /* if next == last use first */ + stb r7,TCD_ESEL_NEXT(r11) + + tlbwe + +tlb_miss_done_e6500: + .macro tlb_unlock_e6500 +BEGIN_FTR_SECTION + beq cr1,1f /* no unlock if lock was recursively grabbed */ + li r15,0 + isync + stb r15,0(r11) +1: +END_FTR_SECTION_IFSET(CPU_FTR_SMT) + .endm + + tlb_unlock_e6500 + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + tlb_epilog_bolted + rfi + +tlb_miss_huge_e6500: + beq tlb_miss_fault_e6500 + li r10,1 + andi. 
r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */ + rldimi r14,r10,63,0 /* Set PD_HUGE */ + xor r14,r14,r15 /* Clear size bits */ + ldx r14,0,r14 + + /* + * Now we build the MAS for a huge page. + * + * MAS 0 : ESEL needs to be filled by software round-robin + * - can be handled by indirect code + * MAS 1 : Need to clear IND and set TSIZE + * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler + */ + + subi r15,r15,10 /* Convert psize to tsize */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,~MAS1_IND + rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK + mtspr SPRN_MAS1,r10 + + li r10,-0x400 + sld r15,r10,r15 /* Generate mask based on size */ + and r10,r16,r15 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ + clrldi r15,r15,PAGE_SHIFT /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r10 + andi. r10,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + bne 1f + li r10,MAS3_SW|MAS3_UW + andc r15,r15,r10 +1: + mtspr SPRN_MAS7_MAS3,r15 + + mfspr r10,SPRN_MAS0 + b tlb_miss_huge_done_e6500 + +tlb_miss_kernel_e6500: + ld r14,PACA_KERNELPGD(r13) + cmpldi cr1,r15,8 /* Check for vmalloc region */ + beq+ cr1,tlb_miss_common_e6500 + +tlb_miss_fault_e6500: + tlb_unlock_e6500 + /* We need to check if it was an instruction miss */ + andi. r16,r16,1 + bne itlb_miss_fault_e6500 +dtlb_miss_fault_e6500: + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_e6500: + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_instruction_storage_book3e +#endif /* CONFIG_PPC_FSL_BOOK3E */ + +/********************************************************************** + * * + * TLB miss handling for Book3E with TLB reservation and HES support * + * * + **********************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* The page tables are mapped virtually linear. At this point, though, + * we don't know whether we are trying to fault in a first level + * virtual address or a virtual page table address. We can get that + * from bit 0x1 of the region ID which we have set for a page table + */ + andi. r10,r15,0x1 + bne- virt_page_table_tlb_miss + + std r14,EX_TLB_ESR(r12); /* save ESR */ + std r16,EX_TLB_DEAR(r12); /* save DEAR */ + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + li r11,_PAGE_PRESENT + oris r11,r11,_PAGE_ACCESSED@h + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r15,0 /* Check for user region */ + + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. 
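
The region dispatch these prologs perform boils down to a test on the top four bits of the effective address. Roughly, in C (region numbers taken from the comparisons in the handlers; a sketch, not kernel code):

	#include <stdint.h>

	enum ea_kind { EA_USER, EA_KERNEL, EA_LINEAR, EA_VPTE };

	static enum ea_kind classify_ea(uint64_t ea)
	{
		uint64_t region = ea >> 60;		/* srdi r15,r16,60 */

		if (region == 0xc)
			return EA_LINEAR;		/* bolted linear mapping */
		if (region & 0x1)
			return EA_VPTE;			/* virtually linear page table */
		return region ? EA_KERNEL : EA_USER;	/* e.g. 8 = vmalloc */
	}
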
+ * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + rlwimi r11,r14,32-19,27,27 + rlwimi r11,r14,32-16,19,19 + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by writing a crazy value in ESR in our exception frame + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + + cmpldi cr0,r15,0 /* Check for user region */ + std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ + beq normal_tlb_miss + + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of the first-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = faulting address + * r15 = region ID + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ +normal_tlb_miss: + /* So we first construct the page table address. We do that by + * shifting the bottom of the address (not the region ID) by + * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and + * or'ing the fourth high bit. 
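
For the 4K-page case, that pointer construction can be modeled in C roughly as follows (a sketch of the arithmetic only, not kernel code):

	#include <stdint.h>

	static uint64_t vpte_pointer(uint64_t ea)
	{
		uint64_t region = (ea >> 60) | 1;	/* page tables live at region|1 */
		/* one 8-byte PTE per 4K page, with the region bits stripped */
		uint64_t idx = ((ea & ((1ULL << 60) - 1)) >> 12) << 3;

		return (region << 60) | idx;
	}
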
+ * + * NOTE: For 64K pages, we do things slightly differently in + * order to handle the weird page table format used by linux + */ + ori r10,r15,0x1 +#ifdef CONFIG_PPC_64K_PAGES + /* For the top bits, 16 bytes per PTE */ + rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 + /* Now create the bottom bits as 0 in position 0x8000 and + * the rest calculated for 8 bytes per PTE + */ + rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 + /* Insert the bottom bits in */ + rlwimi r14,r15,0,16,31 +#else + rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 +#endif + sldi r15,r10,60 + clrrdi r14,r14,3 + or r10,r15,r14 + +BEGIN_MMU_FTR_SECTION + /* Set the TLB reservation and search for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,R16) + ld r14,0(r10) + beq normal_tlb_miss_done +MMU_FTR_SECTION_ELSE + ld r14,0(r10) +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) + +finish_normal_tlb_miss: + /* Check if required permissions are met */ + andc. r15,r11,r14 + bne- normal_tlb_miss_access_fault + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + * + * TODO: mix up code below for better scheduling + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r11 + + /* Check page size, if not standard, update MAS1 */ + rldicl r11,r14,64-8,64-8 +#ifdef CONFIG_PPC_64K_PAGES + cmpldi cr0,r11,BOOK3E_PAGESZ_64K +#else + cmpldi cr0,r11,BOOK3E_PAGESZ_4K +#endif + beq- 1f + mfspr r11,SPRN_MAS1 + rlwimi r11,r14,31,21,24 + rlwinm r11,r11,0,21,19 + mtspr SPRN_MAS1,r11 +1: + /* Move RPN in position */ + rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + clrldi r15,r11,12 /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + andi. r11,r14,_PAGE_DIRTY + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: +BEGIN_MMU_FTR_SECTION + srdi r16,r15,32 + mtspr SPRN_MAS3,r15 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r15 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +normal_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +normal_tlb_miss_access_fault: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_EXEC + bne 1f + ld r14,EX_TLB_DEAR(r12) + ld r15,EX_TLB_ESR(r12) + mtspr SPRN_DEAR,r14 + mtspr SPRN_ESR,r15 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. 
We are entered with: + * + * r16 = virtual page table faulting address + * r15 = region (top 4 bits of address) + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * Note that this should only ever be called as a second level handler + * with the current scheme when using SW load. + * That means we can always get the original fault DEAR at + * EX_TLB_DEAR-EX_TLB_SIZE(r12) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will restart the whole fault at level + * 0 so we don't care too much about clobbers + * + * XXX That code was written back when we couldn't clobber r14. We can now, + * so we could probably optimize things a bit + */ +virt_page_table_tlb_miss: + /* Are we hitting a kernel page table ? */ + andi. r10,r15,0x8 + + /* The cool thing now is that r10 contains 0 for user and 8 for kernel, + * and we happen to have the swapper_pg_dir at offset 8 from the user + * pgdir in the PACA :-). + */ + add r11,r10,r13 + + /* If kernel, we need to clear MAS1 TID */ + beq 1f + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 +1: +BEGIN_MMU_FTR_SECTION + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + */ + PPC_TLBSRX_DOT(0,R16) + beq virt_page_table_tlb_miss_done +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + bne- virt_page_table_tlb_miss_fault + + /* Get the PGD pointer */ + ld r15,PACAPGD(r11) + cmpldi cr0,r15,0 + beq- virt_page_table_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + + /* Ok, we're all right, we can now create a kernel translation for + * a 4K or 64K page from r16 -> r15. + */ + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + * + * So we only do MAS 2 and 3 for now... + */ + clrldi r11,r15,4 /* remove region ID from RPN */ + ori r10,r11,1 /* Or-in SR */ + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +BEGIN_MMU_FTR_SECTION +virt_page_table_tlb_miss_done: + + /* We have overridden MAS2:EPN but currently our primary TLB miss + * handler will always restore it so that should not be an issue, + * if we ever optimize the primary handler to not write MAS2 on + * some cases, we'll have to restore MAS2:EPN here based on the + * original fault's DEAR. 
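
The PACA trick used at the top of this handler (adding the 0-or-8 result of the region test directly to the PACA pointer) can be sketched in C; the field layout below is assumed purely for illustration:

	#include <stdint.h>

	struct paca_sketch {
		void *pgd;		/* PACAPGD: user page directory */
		void *kernel_pgd;	/* PACA_KERNELPGD: 8 bytes further on */
	};

	static void *pick_pgd(const struct paca_sketch *paca, uint64_t ea)
	{
		uint64_t off = (ea >> 60) & 0x8;	/* 0 for user, 8 for kernel */

		return *(void *const *)((const char *)paca + off);
	}
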
If we do that we have to modify the + * ITLB miss handler to also store SRR0 in the exception frame + * as DEAR. + * + * However, one nasty thing we did is we cleared the reservation + * (well, potentially we did). We do a trick here thus if we + * are not a level 0 exception (we interrupted the TLB miss) we + * offset the return address by -4 in order to replay the tlbsrx + * instruction there + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- 1f + ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) + addi r10,r11,-4 + std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) +1: +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + /* Return to caller, normal case */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); + TLB_MISS_EPILOG_SUCCESS + rfi + +virt_page_table_tlb_miss_fault: + /* If we fault here, things are a little bit tricky. We need to call + * either data or instruction store fault, and we need to retrieve + * the original fault address and ESR (for data). + * + * The thing is, we know that in normal circumstances, this is + * always called as a second level tlb miss for SW load or as a first + * level TLB miss for HW load, so we should be able to peek at the + * relevant information in the first exception frame in the PACA. + * + * However, we do need to double check that, because we may just hit + * a stray kernel pointer or a userland attack trying to hit those + * areas. If that is the case, we do a data fault. (We can't get here + * from an instruction tlb miss anyway). + * + * Note also that when going to a fault, we must unwind the previous + * level as well. Since we are doing that, we don't need to clear or + * restore the TLB reservation neither. + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- virt_page_table_tlb_miss_whacko_fault + + /* We dig the original DEAR and ESR from slot 0 */ + ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) + ld r16,EX_TLB_ESR+PACA_EXTLB(r13) + + /* We check for the "special" ESR value for instruction faults */ + cmpdi cr0,r16,-1 + beq 1f + mtspr SPRN_DEAR,r15 + mtspr SPRN_ESR,r16 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +virt_page_table_tlb_miss_whacko_fault: + /* The linear fault will restart everything so ESR and DEAR will + * not have been clobbered, let's just fault with what we have + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + + +/************************************************************** + * * + * TLB miss handling for Book3E with hw page table support * + * * + **************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_htw) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? 
*/ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_htw) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by keeping a crazy value for ESR in r14 + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = PGD pointer + * r14 = ESR + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will save/restore things for us + */ +htw_tlb_miss: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + * + * MAS1:IND should be already set based on MAS4 + */ + PPC_TLBSRX_DOT(0,R16) + beq htw_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. 
r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- htw_tlb_miss_fault + + /* Get the PGD pointer */ + cmpldi cr0,r15,0 + beq- htw_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault + + /* Ok, we're all right, we can now create an indirect entry for + * a 1M or 256M page. + * + * The last trick is now that because we use "half" pages for + * the HTW (1M IND is 2K and 256M IND is 32K) we need to account + * for an added LSB bit to the RPN. For 64K pages, there is no + * problem as we already use 32K arrays (half PTE pages), but for + * 4K page we need to extract a bit from the virtual address and + * insert it into the "PA52" bit of the RPN. + */ +#ifndef CONFIG_PPC_64K_PAGES + rlwimi r15,r16,32-9,20,20 +#endif + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + */ +#ifdef CONFIG_PPC_64K_PAGES + ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) +#else + ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) +#endif + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +htw_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +htw_tlb_miss_fault: + /* We need to check if it was an instruction miss. We know this + * though because r14 would contain -1 + */ + cmpdi cr0,r14,-1 + beq 1f + mtspr SPRN_DEAR,r16 + mtspr SPRN_ESR,r14 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of "any" level TLB miss handler for kernel linear + * mapping misses. We are entered with: + * + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = ESR (data) or -1 (instruction) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * In addition we know that we will not re-enter, so in theory, we could + * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. + * + * We also need to be careful about MAS registers here & TLB reservation, + * as we know we'll have clobbered them if we interrupt the main TLB miss + * handlers in which case we probably want to do a full restart at level + * 0 rather than saving / restoring the MAS. + * + * Note: If we care about performance of that core, we can easily shuffle + * a few things around + */ +tlb_load_linear: + /* For now, we assume the linear mapping is contiguous and stops at + * linear_map_top. 
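
The "half page" indirect-entry sizes quoted earlier in this handler's comment (a 1M IND entry backed by a 2K PTE array, a 256M IND entry by a 32K one) are plain PTE-count arithmetic and can be checked host-side:

	#include <assert.h>

	int main(void)
	{
		/* 1M region, 4K pages: 256 PTEs * 8 bytes = 2K, half a 4K page */
		assert((1u << 20) / (1u << 12) * 8 == 2048);
		/* 256M region, 64K pages: 4096 PTEs * 8 bytes = 32K, half a 64K page */
		assert((256u << 20) / (64u << 10) * 8 == 32768);
		return 0;
	}
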
We also assume the size is a multiple of 1G, thus
+ * we only use 1G pages for now. That might have to be changed in a
+ * final implementation, especially when dealing with hypervisors
+ */
+	ld	r11,PACATOC(r13)
+	ld	r11,linear_map_top@got(r11)
+	ld	r10,0(r11)
+	tovirt(10,10)
+	cmpld	cr0,r16,r10
+	bge	tlb_load_linear_fault
+
+	/* MAS1 need whole new setup. */
+	li	r15,(BOOK3E_PAGESZ_1GB << MAS1_TSIZE_SHIFT)

-#include
-#include
-#include
-#include
-#include
-
-#include
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-/*
- * On Book3E CPUs, the vmemmap is currently mapped in the top half of
- * the vmalloc space using normal page tables, though the size of
- * pages encoded in the PTEs can be different
- */
-int __meminit vmemmap_create_mapping(unsigned long start,
-				     unsigned long page_size,
-				     unsigned long phys)
-{
-	/* Create a PTE encoding without page size */
-	unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
-		_PAGE_KERNEL_RW;
-
-	/* PTEs only contain page size encodings up to 32M */
-	BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf);
-
-	/* Encode the size in the PTE */
-	flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8;
-
-	/* For each PTE for that area, map things. Note that we don't
-	 * increment phys because all PTEs are of the large size and
-	 * thus must have the low bits clear
-	 */
-	for (i = 0; i < page_size; i += PAGE_SIZE)
-		BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags)));
-
-	return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-void vmemmap_remove_mapping(unsigned long start,
-			    unsigned long page_size)
-{
-}
-#endif
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
-static __ref void *early_alloc_pgtable(unsigned long size)
-{
-	void *ptr;
-
-	ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT,
-				     __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE);
-
-	if (!ptr)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n",
-		      __func__, size, size, __pa(MAX_DMA_ADDRESS));
-
-	return ptr;
-}
-
-/*
- * map_kernel_page currently only called by __ioremap
- * map_kernel_page adds an entry to the ioremap page table
- * and adds an entry to the HPT, possibly bolting it
- */
-int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
-{
-	pgd_t *pgdp;
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-
-	BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
-	if (slab_is_available()) {
-		pgdp = pgd_offset_k(ea);
-		pudp = pud_alloc(&init_mm, pgdp, ea);
-		if (!pudp)
-			return -ENOMEM;
-		pmdp = pmd_alloc(&init_mm, pudp, ea);
-		if (!pmdp)
-			return -ENOMEM;
-		ptep = pte_alloc_kernel(pmdp, ea);
-		if (!ptep)
-			return -ENOMEM;
-	} else {
-		pgdp = pgd_offset_k(ea);
-#ifndef __PAGETABLE_PUD_FOLDED
-		if (pgd_none(*pgdp)) {
-			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
-			pgd_populate(&init_mm, pgdp, pudp);
-		}
-#endif /* !__PAGETABLE_PUD_FOLDED */
-		pudp = pud_offset(pgdp, ea);
-		if (pud_none(*pudp)) {
-			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
-			pud_populate(&init_mm, pudp, pmdp);
-		}
-		pmdp = pmd_offset(pudp, ea);
-		if (!pmd_present(*pmdp)) {
-			ptep = early_alloc_pgtable(PAGE_SIZE);
-			pmd_populate_kernel(&init_mm, pmdp, ptep);
-		}
-		ptep = pte_offset_kernel(pmdp, ea);
-	}
-	set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
-
-	smp_wmb();
-	return 0;
-}
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
deleted file mode 100644
index 9ed90064f542..000000000000
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ /dev/null
@@ -1,1280 +0,0 @@
-/*
- * Low level TLB miss handlers for Book3E
- *
- * Copyright (C) 2008-2009
- *	Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_PPC_64K_PAGES -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) -#else -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) -#endif -#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) -#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) -#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) - -/********************************************************************** - * * - * TLB miss handling for Book3E with a bolted linear mapping * - * No virtual page table, no nested TLB misses * - * * - **********************************************************************/ - -/* - * Note that, unlike non-bolted handlers, TLB_EXFRAME is not - * modified by the TLB miss handlers themselves, since the TLB miss - * handler code will not itself cause a recursive TLB miss. - * - * TLB_EXFRAME will be modified when crit/mc/debug exceptions are - * entered/exited. - */ -.macro tlb_prolog_bolted intnum addr - mtspr SPRN_SPRG_GEN_SCRATCH,r12 - mfspr r12,SPRN_SPRG_TLB_EXFRAME - std r13,EX_TLB_R13(r12) - std r10,EX_TLB_R10(r12) - mfspr r13,SPRN_SPRG_PACA - - mfcr r10 - std r11,EX_TLB_R11(r12) -#ifdef CONFIG_KVM_BOOKE_HV -BEGIN_FTR_SECTION - mfspr r11, SPRN_SRR1 -END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) -#endif - DO_KVM \intnum, SPRN_SRR1 - std r16,EX_TLB_R16(r12) - mfspr r16,\addr /* get faulting address */ - std r14,EX_TLB_R14(r12) - ld r14,PACAPGD(r13) - std r15,EX_TLB_R15(r12) - std r10,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E -START_BTB_FLUSH_SECTION - mfspr r11, SPRN_SRR1 - andi. r10,r11,MSR_PR - beq 1f - BTB_FLUSH(r10) -1: -END_BTB_FLUSH_SECTION - std r7,EX_TLB_R7(r12) -#endif - TLB_MISS_PROLOG_STATS -.endm - -.macro tlb_epilog_bolted - ld r14,EX_TLB_CR(r12) -#ifdef CONFIG_PPC_FSL_BOOK3E - ld r7,EX_TLB_R7(r12) -#endif - ld r10,EX_TLB_R10(r12) - ld r11,EX_TLB_R11(r12) - ld r13,EX_TLB_R13(r12) - mtcr r14 - ld r14,EX_TLB_R14(r12) - ld r15,EX_TLB_R15(r12) - TLB_MISS_RESTORE_STATS - ld r16,EX_TLB_R16(r12) - mfspr r12,SPRN_SPRG_GEN_SCRATCH -.endm - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss_bolted) - tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR - - /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - /* We pre-test some combination of permissions to avoid double - * faults: - * - * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE - * ESR_ST is 0x00800000 - * _PAGE_BAP_SW is 0x00000010 - * So the shift is >> 19. This tests for supervisor writeability. - * If the page happens to be supervisor writeable and not user - * writeable, we will take a new fault later, but that should be - * a rare enough case. - * - * We also move ESR_ST in _PAGE_DIRTY position - * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 - * - * MAS1 is preset for all we need except for TID that needs to - * be cleared for kernel translations - */ - - mfspr r11,SPRN_ESR - - srdi r15,r16,60 /* get region */ - rldicl. 
r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */ - - rlwinm r10,r11,32-19,27,27 - rlwimi r10,r11,32-16,19,19 - cmpwi r15,0 /* user vs kernel check */ - ori r10,r10,_PAGE_PRESENT - oris r11,r10,_PAGE_ACCESSED@h - - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne tlb_miss_kernel_bolted - -tlb_miss_common_bolted: -/* - * This is the guts of the TLB miss handler for bolted-linear. - * We are entered with: - * - * r16 = faulting address - * r15 = crap (free to use) - * r14 = page table base - * r13 = PACA - * r11 = PTE permission mask - * r10 = crap (free to use) - */ - rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 - cmpldi cr0,r14,0 - clrrdi r15,r15,3 - beq tlb_miss_fault_bolted /* No PGDIR, bail */ - -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ldx r14,r14,r15 /* grab pgd entry */ - beq tlb_miss_done_bolted /* tlb exists already, bail */ -MMU_FTR_SECTION_ELSE - ldx r14,r14,r15 /* grab pgd entry */ -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) - -#ifndef CONFIG_PPC_64K_PAGES - rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ - ldx r14,r14,r15 /* grab pud entry */ -#endif /* CONFIG_PPC_64K_PAGES */ - - rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted - ldx r14,r14,r15 /* Grab pmd entry */ - - rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_fault_bolted - ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */ - - /* Check if required permissions are met */ - andc. r15,r11,r14 - rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - bne- tlb_miss_fault_bolted - - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE need change if !base page size, not - * yet implemented for now - * MAS 2 : Defaults not useful, need to be redone - * MAS 3+7 : Needs to be done - */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - clrldi r15,r15,12 /* Clear crap at the top */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - mtspr SPRN_MAS2,r11 - andi. r11,r14,_PAGE_DIRTY - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - bne 1f - li r11,MAS3_SW|MAS3_UW - andc r15,r15,r11 -1: - mtspr SPRN_MAS7_MAS3,r15 - tlbwe - -tlb_miss_done_bolted: - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) - tlb_epilog_bolted - rfi - -itlb_miss_kernel_bolted: - li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h -tlb_miss_kernel_bolted: - mfspr r10,SPRN_MAS1 - ld r14,PACA_KERNELPGD(r13) - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ tlb_miss_common_bolted - -tlb_miss_fault_bolted: - /* We need to check if it was an instruction miss */ - andi. 
r10,r11,_PAGE_EXEC|_PAGE_BAP_SX - bne itlb_miss_fault_bolted -dtlb_miss_fault_bolted: - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_data_storage_book3e -itlb_miss_fault_bolted: - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_instruction_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss_bolted) - tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 - - rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - srdi r15,r16,60 /* get region */ - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne- itlb_miss_fault_bolted - - li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - - cmpldi cr0,r15,0 /* Check for user region */ - oris r11,r11,_PAGE_ACCESSED@h - beq tlb_miss_common_bolted - b itlb_miss_kernel_bolted - -#ifdef CONFIG_PPC_FSL_BOOK3E -/* - * TLB miss handling for e6500 and derivatives, using hardware tablewalk. - * - * Linear mapping is bolted: no virtual page table or nested TLB misses - * Indirect entries in TLB1, hardware loads resulting direct entries - * into TLB0 - * No HES or NV hint on TLB1, so we need to do software round-robin - * No tlbsrx. so we need a spinlock, and we have to deal - * with MAS-damage caused by tlbsx - * 4K pages only - */ - - START_EXCEPTION(instruction_tlb_miss_e6500) - tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 - - ld r11,PACA_TCD_PTR(r13) - srdi. r15,r16,60 /* get region */ - ori r16,r16,1 - - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne tlb_miss_kernel_e6500 /* user/kernel test */ - - b tlb_miss_common_e6500 - - START_EXCEPTION(data_tlb_miss_e6500) - tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR - - ld r11,PACA_TCD_PTR(r13) - srdi. r15,r16,60 /* get region */ - rldicr r16,r16,0,62 - - TLB_MISS_STATS_SAVE_INFO_BOLTED - bne tlb_miss_kernel_e6500 /* user vs kernel check */ - -/* - * This is the guts of the TLB miss handler for e6500 and derivatives. - * We are entered with: - * - * r16 = page of faulting address (low bit 0 if data, 1 if instruction) - * r15 = crap (free to use) - * r14 = page table base - * r13 = PACA - * r11 = tlb_per_core ptr - * r10 = crap (free to use) - * r7 = esel_next - */ -tlb_miss_common_e6500: - crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ - -BEGIN_FTR_SECTION /* CPU_FTR_SMT */ - /* - * Search if we already have an indirect entry for that virtual - * address, and if we do, bail out. - * - * MAS6:IND should be already set based on MAS4 - */ - lhz r10,PACAPACAINDEX(r13) - addi r10,r10,1 - crclr cr1*4+eq /* set cr1.eq = 0 for non-recursive */ -1: lbarx r15,0,r11 - cmpdi r15,0 - bne 2f - stbcx. r10,0,r11 - bne 1b -3: - .subsection 1 -2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */ - beq cr1,3b /* unlock will happen if cr1.eq = 0 */ -10: lbz r15,0(r11) - cmpdi r15,0 - bne 10b - b 1b - .previous -END_FTR_SECTION_IFSET(CPU_FTR_SMT) - - lbz r7,TCD_ESEL_NEXT(r11) - -BEGIN_FTR_SECTION /* CPU_FTR_SMT */ - /* - * Erratum A-008139 says that we can't use tlbwe to change - * an indirect entry in any way (including replacing or - * invalidating) if the other thread could be in the process - * of a lookup. The workaround is to invalidate the entry - * with tlbilx before overwriting. - */ - - rlwinm r10,r7,16,0xff0000 - oris r10,r10,MAS0_TLBSEL(1)@h - mtspr SPRN_MAS0,r10 - isync - tlbre - mfspr r15,SPRN_MAS1 - andis. 
r15,r15,MAS1_VALID@h - beq 5f - -BEGIN_FTR_SECTION_NESTED(532) - mfspr r10,SPRN_MAS8 - rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */ - mtspr SPRN_MAS5,r10 -END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) - - mfspr r10,SPRN_MAS1 - rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */ - rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */ - mfspr r10,SPRN_MAS6 - mtspr SPRN_MAS6,r15 - - mfspr r15,SPRN_MAS2 - isync - tlbilxva 0,r15 - isync - - mtspr SPRN_MAS6,r10 - -5: -BEGIN_FTR_SECTION_NESTED(532) - li r10,0 - mtspr SPRN_MAS8,r10 - mtspr SPRN_MAS5,r10 -END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) - - tlbsx 0,r16 - mfspr r10,SPRN_MAS1 - andis. r15,r10,MAS1_VALID@h - bne tlb_miss_done_e6500 -FTR_SECTION_ELSE - mfspr r10,SPRN_MAS1 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) - - oris r10,r10,MAS1_VALID@h - beq cr2,4f - rlwinm r10,r10,0,16,1 /* Clear TID */ -4: mtspr SPRN_MAS1,r10 - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- tlb_miss_fault_e6500 - - rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 - cmpldi cr0,r14,0 - clrrdi r15,r15,3 - beq- tlb_miss_fault_e6500 /* No PGDIR, bail */ - ldx r14,r14,r15 /* grab pgd entry */ - - rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_huge_e6500 /* Bad pgd entry or hugepage; bail */ - ldx r14,r14,r15 /* grab pud entry */ - - rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 - clrrdi r15,r15,3 - cmpdi cr0,r14,0 - bge tlb_miss_huge_e6500 - ldx r14,r14,r15 /* Grab pmd entry */ - - mfspr r10,SPRN_MAS0 - cmpdi cr0,r14,0 - bge tlb_miss_huge_e6500 - - /* Now we build the MAS for a 2M indirect page: - * - * MAS 0 : ESEL needs to be filled by software round-robin - * MAS 1 : Fully set up - * - PID already updated by caller if necessary - * - TSIZE for now is base ind page size always - * - TID already cleared if necessary - * MAS 2 : Default not 2M-aligned, need to be redone - * MAS 3+7 : Needs to be done - */ - - ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) - mtspr SPRN_MAS7_MAS3,r14 - - clrrdi r15,r16,21 /* make EA 2M-aligned */ - mtspr SPRN_MAS2,r15 - -tlb_miss_huge_done_e6500: - lbz r16,TCD_ESEL_MAX(r11) - lbz r14,TCD_ESEL_FIRST(r11) - rlwimi r10,r7,16,0x00ff0000 /* insert esel_next into MAS0 */ - addi r7,r7,1 /* increment esel_next */ - mtspr SPRN_MAS0,r10 - cmpw r7,r16 - iseleq r7,r14,r7 /* if next == last use first */ - stb r7,TCD_ESEL_NEXT(r11) - - tlbwe - -tlb_miss_done_e6500: - .macro tlb_unlock_e6500 -BEGIN_FTR_SECTION - beq cr1,1f /* no unlock if lock was recursively grabbed */ - li r15,0 - isync - stb r15,0(r11) -1: -END_FTR_SECTION_IFSET(CPU_FTR_SMT) - .endm - - tlb_unlock_e6500 - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) - tlb_epilog_bolted - rfi - -tlb_miss_huge_e6500: - beq tlb_miss_fault_e6500 - li r10,1 - andi. r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */ - rldimi r14,r10,63,0 /* Set PD_HUGE */ - xor r14,r14,r15 /* Clear size bits */ - ldx r14,0,r14 - - /* - * Now we build the MAS for a huge page. 
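- * (Illustrative aside, not from the original source: with the
- * power-of-two TSIZE encoding assumed here, a page of 2^shift
- * bytes takes TSIZE = shift - 10, which is what the
- * "subi r15,r15,10" below computes; e.g. a 2M page (shift = 21)
- * gets TSIZE 11, and the mask -0x400 << 11 is then ~(2M - 1).)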
- * - * MAS 0 : ESEL needs to be filled by software round-robin - * - can be handled by indirect code - * MAS 1 : Need to clear IND and set TSIZE - * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler - */ - - subi r15,r15,10 /* Convert psize to tsize */ - mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,~MAS1_IND - rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK - mtspr SPRN_MAS1,r10 - - li r10,-0x400 - sld r15,r10,r15 /* Generate mask based on size */ - and r10,r16,r15 - rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ - clrldi r15,r15,PAGE_SHIFT /* Clear crap at the top */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - mtspr SPRN_MAS2,r10 - andi. r10,r14,_PAGE_DIRTY - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - bne 1f - li r10,MAS3_SW|MAS3_UW - andc r15,r15,r10 -1: - mtspr SPRN_MAS7_MAS3,r15 - - mfspr r10,SPRN_MAS0 - b tlb_miss_huge_done_e6500 - -tlb_miss_kernel_e6500: - ld r14,PACA_KERNELPGD(r13) - cmpldi cr1,r15,8 /* Check for vmalloc region */ - beq+ cr1,tlb_miss_common_e6500 - -tlb_miss_fault_e6500: - tlb_unlock_e6500 - /* We need to check if it was an instruction miss */ - andi. r16,r16,1 - bne itlb_miss_fault_e6500 -dtlb_miss_fault_e6500: - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_data_storage_book3e -itlb_miss_fault_e6500: - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - tlb_epilog_bolted - b exc_instruction_storage_book3e -#endif /* CONFIG_PPC_FSL_BOOK3E */ - -/********************************************************************** - * * - * TLB miss handling for Book3E with TLB reservation and HES support * - * * - **********************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* The page tables are mapped virtually linear. At this point, though, - * we don't know whether we are trying to fault in a first level - * virtual address or a virtual page table address. We can get that - * from bit 0x1 of the region ID which we have set for a page table - */ - andi. r10,r15,0x1 - bne- virt_page_table_tlb_miss - - std r14,EX_TLB_ESR(r12); /* save ESR */ - std r16,EX_TLB_DEAR(r12); /* save DEAR */ - - /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ - li r11,_PAGE_PRESENT - oris r11,r11,_PAGE_ACCESSED@h - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r15,0 /* Check for user region */ - - /* We pre-test some combination of permissions to avoid double - * faults: - * - * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE - * ESR_ST is 0x00800000 - * _PAGE_BAP_SW is 0x00000010 - * So the shift is >> 19. This tests for supervisor writeability. - * If the page happens to be supervisor writeable and not user - * writeable, we will take a new fault later, but that should be - * a rare enough case. 
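- *
- * (Quick check, purely illustrative: 0x00800000 >> 19 = 0x00000010,
- * i.e. ESR_ST shifted right by 19 lands exactly on _PAGE_BAP_SW,
- * so a single rlwimi below does the move.)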
- * - * We also move ESR_ST in _PAGE_DIRTY position - * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 - * - * MAS1 is preset for all we need except for TID that needs to - * be cleared for kernel translations - */ - rlwimi r11,r14,32-19,27,27 - rlwimi r11,r14,32-16,19,19 - beq normal_tlb_miss - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by writing a crazy value in ESR in our exception frame - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r15,r16,60 /* get region */ - cmpldi cr0,r15,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - - cmpldi cr0,r15,0 /* Check for user region */ - std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ - beq normal_tlb_miss - - li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ - oris r11,r11,_PAGE_ACCESSED@h - /* XXX replace the RMW cycles with immediate loads + writes */ - mfspr r10,SPRN_MAS1 - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ normal_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -/* - * This is the guts of the first-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = faulting address - * r15 = region ID - * r14 = crap (free to use) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = PTE permission mask - * r10 = crap (free to use) - */ -normal_tlb_miss: - /* So we first construct the page table address. We do that by - * shifting the bottom of the address (not the region ID) by - * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and - * or'ing the fourth high bit. 
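- *
- * (Worked example, illustrative: assuming 4K pages, a user fault
- * at EA 0x123456789abc has PTE index EA >> 12 = 0x123456789, so
- * the virtual PTE pointer is ((EA >> 12) << 3) | ((region | 1) << 60)
- * = 0x100000091a2b3c48 with region 0.)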
- * - * NOTE: For 64K pages, we do things slightly differently in - * order to handle the weird page table format used by linux - */ - ori r10,r15,0x1 -#ifdef CONFIG_PPC_64K_PAGES - /* For the top bits, 16 bytes per PTE */ - rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 - /* Now create the bottom bits as 0 in position 0x8000 and - * the rest calculated for 8 bytes per PTE - */ - rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 - /* Insert the bottom bits in */ - rlwimi r14,r15,0,16,31 -#else - rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 -#endif - sldi r15,r10,60 - clrrdi r14,r14,3 - or r10,r15,r14 - -BEGIN_MMU_FTR_SECTION - /* Set the TLB reservation and search for existing entry. Then load - * the entry. - */ - PPC_TLBSRX_DOT(0,R16) - ld r14,0(r10) - beq normal_tlb_miss_done -MMU_FTR_SECTION_ELSE - ld r14,0(r10) -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) - -finish_normal_tlb_miss: - /* Check if required permissions are met */ - andc. r15,r11,r14 - bne- normal_tlb_miss_access_fault - - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE need change if !base page size, not - * yet implemented for now - * MAS 2 : Defaults not useful, need to be redone - * MAS 3+7 : Needs to be done - * - * TODO: mix up code below for better scheduling - */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - mtspr SPRN_MAS2,r11 - - /* Check page size, if not standard, update MAS1 */ - rldicl r11,r14,64-8,64-8 -#ifdef CONFIG_PPC_64K_PAGES - cmpldi cr0,r11,BOOK3E_PAGESZ_64K -#else - cmpldi cr0,r11,BOOK3E_PAGESZ_4K -#endif - beq- 1f - mfspr r11,SPRN_MAS1 - rlwimi r11,r14,31,21,24 - rlwinm r11,r11,0,21,19 - mtspr SPRN_MAS1,r11 -1: - /* Move RPN in position */ - rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT - clrldi r15,r11,12 /* Clear crap at the top */ - rlwimi r15,r14,32-8,22,25 /* Move in U bits */ - rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ - - /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ - andi. r11,r14,_PAGE_DIRTY - bne 1f - li r11,MAS3_SW|MAS3_UW - andc r15,r15,r11 -1: -BEGIN_MMU_FTR_SECTION - srdi r16,r15,32 - mtspr SPRN_MAS3,r15 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r15 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -normal_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) - TLB_MISS_EPILOG_SUCCESS - rfi - -normal_tlb_miss_access_fault: - /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_EXEC - bne 1f - ld r14,EX_TLB_DEAR(r12) - ld r15,EX_TLB_ESR(r12) - mtspr SPRN_DEAR,r14 - mtspr SPRN_ESR,r15 - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - -/* - * This is the guts of the second-level TLB miss handler for direct - * misses. 
We are entered with: - * - * r16 = virtual page table faulting address - * r15 = region (top 4 bits of address) - * r14 = crap (free to use) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * Note that this should only ever be called as a second level handler - * with the current scheme when using SW load. - * That means we can always get the original fault DEAR at - * EX_TLB_DEAR-EX_TLB_SIZE(r12) - * - * It can be re-entered by the linear mapping miss handler. However, to - * avoid too much complication, it will restart the whole fault at level - * 0 so we don't care too much about clobbers - * - * XXX That code was written back when we couldn't clobber r14. We can now, - * so we could probably optimize things a bit - */ -virt_page_table_tlb_miss: - /* Are we hitting a kernel page table ? */ - andi. r10,r15,0x8 - - /* The cool thing now is that r10 contains 0 for user and 8 for kernel, - * and we happen to have the swapper_pg_dir at offset 8 from the user - * pgdir in the PACA :-). - */ - add r11,r10,r13 - - /* If kernel, we need to clear MAS1 TID */ - beq 1f - /* XXX replace the RMW cycles with immediate loads + writes */ - mfspr r10,SPRN_MAS1 - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 -1: -BEGIN_MMU_FTR_SECTION - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - */ - PPC_TLBSRX_DOT(0,R16) - beq virt_page_table_tlb_miss_done -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 - bne- virt_page_table_tlb_miss_fault - - /* Get the PGD pointer */ - ld r15,PACAPGD(r11) - cmpldi cr0,r15,0 - beq- virt_page_table_tlb_miss_fault - - /* Get to PGD entry */ - rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault - -#ifndef CONFIG_PPC_64K_PAGES - /* Get to PUD entry */ - rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get to PMD entry */ - rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge virt_page_table_tlb_miss_fault - - /* Ok, we're all right, we can now create a kernel translation for - * a 4K or 64K page from r16 -> r15. - */ - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE for now is base page size always - * MAS 2 : Use defaults - * MAS 3+7 : Needs to be done - * - * So we only do MAS 2 and 3 for now... - */ - clrldi r11,r15,4 /* remove region ID from RPN */ - ori r10,r11,1 /* Or-in SR */ - -BEGIN_MMU_FTR_SECTION - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -BEGIN_MMU_FTR_SECTION -virt_page_table_tlb_miss_done: - - /* We have overridden MAS2:EPN but currently our primary TLB miss - * handler will always restore it so that should not be an issue, - * if we ever optimize the primary handler to not write MAS2 on - * some cases, we'll have to restore MAS2:EPN here based on the - * original fault's DEAR. 
If we do that we have to modify the - * ITLB miss handler to also store SRR0 in the exception frame - * as DEAR. - * - * However, one nasty thing we did is we cleared the reservation - * (well, potentially we did). We do a trick here thus if we - * are not a level 0 exception (we interrupted the TLB miss) we - * offset the return address by -4 in order to replay the tlbsrx - * instruction there - */ - subf r10,r13,r12 - cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE - bne- 1f - ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) - addi r10,r11,-4 - std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) -1: -END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) - /* Return to caller, normal case */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); - TLB_MISS_EPILOG_SUCCESS - rfi - -virt_page_table_tlb_miss_fault: - /* If we fault here, things are a little bit tricky. We need to call - * either data or instruction store fault, and we need to retrieve - * the original fault address and ESR (for data). - * - * The thing is, we know that in normal circumstances, this is - * always called as a second level tlb miss for SW load or as a first - * level TLB miss for HW load, so we should be able to peek at the - * relevant information in the first exception frame in the PACA. - * - * However, we do need to double check that, because we may just hit - * a stray kernel pointer or a userland attack trying to hit those - * areas. If that is the case, we do a data fault. (We can't get here - * from an instruction tlb miss anyway). - * - * Note also that when going to a fault, we must unwind the previous - * level as well. Since we are doing that, we don't need to clear or - * restore the TLB reservation neither. - */ - subf r10,r13,r12 - cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE - bne- virt_page_table_tlb_miss_whacko_fault - - /* We dig the original DEAR and ESR from slot 0 */ - ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) - ld r16,EX_TLB_ESR+PACA_EXTLB(r13) - - /* We check for the "special" ESR value for instruction faults */ - cmpdi cr0,r16,-1 - beq 1f - mtspr SPRN_DEAR,r15 - mtspr SPRN_ESR,r16 - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -virt_page_table_tlb_miss_whacko_fault: - /* The linear fault will restart everything so ESR and DEAR will - * not have been clobbered, let's just fault with what we have - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - - -/************************************************************** - * * - * TLB miss handling for Book3E with hw page table support * - * * - **************************************************************/ - - -/* Data TLB miss */ - START_EXCEPTION(data_tlb_miss_htw) - TLB_MISS_PROLOG - - /* Now we handle the fault proper. We only save DEAR in normal - * fault case since that's the only interesting values here. - * We could probably also optimize by not saving SRR0/1 in the - * linear mapping case but I'll leave that for later - */ - mfspr r14,SPRN_ESR - mfspr r16,SPRN_DEAR /* get faulting address */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? 
*/ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r11,0 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault with whatever DEAR and ESR - * are here - */ - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e - -/* Instruction TLB miss */ - START_EXCEPTION(instruction_tlb_miss_htw) - TLB_MISS_PROLOG - - /* If we take a recursive fault, the second level handler may need - * to know whether we are handling a data or instruction fault in - * order to get to the right store fault handler. We provide that - * info by keeping a crazy value for ESR in r14 - */ - li r14,-1 /* store to exception frame is done later */ - - /* Now we handle the fault proper. We only save DEAR in the non - * linear mapping case since we know the linear mapping case will - * not re-enter. We could indeed optimize and also not save SRR0/1 - * in the linear mapping case but I'll leave that for later - * - * Faulting address is SRR0 which is already in r16 - */ - srdi r11,r16,60 /* get region */ - cmpldi cr0,r11,0xc /* linear mapping ? */ - TLB_MISS_STATS_SAVE_INFO - beq tlb_load_linear /* yes -> go to linear map load */ - - /* We do the user/kernel test for the PID here along with the RW test - */ - cmpldi cr0,r11,0 /* Check for user region */ - ld r15,PACAPGD(r13) /* Load user pgdir */ - beq htw_tlb_miss - - /* XXX replace the RMW cycles with immediate loads + writes */ -1: mfspr r10,SPRN_MAS1 - cmpldi cr0,r11,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ - beq+ htw_tlb_miss - - /* We got a crappy address, just fault */ - TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - - -/* - * This is the guts of the second-level TLB miss handler for direct - * misses. We are entered with: - * - * r16 = virtual page table faulting address - * r15 = PGD pointer - * r14 = ESR - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * It can be re-entered by the linear mapping miss handler. However, to - * avoid too much complication, it will save/restore things for us - */ -htw_tlb_miss: - /* Search if we already have a TLB entry for that virtual address, and - * if we do, bail out. - * - * MAS1:IND should be already set based on MAS4 - */ - PPC_TLBSRX_DOT(0,R16) - beq htw_tlb_miss_done - - /* Now, we need to walk the page tables. First check if we are in - * range. - */ - rldicl. 
r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 - bne- htw_tlb_miss_fault - - /* Get the PGD pointer */ - cmpldi cr0,r15,0 - beq- htw_tlb_miss_fault - - /* Get to PGD entry */ - rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - -#ifndef CONFIG_PPC_64K_PAGES - /* Get to PUD entry */ - rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ - - /* Get to PMD entry */ - rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 - clrrdi r10,r11,3 - ldx r15,r10,r15 - cmpdi cr0,r15,0 - bge htw_tlb_miss_fault - - /* Ok, we're all right, we can now create an indirect entry for - * a 1M or 256M page. - * - * The last trick is now that because we use "half" pages for - * the HTW (1M IND is 2K and 256M IND is 32K) we need to account - * for an added LSB bit to the RPN. For 64K pages, there is no - * problem as we already use 32K arrays (half PTE pages), but for - * 4K page we need to extract a bit from the virtual address and - * insert it into the "PA52" bit of the RPN. - */ -#ifndef CONFIG_PPC_64K_PAGES - rlwimi r15,r16,32-9,20,20 -#endif - /* Now we build the MAS: - * - * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG - * MAS 1 : Almost fully setup - * - PID already updated by caller if necessary - * - TSIZE for now is base ind page size always - * MAS 2 : Use defaults - * MAS 3+7 : Needs to be done - */ -#ifdef CONFIG_PPC_64K_PAGES - ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) -#else - ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -#endif - -BEGIN_MMU_FTR_SECTION - srdi r16,r10,32 - mtspr SPRN_MAS3,r10 - mtspr SPRN_MAS7,r16 -MMU_FTR_SECTION_ELSE - mtspr SPRN_MAS7_MAS3,r10 -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) - - tlbwe - -htw_tlb_miss_done: - /* We don't bother with restoring DEAR or ESR since we know we are - * level 0 and just going back to userland. They are only needed - * if you are going to take an access fault - */ - TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) - TLB_MISS_EPILOG_SUCCESS - rfi - -htw_tlb_miss_fault: - /* We need to check if it was an instruction miss. We know this - * though because r14 would contain -1 - */ - cmpdi cr0,r14,-1 - beq 1f - mtspr SPRN_DEAR,r16 - mtspr SPRN_ESR,r14 - TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_data_storage_book3e -1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) - TLB_MISS_EPILOG_ERROR - b exc_instruction_storage_book3e - -/* - * This is the guts of "any" level TLB miss handler for kernel linear - * mapping misses. We are entered with: - * - * - * r16 = faulting address - * r15 = crap (free to use) - * r14 = ESR (data) or -1 (instruction) - * r13 = PACA - * r12 = TLB exception frame in PACA - * r11 = crap (free to use) - * r10 = crap (free to use) - * - * In addition we know that we will not re-enter, so in theory, we could - * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. - * - * We also need to be careful about MAS registers here & TLB reservation, - * as we know we'll have clobbered them if we interrupt the main TLB miss - * handlers in which case we probably want to do a full restart at level - * 0 rather than saving / restoring the MAS. - * - * Note: If we care about performance of that core, we can easily shuffle - * a few things around - */ -tlb_load_linear: - /* For now, we assume the linear mapping is contiguous and stops at - * linear_map_top. 
We also assume the size is a multiple of 1G, thus - * we only use 1G pages for now. That might have to be changed in a - * final implementation, especially when dealing with hypervisors - */ - ld r11,PACATOC(r13) - ld r11,linear_map_top@got(r11) - ld r10,0(r11) - tovirt(10,10) - cmpld cr0,r16,r10 - bge tlb_load_linear_fault - - /* MAS1 need whole new setup. */ - li r15,(BOOK3E_PAGESZ_1GB< - * IBM Corp. - * - * Derived from arch/ppc/mm/init.c: - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -/* - * This struct lists the sw-supported page sizes. The hardawre MMU may support - * other sizes not listed here. The .ind field is only used on MMUs that have - * indirect page table entries. - */ -#if defined(CONFIG_PPC_BOOK3E_MMU) || defined(CONFIG_PPC_8xx) -#ifdef CONFIG_PPC_FSL_BOOK3E -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - [MMU_PAGE_4K] = { - .shift = 12, - .enc = BOOK3E_PAGESZ_4K, - }, - [MMU_PAGE_2M] = { - .shift = 21, - .enc = BOOK3E_PAGESZ_2M, - }, - [MMU_PAGE_4M] = { - .shift = 22, - .enc = BOOK3E_PAGESZ_4M, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .enc = BOOK3E_PAGESZ_16M, - }, - [MMU_PAGE_64M] = { - .shift = 26, - .enc = BOOK3E_PAGESZ_64M, - }, - [MMU_PAGE_256M] = { - .shift = 28, - .enc = BOOK3E_PAGESZ_256M, - }, - [MMU_PAGE_1G] = { - .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, - }, -}; -#elif defined(CONFIG_PPC_8xx) -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - /* we only manage 4k and 16k pages as normal pages */ -#ifdef CONFIG_PPC_4K_PAGES - [MMU_PAGE_4K] = { - .shift = 12, - }, -#else - [MMU_PAGE_16K] = { - .shift = 14, - }, -#endif - [MMU_PAGE_512K] = { - .shift = 19, - }, - [MMU_PAGE_8M] = { - .shift = 23, - }, -}; -#else -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { - [MMU_PAGE_4K] = { - .shift = 12, - .ind = 20, - .enc = BOOK3E_PAGESZ_4K, - }, - [MMU_PAGE_16K] = { - .shift = 14, - .enc = BOOK3E_PAGESZ_16K, - }, - [MMU_PAGE_64K] = { - .shift = 16, - .ind = 28, - .enc = BOOK3E_PAGESZ_64K, - }, - [MMU_PAGE_1M] = { - .shift = 20, - .enc = BOOK3E_PAGESZ_1M, - }, - [MMU_PAGE_16M] = { - .shift = 24, - .ind = 36, - .enc = BOOK3E_PAGESZ_16M, - }, - [MMU_PAGE_256M] = { - .shift = 28, - .enc = BOOK3E_PAGESZ_256M, - }, - [MMU_PAGE_1G] = { - .shift = 30, - .enc = BOOK3E_PAGESZ_1GB, - }, -}; -#endif /* CONFIG_FSL_BOOKE */ - -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} -#else -static inline int mmu_get_tsize(int psize) -{ - /* This isn't used on !Book3E for now */ - return 0; -} -#endif /* CONFIG_PPC_BOOK3E_MMU */ - -/* The variables below are currently only used on 64-bit Book3E - * though this will probably be made common with other nohash - * implementations at some point - */ -#ifdef CONFIG_PPC64 - -int mmu_linear_psize; /* Page size used for the linear mapping */ -int mmu_pte_psize; /* Page size 
used for PTE pages */ -int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ -int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ -unsigned long linear_map_top; /* Top of linear mapping */ - - -/* - * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug - * exceptions. This is used for bolted and e6500 TLB miss handlers which - * do not modify this SPRG in the TLB miss code; for other TLB miss handlers, - * this is set to zero. - */ -int extlb_level_exc; - -#endif /* CONFIG_PPC64 */ - -#ifdef CONFIG_PPC_FSL_BOOK3E -/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ -DEFINE_PER_CPU(int, next_tlbcam_idx); -EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); -#endif - -/* - * Base TLB flushing operations: - * - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes kernel pages - * - * - local_* variants of page and mm only apply to the current - * processor - */ - -/* - * These are the base non-SMP variants of page and mm flushing - */ -void local_flush_tlb_mm(struct mm_struct *mm) -{ - unsigned int pid; - - preempt_disable(); - pid = mm->context.id; - if (pid != MMU_NO_CONTEXT) - _tlbil_pid(pid); - preempt_enable(); -} -EXPORT_SYMBOL(local_flush_tlb_mm); - -void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - int tsize, int ind) -{ - unsigned int pid; - - preempt_disable(); - pid = mm ? mm->context.id : 0; - if (pid != MMU_NO_CONTEXT) - _tlbil_va(vmaddr, pid, tsize, ind); - preempt_enable(); -} - -void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ - __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_tsize(mmu_virtual_psize), 0); -} -EXPORT_SYMBOL(local_flush_tlb_page); - -/* - * And here are the SMP non-local implementations - */ -#ifdef CONFIG_SMP - -static DEFINE_RAW_SPINLOCK(tlbivax_lock); - -struct tlb_flush_param { - unsigned long addr; - unsigned int pid; - unsigned int tsize; - unsigned int ind; -}; - -static void do_flush_tlb_mm_ipi(void *param) -{ - struct tlb_flush_param *p = param; - - _tlbil_pid(p ? p->pid : 0); -} - -static void do_flush_tlb_page_ipi(void *param) -{ - struct tlb_flush_param *p = param; - - _tlbil_va(p->addr, p->pid, p->tsize, p->ind); -} - - -/* Note on invalidations and PID: - * - * We snapshot the PID with preempt disabled. At this point, it can still - * change either because: - * - our context is being stolen (PID -> NO_CONTEXT) on another CPU - * - we are invaliating some target that isn't currently running here - * and is concurrently acquiring a new PID on another CPU - * - some other CPU is re-acquiring a lost PID for this mm - * etc... - * - * However, this shouldn't be a problem as we only guarantee - * invalidation of TLB entries present prior to this call, so we - * don't care about the PID changing, and invalidating a stale PID - * is generally harmless. - */ - -void flush_tlb_mm(struct mm_struct *mm) -{ - unsigned int pid; - - preempt_disable(); - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; - if (!mm_is_core_local(mm)) { - struct tlb_flush_param p = { .pid = pid }; - /* Ignores smp_processor_id() even if set. 
*/ - smp_call_function_many(mm_cpumask(mm), - do_flush_tlb_mm_ipi, &p, 1); - } - _tlbil_pid(pid); - no_context: - preempt_enable(); -} -EXPORT_SYMBOL(flush_tlb_mm); - -void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, - int tsize, int ind) -{ - struct cpumask *cpu_mask; - unsigned int pid; - - /* - * This function as well as __local_flush_tlb_page() must only be called - * for user contexts. - */ - if (WARN_ON(!mm)) - return; - - preempt_disable(); - pid = mm->context.id; - if (unlikely(pid == MMU_NO_CONTEXT)) - goto bail; - cpu_mask = mm_cpumask(mm); - if (!mm_is_core_local(mm)) { - /* If broadcast tlbivax is supported, use it */ - if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { - int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); - if (lock) - raw_spin_lock(&tlbivax_lock); - _tlbivax_bcast(vmaddr, pid, tsize, ind); - if (lock) - raw_spin_unlock(&tlbivax_lock); - goto bail; - } else { - struct tlb_flush_param p = { - .pid = pid, - .addr = vmaddr, - .tsize = tsize, - .ind = ind, - }; - /* Ignores smp_processor_id() even if set in cpu_mask */ - smp_call_function_many(cpu_mask, - do_flush_tlb_page_ipi, &p, 1); - } - } - _tlbil_va(vmaddr, pid, tsize, ind); - bail: - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) -{ -#ifdef CONFIG_HUGETLB_PAGE - if (vma && is_vm_hugetlb_page(vma)) - flush_hugetlb_page(vma, vmaddr); -#endif - - __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - mmu_get_tsize(mmu_virtual_psize), 0); -} -EXPORT_SYMBOL(flush_tlb_page); - -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_PPC_47x -void __init early_init_mmu_47x(void) -{ -#ifdef CONFIG_SMP - unsigned long root = of_get_flat_dt_root(); - if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) - mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); -#endif /* CONFIG_SMP */ -} -#endif /* CONFIG_PPC_47x */ - -/* - * Flush kernel TLB entries in the given range - */ -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ -#ifdef CONFIG_SMP - preempt_disable(); - smp_call_function(do_flush_tlb_mm_ipi, NULL, 1); - _tlbil_pid(0); - preempt_enable(); -#else - _tlbil_pid(0); -#endif -} -EXPORT_SYMBOL(flush_tlb_kernel_range); - -/* - * Currently, for range flushing, we just do a full mm flush. 
This should - * be optimized based on a threshold on the size of the range, since - * some implementation can stack multiple tlbivax before a tlbsync but - * for now, we keep it that way - */ -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) - -{ - if (end - start == PAGE_SIZE && !(start & ~PAGE_MASK)) - flush_tlb_page(vma, start); - else - flush_tlb_mm(vma->vm_mm); -} -EXPORT_SYMBOL(flush_tlb_range); - -void tlb_flush(struct mmu_gather *tlb) -{ - flush_tlb_mm(tlb->mm); -} - -/* - * Below are functions specific to the 64-bit variant of Book3E though that - * may change in the future - */ - -#ifdef CONFIG_PPC64 - -/* - * Handling of virtual linear page tables or indirect TLB entries - * flushing when PTE pages are freed - */ -void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) -{ - int tsize = mmu_psize_defs[mmu_pte_psize].enc; - - if (book3e_htw_mode != PPC_HTW_NONE) { - unsigned long start = address & PMD_MASK; - unsigned long end = address + PMD_SIZE; - unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; - - /* This isn't the most optimal, ideally we would factor out the - * while preempt & CPU mask mucking around, or even the IPI but - * it will do for now - */ - while (start < end) { - __flush_tlb_page(tlb->mm, start, tsize, 1); - start += size; - } - } else { - unsigned long rmask = 0xf000000000000000ul; - unsigned long rid = (address & rmask) | 0x1000000000000000ul; - unsigned long vpte = address & ~rmask; - -#ifdef CONFIG_PPC_64K_PAGES - vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; -#else - vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; -#endif - vpte |= rid; - __flush_tlb_page(tlb->mm, vpte, tsize, 0); - } -} - -static void setup_page_sizes(void) -{ - unsigned int tlb0cfg; - unsigned int tlb0ps; - unsigned int eptcfg; - int i, psize; - -#ifdef CONFIG_PPC_FSL_BOOK3E - unsigned int mmucfg = mfspr(SPRN_MMUCFG); - int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { - unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); - unsigned int min_pg, max_pg; - - min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; - max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def; - unsigned int shift; - - def = &mmu_psize_defs[psize]; - shift = def->shift; - - if (shift == 0 || shift & 1) - continue; - - /* adjust to be in terms of 4^shift Kb */ - shift = (shift - 10) >> 1; - - if ((shift >= min_pg) && (shift <= max_pg)) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - goto out; - } - - if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { - u32 tlb1cfg, tlb1ps; - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb1cfg = mfspr(SPRN_TLB1CFG); - tlb1ps = mfspr(SPRN_TLB1PS); - eptcfg = mfspr(SPRN_EPTCFG); - - if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) - book3e_htw_mode = PPC_HTW_E6500; - - /* - * We expect 4K subpage size and unrestricted indirect size. - * The lack of a restriction on indirect size is a Freescale - * extension, indicated by PSn = 0 but SPSn != 0. 
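- *
- * (Illustrative: as in the parsing loop above, EPTCFG packs SPS0
- * in bits 0-4 and PS0 in bits 5-9, each as shift - 10, so the
- * value 2 checked below means SPS0 = 2 (4K subpages) with
- * PS0 = 0 (no indirect size restriction).)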
- */ - if (eptcfg != 2) - book3e_htw_mode = PPC_HTW_NONE; - - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (!def->shift) - continue; - - if (tlb1ps & (1U << (def->shift - 10))) { - def->flags |= MMU_PAGE_SIZE_DIRECT; - - if (book3e_htw_mode && psize == MMU_PAGE_2M) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - } - } - - goto out; - } -#endif - - tlb0cfg = mfspr(SPRN_TLB0CFG); - tlb0ps = mfspr(SPRN_TLB0PS); - eptcfg = mfspr(SPRN_EPTCFG); - - /* Look for supported direct sizes */ - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (tlb0ps & (1U << (def->shift - 10))) - def->flags |= MMU_PAGE_SIZE_DIRECT; - } - - /* Indirect page sizes supported ? */ - if ((tlb0cfg & TLBnCFG_IND) == 0 || - (tlb0cfg & TLBnCFG_PT) == 0) - goto out; - - book3e_htw_mode = PPC_HTW_IBM; - - /* Now, we only deal with one IND page size for each - * direct size. Hopefully all implementations today are - * unambiguous, but we might want to be careful in the - * future. - */ - for (i = 0; i < 3; i++) { - unsigned int ps, sps; - - sps = eptcfg & 0x1f; - eptcfg >>= 5; - ps = eptcfg & 0x1f; - eptcfg >>= 5; - if (!ps || !sps) - continue; - for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - - if (ps == (def->shift - 10)) - def->flags |= MMU_PAGE_SIZE_INDIRECT; - if (sps == (def->shift - 10)) - def->ind = ps + 10; - } - } - -out: - /* Cleanup array and print summary */ - pr_info("MMU: Supported page sizes\n"); - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - struct mmu_psize_def *def = &mmu_psize_defs[psize]; - const char *__page_type_names[] = { - "unsupported", - "direct", - "indirect", - "direct & indirect" - }; - if (def->flags == 0) { - def->shift = 0; - continue; - } - pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), - __page_type_names[def->flags & 0x3]); - } -} - -static void setup_mmu_htw(void) -{ - /* - * If we want to use HW tablewalk, enable it by patching the TLB miss - * handlers to branch to the one dedicated to it. - */ - - switch (book3e_htw_mode) { - case PPC_HTW_IBM: - patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); - break; -#ifdef CONFIG_PPC_FSL_BOOK3E - case PPC_HTW_E6500: - extlb_level_exc = EX_TLB_SIZE; - patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); - patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); - break; -#endif - } - pr_info("MMU: Book3E HW tablewalk %s\n", - book3e_htw_mode != PPC_HTW_NONE ? 
"enabled" : "not supported"); -} - -/* - * Early initialization of the MMU TLB code - */ -static void early_init_this_mmu(void) -{ - unsigned int mas4; - - /* Set MAS4 based on page table setting */ - - mas4 = 0x4 << MAS4_WIMGED_SHIFT; - switch (book3e_htw_mode) { - case PPC_HTW_E6500: - mas4 |= MAS4_INDD; - mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; - mas4 |= MAS4_TLBSELD(1); - mmu_pte_psize = MMU_PAGE_2M; - break; - - case PPC_HTW_IBM: - mas4 |= MAS4_INDD; -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_256M; -#else - mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_1M; -#endif - break; - - case PPC_HTW_NONE: -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; -#else - mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; -#endif - mmu_pte_psize = mmu_virtual_psize; - break; - } - mtspr(SPRN_MAS4, mas4); - -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned int num_cams; - int __maybe_unused cpu = smp_processor_id(); - bool map = true; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - - /* - * Only do the mapping once per core, or else the - * transient mapping would cause problems. - */ -#ifdef CONFIG_SMP - if (hweight32(get_tensr()) > 1) - map = false; -#endif - - if (map) - linear_map_top = map_mem_in_cams(linear_map_top, - num_cams, false); - } -#endif - - /* A sync won't hurt us after mucking around with - * the MMU configuration - */ - mb(); -} - -static void __init early_init_mmu_global(void) -{ - /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it. Ideally - * we should find out a suitable page size and patch the - * TLB miss code (either that or use the PACA to store - * the value we want) - */ - mmu_linear_psize = MMU_PAGE_1G; - - /* XXX This should be decided at runtime based on supported - * page sizes in the TLB, but for now let's assume 16M is - * always there and a good fit (which it probably is) - * - * Freescale booke only supports 4K pages in TLB0, so use that. - */ - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) - mmu_vmemmap_psize = MMU_PAGE_4K; - else - mmu_vmemmap_psize = MMU_PAGE_16M; - - /* XXX This code only checks for TLB 0 capabilities and doesn't - * check what page size combos are supported by the HW. It - * also doesn't handle the case where a separate array holds - * the IND entries from the array loaded by the PT. - */ - /* Look for supported page sizes */ - setup_page_sizes(); - - /* Look for HW tablewalk support */ - setup_mmu_htw(); - -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - if (book3e_htw_mode == PPC_HTW_NONE) { - extlb_level_exc = EX_TLB_SIZE; - patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); - patch_exception(0x1e0, - exc_instruction_tlb_miss_bolted_book3e); - } - } -#endif - - /* Set the global containing the top of the linear mapping - * for use by the TLB miss code - */ - linear_map_top = memblock_end_of_DRAM(); -} - -static void __init early_mmu_set_memory_limit(void) -{ -#ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - /* - * Limit memory so we dont have linear faults. - * Unlike memblock_set_current_limit, which limits - * memory available during early boot, this permanently - * reduces the memory available to Linux. We need to - * do this because highmem is not supported on 64-bit. 
- */ - memblock_enforce_memory_limit(linear_map_top); - } -#endif - - memblock_set_current_limit(linear_map_top); -} - -/* boot cpu only */ -void __init early_init_mmu(void) -{ - early_init_mmu_global(); - early_init_this_mmu(); - early_mmu_set_memory_limit(); -} - -void early_init_mmu_secondary(void) -{ - early_init_this_mmu(); -} - -void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* On non-FSL Embedded 64-bit, we adjust the RMA size to match - * the bolted TLB entry. We know for now that only 1G - * entries are supported though that may eventually - * change. - * - * on FSL Embedded 64-bit, usually all RAM is bolted, but with - * unusual memory sizes it's possible for some RAM to not be mapped - * (such RAM is not used at all by Linux, since we don't support - * highmem on 64-bit). We limit ppc64_rma_size to what would be - * mappable if this memblock is the only one. Additional memblocks - * can only increase, not decrease, the amount that ends up getting - * mapped. We still limit max to 1G even if we'll eventually map - * more. This is due to what the early init code is set up to do. - * - * We crop it to the size of the first MEMBLOCK to - * avoid going over total available memory just in case... - */ -#ifdef CONFIG_PPC_FSL_BOOK3E - if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { - unsigned long linear_sz; - unsigned int num_cams; - - /* use a quarter of the TLBCAM for bolted linear map */ - num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - - linear_sz = map_mem_in_cams(first_memblock_size, num_cams, - true); - - ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); - } else -#endif - ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); - - /* Finally limit subsequent allocations */ - memblock_set_current_limit(first_memblock_base + ppc64_rma_size); -} -#else /* ! CONFIG_PPC64 */ -void __init early_init_mmu(void) -{ -#ifdef CONFIG_PPC_47x - early_init_mmu_47x(); -#endif - -#ifdef CONFIG_PPC_MM_SLICES -#if defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif -#endif -} -#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S deleted file mode 100644 index e066a658acac..000000000000 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ /dev/null @@ -1,491 +0,0 @@ -/* - * This file contains low-level functions for performing various - * types of TLB invalidations on various processors with no hash - * table. - * - * This file implements the following functions for all no-hash - * processors. Some aren't implemented for some variants. Some - * are inline in tlbflush.h - * - * - tlbil_va - * - tlbil_pid - * - tlbil_all - * - tlbivax_bcast - * - * Code mostly moved over from misc_32.S - * - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Partially rewritten by Cort Dougan (cort@cs.nmt.edu) - * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_40x) - -/* - * 40x implementation needs only tlbil_va - */ -_GLOBAL(__tlbil_va) - /* We run the search with interrupts disabled because we have to change - * the PID and I don't want to preempt when that happens. - */ - mfmsr r5 - mfspr r6,SPRN_PID - wrteei 0 - mtspr SPRN_PID,r4 - tlbsx. r3, 0, r3 - mtspr SPRN_PID,r6 - wrtee r5 - bne 1f - sync - /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is - * clear. Since 25 is the V bit in the TLB_TAG, loading this value - * will invalidate the TLB entry. */ - tlbwe r3, r3, TLB_TAG - isync -1: blr - -#elif defined(CONFIG_PPC_8xx) - -/* - * Nothing to do for 8xx, everything is inline - */ - -#elif defined(CONFIG_44x) /* Includes 47x */ - -/* - * 440 implementation uses tlbsx/we for tlbil_va and a full sweep - * of the TLB for everything else. - */ -_GLOBAL(__tlbil_va) - mfspr r5,SPRN_MMUCR - mfmsr r10 - - /* - * We write 16 bits of STID since 47x supports that much, we - * will never be passed out of bounds values on 440 (hopefully) - */ - rlwimi r5,r4,0,16,31 - - /* We have to run the search with interrupts disabled, otherwise - * an interrupt which causes a TLB miss can clobber the MMUCR - * between the mtspr and the tlbsx. - * - * Critical and Machine Check interrupts take care of saving - * and restoring MMUCR, so only normal interrupts have to be - * taken care of. - */ - wrteei 0 - mtspr SPRN_MMUCR,r5 - tlbsx. r6,0,r3 - bne 10f - sync -BEGIN_MMU_FTR_SECTION - b 2f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) - /* On 440 There are only 64 TLB entries, so r3 < 64, which means bit - * 22, is clear. Since 22 is the V bit in the TLB_PAGEID, loading this - * value will invalidate the TLB entry. - */ - tlbwe r6,r6,PPC44x_TLB_PAGEID - isync -10: wrtee r10 - blr -2: -#ifdef CONFIG_PPC_47x - oris r7,r6,0x8000 /* specify way explicitly */ - clrrwi r4,r3,12 /* get an EPN for the hashing with V = 0 */ - ori r4,r4,PPC47x_TLBE_SIZE - tlbwe r4,r7,0 /* write it */ - isync - wrtee r10 - blr -#else /* CONFIG_PPC_47x */ -1: trap - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; -#endif /* !CONFIG_PPC_47x */ - -_GLOBAL(_tlbil_all) -_GLOBAL(_tlbil_pid) -BEGIN_MMU_FTR_SECTION - b 2f -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) - li r3,0 - sync - - /* Load high watermark */ - lis r4,tlb_44x_hwater@ha - lwz r5,tlb_44x_hwater@l(r4) - -1: tlbwe r3,r3,PPC44x_TLB_PAGEID - addi r3,r3,1 - cmpw 0,r3,r5 - ble 1b - - isync - blr -2: -#ifdef CONFIG_PPC_47x - /* 476 variant. There's not simple way to do this, hopefully we'll - * try to limit the amount of such full invalidates - */ - mfmsr r11 /* Interrupts off */ - wrteei 0 - li r3,-1 /* Current set */ - lis r10,tlb_47x_boltmap@h - ori r10,r10,tlb_47x_boltmap@l - lis r7,0x8000 /* Specify way explicitly */ - - b 9f /* For each set */ - -1: li r9,4 /* Number of ways */ - li r4,0 /* Current way */ - li r6,0 /* Default entry value 0 */ - andi. r0,r8,1 /* Check if way 0 is bolted */ - mtctr r9 /* Load way counter */ - bne- 3f /* Bolted, skip loading it */ - -2: /* For each way */ - or r5,r3,r4 /* Make way|index for tlbre */ - rlwimi r5,r5,16,8,15 /* Copy index into position */ - tlbre r6,r5,0 /* Read entry */ -3: addis r4,r4,0x2000 /* Next way */ - andi. r0,r6,PPC47x_TLB0_VALID /* Valid entry ? 
*/ - beq 4f /* Nope, skip it */ - rlwimi r7,r5,0,1,2 /* Insert way number */ - rlwinm r6,r6,0,21,19 /* Clear V */ - tlbwe r6,r7,0 /* Write it */ -4: bdnz 2b /* Loop for each way */ - srwi r8,r8,1 /* Next boltmap bit */ -9: cmpwi cr1,r3,255 /* Last set done ? */ - addi r3,r3,1 /* Next set */ - beq cr1,1f /* End of loop */ - andi. r0,r3,0x1f /* Need to load a new boltmap word ? */ - bne 1b /* No, loop */ - lwz r8,0(r10) /* Load boltmap entry */ - addi r10,r10,4 /* Next word */ - b 1b /* Then loop */ -1: isync /* Sync shadows */ - wrtee r11 -#else /* CONFIG_PPC_47x */ -1: trap - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; -#endif /* !CONFIG_PPC_47x */ - blr - -#ifdef CONFIG_PPC_47x - -/* - * _tlbivax_bcast is only on 47x. We don't bother doing a runtime - * check though, it will blow up soon enough if we mistakenly try - * to use it on a 440. - */ -_GLOBAL(_tlbivax_bcast) - mfspr r5,SPRN_MMUCR - mfmsr r10 - rlwimi r5,r4,0,16,31 - wrteei 0 - mtspr SPRN_MMUCR,r5 - isync - PPC_TLBIVAX(0, R3) - isync - eieio - tlbsync -BEGIN_FTR_SECTION - b 1f -END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) - sync - wrtee r10 - blr -/* - * DD2 HW could hang if in instruction fetch happens before msync completes. - * Touch enough instruction cache lines to ensure cache hits - */ -1: mflr r9 - bl 2f -2: mflr r6 - li r7,32 - PPC_ICBT(0,R6,R7) /* touch next cache line */ - add r6,r6,r7 - PPC_ICBT(0,R6,R7) /* touch next cache line */ - add r6,r6,r7 - PPC_ICBT(0,R6,R7) /* touch next cache line */ - sync - nop - nop - nop - nop - nop - nop - nop - nop - mtlr r9 - wrtee r10 - blr -#endif /* CONFIG_PPC_47x */ - -#elif defined(CONFIG_FSL_BOOKE) -/* - * FSL BookE implementations. - * - * Since feature sections are using _SECTION_ELSE we need - * to have the larger code path before the _SECTION_ELSE - */ - -/* - * Flush MMU TLB on the local processor - */ -_GLOBAL(_tlbil_all) -BEGIN_MMU_FTR_SECTION - li r3,(MMUCSR0_TLBFI)@l - mtspr SPRN_MMUCSR0, r3 -1: - mfspr r3,SPRN_MMUCSR0 - andi. r3,r3,MMUCSR0_TLBFI@l - bne 1b -MMU_FTR_SECTION_ELSE - PPC_TLBILX_ALL(0,R0) -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) - msync - isync - blr - -_GLOBAL(_tlbil_pid) -BEGIN_MMU_FTR_SECTION - slwi r3,r3,16 - mfmsr r10 - wrteei 0 - mfspr r4,SPRN_MAS6 /* save MAS6 */ - mtspr SPRN_MAS6,r3 - PPC_TLBILX_PID(0,R0) - mtspr SPRN_MAS6,r4 /* restore MAS6 */ - wrtee r10 -MMU_FTR_SECTION_ELSE - li r3,(MMUCSR0_TLBFI)@l - mtspr SPRN_MMUCSR0, r3 -1: - mfspr r3,SPRN_MMUCSR0 - andi. r3,r3,MMUCSR0_TLBFI@l - bne 1b -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX) - msync - isync - blr - -/* - * Flush MMU TLB for a particular address, but only on the local processor - * (no broadcast) - */ -_GLOBAL(__tlbil_va) - mfmsr r10 - wrteei 0 - slwi r4,r4,16 - ori r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l - mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ -BEGIN_MMU_FTR_SECTION - tlbsx 0,r3 - mfspr r4,SPRN_MAS1 /* check valid */ - andis. 
r3,r4,MAS1_VALID@h - beq 1f - rlwinm r4,r4,0,1,31 - mtspr SPRN_MAS1,r4 - tlbwe -MMU_FTR_SECTION_ELSE - PPC_TLBILX_VA(0,R3) -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) - msync - isync -1: wrtee r10 - blr -#elif defined(CONFIG_PPC_BOOK3E) -/* - * New Book3E (>= 2.06) implementation - * - * Note: We may be able to get away without the interrupt masking stuff - * if we save/restore MAS6 on exceptions that might modify it - */ -_GLOBAL(_tlbil_pid) - slwi r4,r3,MAS6_SPID_SHIFT - mfmsr r10 - wrteei 0 - mtspr SPRN_MAS6,r4 - PPC_TLBILX_PID(0,R0) - wrtee r10 - msync - isync - blr - -_GLOBAL(_tlbil_pid_noind) - slwi r4,r3,MAS6_SPID_SHIFT - mfmsr r10 - ori r4,r4,MAS6_SIND - wrteei 0 - mtspr SPRN_MAS6,r4 - PPC_TLBILX_PID(0,R0) - wrtee r10 - msync - isync - blr - -_GLOBAL(_tlbil_all) - PPC_TLBILX_ALL(0,R0) - msync - isync - blr - -_GLOBAL(_tlbil_va) - mfmsr r10 - wrteei 0 - cmpwi cr0,r6,0 - slwi r4,r4,MAS6_SPID_SHIFT - rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK - beq 1f - rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND -1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ - PPC_TLBILX_VA(0,R3) - msync - isync - wrtee r10 - blr - -_GLOBAL(_tlbivax_bcast) - mfmsr r10 - wrteei 0 - cmpwi cr0,r6,0 - slwi r4,r4,MAS6_SPID_SHIFT - rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK - beq 1f - rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND -1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ - PPC_TLBIVAX(0,R3) - eieio - tlbsync - sync - wrtee r10 - blr - -_GLOBAL(set_context) -#ifdef CONFIG_BDI_SWITCH - /* Context switch the PTE pointer for the Abatron BDI2000. - * The PGDIR is the second parameter. - */ - lis r5, abatron_pteptrs@h - ori r5, r5, abatron_pteptrs@l - stw r4, 0x4(r5) -#endif - mtspr SPRN_PID,r3 - isync /* Force context change */ - blr -#else -#error Unsupported processor type ! -#endif - -#if defined(CONFIG_PPC_FSL_BOOK3E) -/* - * extern void loadcam_entry(unsigned int index) - * - * Load TLBCAM[index] entry in to the L2 CAM MMU - * Must preserve r7, r8, r9, and r10 - */ -_GLOBAL(loadcam_entry) - mflr r5 - LOAD_REG_ADDR_PIC(r4, TLBCAM) - mtlr r5 - mulli r5,r3,TLBCAM_SIZE - add r3,r5,r4 - lwz r4,TLBCAM_MAS0(r3) - mtspr SPRN_MAS0,r4 - lwz r4,TLBCAM_MAS1(r3) - mtspr SPRN_MAS1,r4 - PPC_LL r4,TLBCAM_MAS2(r3) - mtspr SPRN_MAS2,r4 - lwz r4,TLBCAM_MAS3(r3) - mtspr SPRN_MAS3,r4 -BEGIN_MMU_FTR_SECTION - lwz r4,TLBCAM_MAS7(r3) - mtspr SPRN_MAS7,r4 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) - isync - tlbwe - isync - blr - -/* - * Load multiple TLB entries at once, using an alternate-space - * trampoline so that we don't have to care about whether the same - * TLB entry maps us before and after. - * - * r3 = first entry to write - * r4 = number of entries to write - * r5 = temporary tlb entry - */ -_GLOBAL(loadcam_multi) - mflr r8 - - /* - * Set up temporary TLB entry that is the same as what we're - * running from, but in AS=1. - */ - bl 1f -1: mflr r6 - tlbsx 0,r8 - mfspr r6,SPRN_MAS1 - ori r6,r6,MAS1_TS - mtspr SPRN_MAS1,r6 - mfspr r6,SPRN_MAS0 - rlwimi r6,r5,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK - mr r7,r5 - mtspr SPRN_MAS0,r6 - isync - tlbwe - isync - - /* Switch to AS=1 */ - mfmsr r6 - ori r6,r6,MSR_IS|MSR_DS - mtmsr r6 - isync - - mr r9,r3 - add r10,r3,r4 -2: bl loadcam_entry - addi r9,r9,1 - cmpw r9,r10 - mr r3,r9 - blt 2b - - /* Return to AS=0 and clear the temporary entry */ - mfmsr r6 - rlwinm. 
r6,r6,0,~(MSR_IS|MSR_DS) - mtmsr r6 - isync - - li r6,0 - mtspr SPRN_MAS1,r6 - rlwinm r6,r7,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK - oris r6,r6,MAS0_TLBSEL(1)@h - mtspr SPRN_MAS0,r6 - isync - tlbwe - isync - - mtlr r8 - blr -#endif -- cgit v1.2.3-59-g8ed1b From 5ba666d56c4ff9b011c1b029dcc689cff8b176fb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:27 +0000 Subject: powerpc/mm: fix erroneous duplicate slb_addr_limit init Commit 67fda38f0d68 ("powerpc/mm: Move slb_addr_linit to early_init_mmu") moved slb_addr_limit init out of setup_arch(). Commit 701101865f5d ("powerpc/mm: Reduce memory usage for mm_context_t for radix") brought it back into setup_arch() by mistake. This patch reverts that regression. Fixes: 701101865f5d ("powerpc/mm: Reduce memory usage for mm_context_t for radix") Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 70dc10aa0ccf..19d68a9b5f37 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -950,12 +950,6 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; -#ifdef CONFIG_PPC_MM_SLICES -#if defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif -#endif - #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif -- cgit v1.2.3-59-g8ed1b From 02f89aed6b829d73980bb633d9f4e3de9eb45543 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:28 +0000 Subject: powerpc/mm: no slice for nohash/64 Only nohash/32 and book3s/64 support mm slices.
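The resulting header dispatch is easy to picture. A minimal sketch of asm/slice.h after this patch, with the include targets inferred from the diffstat below (the bracketed paths were stripped from this listing):

#ifdef CONFIG_PPC_BOOK3S_64
#include <asm/book3s/64/slice.h>
#elif defined(CONFIG_PPC_MMU_NOHASH_32)
#include <asm/nohash/32/slice.h>
#endif

The nohash/64 header is deleted outright, and the new PPC_MMU_NOHASH_32 symbol keys the remaining nohash case to 32-bit builds only.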
Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/64/slice.h | 12 ------------ arch/powerpc/include/asm/slice.h | 4 +--- arch/powerpc/platforms/Kconfig.cputype | 4 ++++ 3 files changed, 5 insertions(+), 15 deletions(-) delete mode 100644 arch/powerpc/include/asm/nohash/64/slice.h (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/64/slice.h b/arch/powerpc/include/asm/nohash/64/slice.h deleted file mode 100644 index ad0d6e3cc1c5..000000000000 --- a/arch/powerpc/include/asm/nohash/64/slice.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_64_SLICE_H -#define _ASM_POWERPC_NOHASH_64_SLICE_H - -#ifdef CONFIG_PPC_64K_PAGES -#define get_slice_psize(mm, addr) MMU_PAGE_64K -#else /* CONFIG_PPC_64K_PAGES */ -#define get_slice_psize(mm, addr) MMU_PAGE_4K -#endif /* !CONFIG_PPC_64K_PAGES */ -#define slice_set_user_psize(mm, psize) do { BUG(); } while (0) - -#endif /* _ASM_POWERPC_NOHASH_64_SLICE_H */ diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index 44816cbc4198..be8af667098f 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -4,9 +4,7 @@ #ifdef CONFIG_PPC_BOOK3S_64 #include -#elif defined(CONFIG_PPC64) -#include -#elif defined(CONFIG_PPC_MMU_NOHASH) +#elif defined(CONFIG_PPC_MMU_NOHASH_32) #include #endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 60a7c7095b05..cd28b045c0f3 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -391,6 +391,10 @@ config PPC_MMU_NOHASH def_bool y depends on !PPC_BOOK3S +config PPC_MMU_NOHASH_32 + def_bool y + depends on PPC_MMU_NOHASH && PPC32 + config PPC_BOOK3E_MMU def_bool y depends on FSL_BOOKE || PPC_BOOK3E -- cgit v1.2.3-59-g8ed1b From 6f60cc98df2be7f082bd786aa824ceabd24d24cb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:29 +0000 Subject: powerpc/mm: hand a context_t over to slice_mask_for_size() instead of mm_struct slice_mask_for_size() only uses mm->context, so hand it a pointer to the context directly.
This will help moving the function in subarch mmu.h in the next patch by avoiding having to include the definition of struct mm_struct Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 35b278082391..8eb7e8b09c75 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -151,32 +151,32 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, } #ifdef CONFIG_PPC_BOOK3S_64 -static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) +static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) - return mm_ctx_slice_mask_64k(&mm->context); + return mm_ctx_slice_mask_64k(&ctx); #endif if (psize == MMU_PAGE_4K) - return mm_ctx_slice_mask_4k(&mm->context); + return mm_ctx_slice_mask_4k(&ctx); #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_16M) - return mm_ctx_slice_mask_16m(&mm->context); + return mm_ctx_slice_mask_16m(&ctx); if (psize == MMU_PAGE_16G) - return mm_ctx_slice_mask_16g(&mm->context); + return mm_ctx_slice_mask_16g(&ctx); #endif BUG(); } #elif defined(CONFIG_PPC_8xx) -static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) +static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { if (psize == mmu_virtual_psize) - return &mm->context.mask_base_psize; + return &ctx->mask_base_psize; #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_512K) - return &mm->context.mask_512k; + return &ctx->mask_512k; if (psize == MMU_PAGE_8M) - return &mm->context.mask_8m; + return &ctx->mask_8m; #endif BUG(); } @@ -246,7 +246,7 @@ static void slice_convert(struct mm_struct *mm, slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); slice_print_mask(" mask", mask); - psize_mask = slice_mask_for_size(mm, psize); + psize_mask = slice_mask_for_size(&mm->context, psize); /* We need to use a spinlock here to protect against * concurrent 64k -> 4k demotion ... @@ -263,7 +263,7 @@ static void slice_convert(struct mm_struct *mm, /* Update the slice_mask */ old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(mm, old_psize); + old_mask = slice_mask_for_size(&mm->context, old_psize); old_mask->low_slices &= ~(1u << i); psize_mask->low_slices |= 1u << i; @@ -282,7 +282,7 @@ static void slice_convert(struct mm_struct *mm, /* Update the slice_mask */ old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(mm, old_psize); + old_mask = slice_mask_for_size(&mm->context, old_psize); __clear_bit(i, old_mask->high_slices); __set_bit(i, psize_mask->high_slices); @@ -538,7 +538,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - maskp = slice_mask_for_size(mm, psize); + maskp = slice_mask_for_size(&mm->context, psize); /* * Here "good" means slices that are already the right page size, @@ -565,7 +565,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * a pointer to good mask for the next code to use. 
*/ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { - compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); if (fixed) slice_or_mask(&good_mask, maskp, compat_maskp); else @@ -760,7 +760,7 @@ void slice_init_new_context_exec(struct mm_struct *mm) /* * Slice mask cache starts zeroed, fill the default size cache. */ - mask = slice_mask_for_size(mm, psize); + mask = slice_mask_for_size(&mm->context, psize); mask->low_slices = ~0UL; if (SLICE_NUM_HIGH) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); @@ -819,14 +819,14 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, VM_BUG_ON(radix_enabled()); - maskp = slice_mask_for_size(mm, psize); + maskp = slice_mask_for_size(&mm->context, psize); #ifdef CONFIG_PPC_64K_PAGES /* We need to account for 4k slices too */ if (psize == MMU_PAGE_64K) { const struct slice_mask *compat_maskp; struct slice_mask available; - compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); slice_or_mask(&available, maskp, compat_maskp); return !slice_check_range_fits(mm, &available, addr, len); } -- cgit v1.2.3-59-g8ed1b From fca5c1e9eb5e263c1b4def0b5ae4ce5b2e1a9877 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:30 +0000 Subject: powerpc/mm: move slice_mask_for_size() into mmu.h Move slice_mask_for_size() into subarch mmu.h Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V [mpe: Retain the BUG_ON()s, rather than converting to VM_BUG_ON()] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 17 +++++++++++ arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 42 +++++++++++++++++++--------- arch/powerpc/mm/slice.c | 34 ---------------------- 3 files changed, 46 insertions(+), 47 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 230a9dec7677..a6d5b5ed1170 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -203,6 +203,23 @@ static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) } #endif +static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) +{ +#ifdef CONFIG_PPC_64K_PAGES + if (psize == MMU_PAGE_64K) + return mm_ctx_slice_mask_64k(&ctx); +#endif +#ifdef CONFIG_HUGETLB_PAGE + if (psize == MMU_PAGE_16M) + return mm_ctx_slice_mask_16m(&ctx); + if (psize == MMU_PAGE_16G) + return mm_ctx_slice_mask_16g(&ctx); +#endif + BUG_ON(psize != MMU_PAGE_4K); + + return mm_ctx_slice_mask_4k(&ctx); +} + #ifdef CONFIG_PPC_SUBPAGE_PROT static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index c503e2f05e61..114f50d995dc 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -184,7 +184,23 @@ #define LOW_SLICE_ARRAY_SZ SLICE_ARRAY_SIZE #endif +#if defined(CONFIG_PPC_4K_PAGES) +#define mmu_virtual_psize MMU_PAGE_4K +#elif defined(CONFIG_PPC_16K_PAGES) +#define mmu_virtual_psize MMU_PAGE_16K +#define PTE_FRAG_NR 4 +#define PTE_FRAG_SIZE_SHIFT 12 +#define PTE_FRAG_SIZE (1UL << 12) +#else +#error "Unsupported PAGE_SIZE" +#endif + +#define mmu_linear_psize MMU_PAGE_8M + #ifndef __ASSEMBLY__ + +#include + struct slice_mask { u64 low_slices; DECLARE_BITMAP(high_slices, 0); @@ -255,6 +271,19 @@ static inline 
struct slice_mask *mm_ctx_slice_mask_8m(mm_context_t *ctx) return &ctx->mask_8m; } #endif + +static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (psize == MMU_PAGE_512K) + return &ctx->mask_512k; + if (psize == MMU_PAGE_8M) + return &ctx->mask_8m; +#endif + BUG_ON(psize != mmu_virtual_psize); + + return &ctx->mask_base_psize; +} #endif /* CONFIG_PPC_MM_SLICE */ #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000) @@ -306,17 +335,4 @@ extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf; #endif /* !__ASSEMBLY__ */ -#if defined(CONFIG_PPC_4K_PAGES) -#define mmu_virtual_psize MMU_PAGE_4K -#elif defined(CONFIG_PPC_16K_PAGES) -#define mmu_virtual_psize MMU_PAGE_16K -#define PTE_FRAG_NR 4 -#define PTE_FRAG_SIZE_SHIFT 12 -#define PTE_FRAG_SIZE (1UL << 12) -#else -#error "Unsupported PAGE_SIZE" -#endif - -#define mmu_linear_psize MMU_PAGE_8M - #endif /* _ASM_POWERPC_MMU_8XX_H_ */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 8eb7e8b09c75..31de91b65a64 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -150,40 +150,6 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, __set_bit(i, ret->high_slices); } -#ifdef CONFIG_PPC_BOOK3S_64 -static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) -{ -#ifdef CONFIG_PPC_64K_PAGES - if (psize == MMU_PAGE_64K) - return mm_ctx_slice_mask_64k(&ctx); -#endif - if (psize == MMU_PAGE_4K) - return mm_ctx_slice_mask_4k(&ctx); -#ifdef CONFIG_HUGETLB_PAGE - if (psize == MMU_PAGE_16M) - return mm_ctx_slice_mask_16m(&ctx); - if (psize == MMU_PAGE_16G) - return mm_ctx_slice_mask_16g(&ctx); -#endif - BUG(); -} -#elif defined(CONFIG_PPC_8xx) -static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) -{ - if (psize == mmu_virtual_psize) - return &ctx->mask_base_psize; -#ifdef CONFIG_HUGETLB_PAGE - if (psize == MMU_PAGE_512K) - return &ctx->mask_512k; - if (psize == MMU_PAGE_8M) - return &ctx->mask_8m; -#endif - BUG(); -} -#else -#error "Must define the slice masks for page sizes supported by the platform" -#endif - static bool slice_check_range_fits(struct mm_struct *mm, const struct slice_mask *available, unsigned long start, unsigned long len) -- cgit v1.2.3-59-g8ed1b From 877461210ea1c92f159bf261924e58d7d27edadc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:31 +0000 Subject: powerpc/mm: get rid of mm_ctx_slice_mask_xxx() Now that slice_mask_for_size() is in mmu.h, the mm_ctx_slice_mask_xxx() are not needed anymore, so drop them. Note that the 8xx ones were not used anyway.
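Call sites are untouched by this cleanup; they keep going through slice_mask_for_size(), as in this sketch lifted from the existing slice.c callers:

	/* the context, not the mm_struct, is all the lookup needs */
	struct slice_mask *maskp = slice_mask_for_size(&mm->context, psize);
	struct slice_mask *compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K);

Only the helper's body changes: it dereferences the mask fields directly instead of bouncing through the one-line accessors.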
Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 32 ++++------------------------ arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 17 --------------- 2 files changed, 4 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index a6d5b5ed1170..51b2d60efc1b 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -179,45 +179,21 @@ static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long li ctx->hash_context->slb_addr_limit = limit; } -#ifdef CONFIG_PPC_64K_PAGES -static inline struct slice_mask *mm_ctx_slice_mask_64k(mm_context_t *ctx) -{ - return &ctx->hash_context->mask_64k; -} -#endif - -static inline struct slice_mask *mm_ctx_slice_mask_4k(mm_context_t *ctx) -{ - return &ctx->hash_context->mask_4k; -} - -#ifdef CONFIG_HUGETLB_PAGE -static inline struct slice_mask *mm_ctx_slice_mask_16m(mm_context_t *ctx) -{ - return &ctx->hash_context->mask_16m; -} - -static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) -{ - return &ctx->hash_context->mask_16g; -} -#endif - static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) - return mm_ctx_slice_mask_64k(&ctx); + return &ctx->hash_context->mask_64k; #endif #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_16M) - return mm_ctx_slice_mask_16m(&ctx); + return &ctx->hash_context->mask_16m; if (psize == MMU_PAGE_16G) - return mm_ctx_slice_mask_16g(&ctx); + return &ctx->hash_context->mask_16g; #endif BUG_ON(psize != MMU_PAGE_4K); - return mm_ctx_slice_mask_4k(&ctx); + return &ctx->hash_context->mask_4k; } #ifdef CONFIG_PPC_SUBPAGE_PROT diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 114f50d995dc..77ccf7cb6fcc 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -255,23 +255,6 @@ static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long li ctx->slb_addr_limit = limit; } -static inline struct slice_mask *mm_ctx_slice_mask_base(mm_context_t *ctx) -{ - return &ctx->mask_base_psize; -} - -#ifdef CONFIG_HUGETLB_PAGE -static inline struct slice_mask *mm_ctx_slice_mask_512k(mm_context_t *ctx) -{ - return &ctx->mask_512k; -} - -static inline struct slice_mask *mm_ctx_slice_mask_8m(mm_context_t *ctx) -{ - return &ctx->mask_8m; -} -#endif - static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3-59-g8ed1b From b4baad0b2712471740c58a1bc9578ab057af7514 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:32 +0000 Subject: powerpc/mm: remove unnecessary #ifdef CONFIG_PPC64 For PPC32 that's a noop, gcc should be smart enough to ignore it. 
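The argument is plain constant folding. A stand-alone illustration, assuming (as the cast in the hunk below suggests) that SLICE_LOW_TOP truncates to 0 in a 32-bit build:

	/*
	 * On PPC32 the store writes back the value start already holds,
	 * so the compiler drops the whole test; on PPC64 the constant is
	 * non-zero and the test survives, which is what the removed
	 * #ifdef used to express.
	 */
	if (start == 0)
		start = (unsigned long)SLICE_LOW_TOP;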
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 31de91b65a64..840c4118a185 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -118,13 +118,11 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) unsigned long start = slice << SLICE_HIGH_SHIFT; unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); -#ifdef CONFIG_PPC64 /* Hack, so that each addresses is controlled by exactly one * of the high or low area bitmaps, the first high area starts * at 4GB, not 0 */ if (start == 0) - start = SLICE_LOW_TOP; -#endif + start = (unsigned long)SLICE_LOW_TOP; return !slice_area_is_free(mm, start, end - start); } -- cgit v1.2.3-59-g8ed1b From 203a1fa6286671900698485ddffbb435901aa75b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:33 +0000 Subject: powerpc/mm: remove a couple of #ifdef CONFIG_PPC_64K_PAGES in mm/slice.c This patch replaces a couple of #ifdef CONFIG_PPC_64K_PAGES by IS_ENABLED(CONFIG_PPC_64K_PAGES) to improve code maintainability. Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 840c4118a185..ace97d953040 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -606,14 +606,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, newaddr = slice_find_area(mm, len, &potential_mask, psize, topdown, high_limit); -#ifdef CONFIG_PPC_64K_PAGES - if (newaddr == -ENOMEM && psize == MMU_PAGE_64K) { + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM && + psize == MMU_PAGE_64K) { /* retry the search with 4k-page slices included */ slice_or_mask(&potential_mask, &potential_mask, compat_maskp); newaddr = slice_find_area(mm, len, &potential_mask, psize, topdown, high_limit); } -#endif if (newaddr == -ENOMEM) return -ENOMEM; @@ -784,9 +783,9 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, VM_BUG_ON(radix_enabled()); maskp = slice_mask_for_size(&mm->context, psize); -#ifdef CONFIG_PPC_64K_PAGES + /* We need to account for 4k slices too */ - if (psize == MMU_PAGE_64K) { + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { const struct slice_mask *compat_maskp; struct slice_mask available; @@ -794,7 +793,6 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, slice_or_mask(&available, maskp, compat_maskp); return !slice_check_range_fits(mm, &available, addr, len); } -#endif return !slice_check_range_fits(mm, maskp, addr, len); } -- cgit v1.2.3-59-g8ed1b From 33f128c64919736164e70eb024da3ae5e5768cd6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:34 +0000 Subject: powerpc/8xx: get rid of #ifdef CONFIG_HUGETLB_PAGE for slices The 8xx only selects CONFIG_PPC_MM_SLICES when CONFIG_HUGETLB_PAGE is set. 
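Since that Kconfig invariant is what makes the guard dead, a hypothetical compile-time check in the 8xx header could spell the reasoning out (not part of the patch, shown for illustration only):

#if defined(CONFIG_PPC_MM_SLICES) && !defined(CONFIG_HUGETLB_PAGE)
#error "on 8xx, PPC_MM_SLICES implies HUGETLB_PAGE"
#endif

With the implication guaranteed, the #ifdef around the 512K and 8M cases adds nothing and the helper reads straight through to the BUG_ON().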
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 77ccf7cb6fcc..76af5b0cb16e 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -216,10 +216,8 @@ typedef struct { unsigned char high_slices_psize[0]; unsigned long slb_addr_limit; struct slice_mask mask_base_psize; /* 4k or 16k */ -# ifdef CONFIG_HUGETLB_PAGE struct slice_mask mask_512k; struct slice_mask mask_8m; -# endif #endif void *pte_frag; } mm_context_t; @@ -257,12 +255,11 @@ static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long li static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { -#ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_512K) return &ctx->mask_512k; if (psize == MMU_PAGE_8M) return &ctx->mask_8m; -#endif + BUG_ON(psize != mmu_virtual_psize); return &ctx->mask_base_psize; -- cgit v1.2.3-59-g8ed1b From 43ed7909d70a61c621cadb5d808dc392ad537e5a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:35 +0000 Subject: powerpc/mm: define get_slice_psize() all the time get_slice_psize() can be defined regardless of CONFIG_PPC_MM_SLICES to avoid ifdefs Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/slice.h | 5 +++++ arch/powerpc/mm/hugetlbpage.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index be8af667098f..c6f466f4c241 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -36,6 +36,11 @@ void slice_setup_new_exec(void); static inline void slice_init_new_context_exec(struct mm_struct *mm) {} +static inline unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + #endif /* CONFIG_PPC_MM_SLICES */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9e732bb2c84a..5f67e7a4d1cc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -578,14 +578,12 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { -#ifdef CONFIG_PPC_MM_SLICES /* With radix we don't use slice, so derive it from vma*/ - if (!radix_enabled()) { + if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); return 1UL << mmu_psize_to_shift(psize); } -#endif return vma_kernel_pagesize(vma); } -- cgit v1.2.3-59-g8ed1b From 5953fb4f4671d7d755a81017a76766c00922d059 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:36 +0000 Subject: powerpc/mm: define subarch SLB_ADDR_LIMIT_DEFAULT This patch defines a subarch specific SLB_ADDR_LIMIT_DEFAULT to remove the #ifdefs around the setup of mm->context.slb_addr_limit It also generalises the use of mm_ctx_set_slb_addr_limit() helper. 
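A condensed sketch of the resulting shape; the real defines live in the two subarch slice.h headers and are collapsed into one conditional here for illustration:

#ifdef CONFIG_PPC_BOOK3S_64
#define SLB_ADDR_LIMIT_DEFAULT	DEFAULT_MAP_WINDOW_USER64
#else /* nohash/32 */
#define SLB_ADDR_LIMIT_DEFAULT	DEFAULT_MAP_WINDOW
#endif

	/* every init path then uses the same call */
	mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT);

No caller needs to know which window constant its subarch picked.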
Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/slice.h | 2 ++ arch/powerpc/include/asm/nohash/32/slice.h | 2 ++ arch/powerpc/mm/book3s64/hash_utils.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 4 +--- arch/powerpc/mm/slice.c | 6 +----- 5 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index 062e11136e9c..f0d3194ba41b 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -11,4 +11,6 @@ #define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) +#define SLB_ADDR_LIMIT_DEFAULT DEFAULT_MAP_WINDOW_USER64 + #endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */ diff --git a/arch/powerpc/include/asm/nohash/32/slice.h b/arch/powerpc/include/asm/nohash/32/slice.h index 777d62e40ac0..39eb0154ae2d 100644 --- a/arch/powerpc/include/asm/nohash/32/slice.h +++ b/arch/powerpc/include/asm/nohash/32/slice.h @@ -13,6 +13,8 @@ #define SLICE_NUM_HIGH 0ul #define GET_HIGH_SLICE_INDEX(addr) (addr & 0) +#define SLB_ADDR_LIMIT_DEFAULT DEFAULT_MAP_WINDOW + #endif /* CONFIG_PPC_MM_SLICES */ #endif /* _ASM_POWERPC_NOHASH_32_SLICE_H */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b21a81d42f15..23ed8db645ad 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1058,7 +1058,7 @@ void __init hash__early_init_mmu(void) htab_initialize(); init_mm.context.hash_context = &init_hash_mm_context; - init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 704e613a0b14..c2494b838008 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -802,9 +802,7 @@ void __init early_init_mmu(void) #endif #ifdef CONFIG_PPC_MM_SLICES -#if defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif + mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); #endif } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index ace97d953040..97fbf7b54422 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -704,11 +704,7 @@ void slice_init_new_context_exec(struct mm_struct *mm) * case of fork it is just inherited from the mm being * duplicated. */ -#ifdef CONFIG_PPC64 - mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW_USER64); -#else - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif + mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT); mm_ctx_set_user_psize(&mm->context, psize); /* -- cgit v1.2.3-59-g8ed1b From a521c44c3ded9fe184c5de3eed3a442af2d26f00 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:38 +0000 Subject: powerpc/book3e: drop mmu_get_tsize() This function is not used anymore, drop it. 
Fixes: b42279f0165c ("powerpc/mm/nohash: MM_SLICE is only used by book3s 64") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index f84ec46cdb26..c911fe9bfa0e 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -49,11 +49,6 @@ static inline int tlb1_next(void) #endif /* !PPC64 */ #endif /* FSL */ -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} - #if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64) #include -- cgit v1.2.3-59-g8ed1b From 5874cabe29079b72b192a28d266adf1a460fc5d6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:39 +0000 Subject: powerpc/64: only book3s/64 supports CONFIG_PPC_64K_PAGES CONFIG_PPC_64K_PAGES cannot be selected by nohash/64. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 - arch/powerpc/include/asm/nohash/64/pgalloc.h | 3 --- arch/powerpc/include/asm/nohash/64/pgtable.h | 4 ---- arch/powerpc/include/asm/nohash/pte-book3e.h | 5 ----- arch/powerpc/include/asm/pgtable-be-types.h | 9 ++------ arch/powerpc/include/asm/pgtable-types.h | 9 ++------ arch/powerpc/include/asm/task_size_64.h | 2 +- arch/powerpc/mm/nohash/tlb.c | 13 ------------ arch/powerpc/mm/nohash/tlb_low_64e.S | 31 ---------------------------- 9 files changed, 5 insertions(+), 72 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d0be82c3061..5d8e692d6470 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -375,7 +375,6 @@ config ZONE_DMA config PGTABLE_LEVELS int default 2 if !PPC64 - default 3 if PPC_64K_PAGES && !PPC_BOOK3S_64 default 4 source "arch/powerpc/sysdev/Kconfig" diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 66d086f85bd5..ded453f9b5a8 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -171,12 +171,9 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, #define __pmd_free_tlb(tlb, pmd, addr) \ pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) -#ifndef CONFIG_PPC_64K_PAGES #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) -#endif /* CONFIG_PPC_64K_PAGES */ - #define check_pgt_cache() do { } while (0) #endif /* _ASM_POWERPC_PGALLOC_64_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index c8e6a9a5bc33..b9f66cf15c31 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -10,10 +10,6 @@ #include #include -#ifdef CONFIG_PPC_64K_PAGES -#error "Page size not supported" -#endif - #define FIRST_USER_ADDRESS 0UL /* diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h index dd40d200f274..813918f40765 100644 --- a/arch/powerpc/include/asm/nohash/pte-book3e.h +++ b/arch/powerpc/include/asm/nohash/pte-book3e.h @@ -60,13 +60,8 @@ #define _PAGE_SPECIAL _PAGE_SW0 /* Base page size */ -#ifdef CONFIG_PPC_64K_PAGES -#define _PAGE_PSIZE _PAGE_PSIZE_64K -#define PTE_RPN_SHIFT (28) -#else #define _PAGE_PSIZE _PAGE_PSIZE_4K #define PTE_RPN_SHIFT (24) -#endif #define PTE_WIMGE_SHIFT (19) #define PTE_BAP_SHIFT (2) diff 
--git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index a89c67b62680..b169bbf95fcb 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -33,11 +33,7 @@ static inline __be64 pmd_raw(pmd_t x) return x.pmd; } -/* - * 64 bit hash always use 4 level table. Everybody else use 4 level - * only for 4K page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +/* 64 bit always use 4 level table. */ typedef struct { __be64 pud; } pud_t; #define __pud(x) ((pud_t) { cpu_to_be64(x) }) #define __pud_raw(x) ((pud_t) { (x) }) @@ -51,7 +47,6 @@ static inline __be64 pud_raw(pud_t x) return x.pud; } -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ #endif /* CONFIG_PPC64 */ /* PGD level */ @@ -77,7 +72,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) +#ifdef CONFIG_PPC_64K_PAGES typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 3b0edf041b2e..d11b4c61d686 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -23,18 +23,13 @@ static inline unsigned long pmd_val(pmd_t x) return x.pmd; } -/* - * 64 bit hash always use 4 level table. Everybody else use 4 level - * only for 4K page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +/* 64 bit always use 4 level table. */ typedef struct { unsigned long pud; } pud_t; #define __pud(x) ((pud_t) { (x) }) static inline unsigned long pud_val(pud_t x) { return x.pud; } -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ #endif /* CONFIG_PPC64 */ /* PGD level */ @@ -54,7 +49,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) +#ifdef CONFIG_PPC_64K_PAGES typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h index eab4779f6b84..c993482237ed 100644 --- a/arch/powerpc/include/asm/task_size_64.h +++ b/arch/powerpc/include/asm/task_size_64.h @@ -20,7 +20,7 @@ /* * For now 512TB is only supported with book3s and 64K linux page size. 
*/ -#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) +#ifdef CONFIG_PPC_64K_PAGES /* * Max value currently used: */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index c2494b838008..24f88efb05bf 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -433,11 +433,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) unsigned long rid = (address & rmask) | 0x1000000000000000ul; unsigned long vpte = address & ~rmask; -#ifdef CONFIG_PPC_64K_PAGES - vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; -#else vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; -#endif vpte |= rid; __flush_tlb_page(tlb->mm, vpte, tsize, 0); } @@ -625,21 +621,12 @@ static void early_init_this_mmu(void) case PPC_HTW_IBM: mas4 |= MAS4_INDD; -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_256M; -#else mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; mmu_pte_psize = MMU_PAGE_1M; -#endif break; case PPC_HTW_NONE: -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; -#else mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; -#endif mmu_pte_psize = mmu_virtual_psize; break; } diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 9ed90064f542..58959ce15415 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -24,11 +24,7 @@ #include #include -#ifdef CONFIG_PPC_64K_PAGES -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) -#else #define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) -#endif #define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) #define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) #define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) @@ -167,13 +163,11 @@ MMU_FTR_SECTION_ELSE ldx r14,r14,r15 /* grab pgd entry */ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) -#ifndef CONFIG_PPC_64K_PAGES rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 cmpdi cr0,r14,0 bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ ldx r14,r14,r15 /* grab pud entry */ -#endif /* CONFIG_PPC_64K_PAGES */ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 clrrdi r15,r15,3 @@ -682,18 +676,7 @@ normal_tlb_miss: * order to handle the weird page table format used by linux */ ori r10,r15,0x1 -#ifdef CONFIG_PPC_64K_PAGES - /* For the top bits, 16 bytes per PTE */ - rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 - /* Now create the bottom bits as 0 in position 0x8000 and - * the rest calculated for 8 bytes per PTE - */ - rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 - /* Insert the bottom bits in */ - rlwimi r14,r15,0,16,31 -#else rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 -#endif sldi r15,r10,60 clrrdi r14,r14,3 or r10,r15,r14 @@ -732,11 +715,7 @@ finish_normal_tlb_miss: /* Check page size, if not standard, update MAS1 */ rldicl r11,r14,64-8,64-8 -#ifdef CONFIG_PPC_64K_PAGES - cmpldi cr0,r11,BOOK3E_PAGESZ_64K -#else cmpldi cr0,r11,BOOK3E_PAGESZ_4K -#endif beq- 1f mfspr r11,SPRN_MAS1 rlwimi r11,r14,31,21,24 @@ -857,14 +836,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) cmpdi cr0,r15,0 bge virt_page_table_tlb_miss_fault -#ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 cmpdi cr0,r15,0 bge virt_page_table_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 @@ -1106,14 +1083,12 @@ htw_tlb_miss: cmpdi cr0,r15,0 bge 
htw_tlb_miss_fault -#ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 cmpdi cr0,r15,0 bge htw_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 @@ -1132,9 +1107,7 @@ htw_tlb_miss: * 4K page we need to extract a bit from the virtual address and * insert it into the "PA52" bit of the RPN. */ -#ifndef CONFIG_PPC_64K_PAGES rlwimi r15,r16,32-9,20,20 -#endif /* Now we build the MAS: * * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG @@ -1144,11 +1117,7 @@ htw_tlb_miss: * MAS 2 : Use defaults * MAS 3+7 : Needs to be done */ -#ifdef CONFIG_PPC_64K_PAGES - ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) -#else ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -#endif BEGIN_MMU_FTR_SECTION srdi r16,r10,32 -- cgit v1.2.3-59-g8ed1b From 3dea7332ccac1f701a6f8cd4fe44faa9be2e6014 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:40 +0000 Subject: powerpc/book3e: hugetlbpage is only for CONFIG_PPC_FSL_BOOK3E As per Kconfig.cputype, only CONFIG_PPC_FSL_BOOK3E gets to select SYS_SUPPORTS_HUGETLBFS so simplify accordingly. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/Makefile | 2 +- arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 47 ++++++++++++----------------- 2 files changed, 20 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index b2228ff81b8a..33b6f6f29d3f 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3E_MMU) += book3e_hugetlbpage.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o endif # Disable kcov instrumentation on sensitive code diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index c911fe9bfa0e..61915f4d3c7f 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -11,8 +11,9 @@ #include -#ifdef CONFIG_PPC_FSL_BOOK3E #ifdef CONFIG_PPC64 +#include + static inline int tlb1_next(void) { struct paca_struct *paca = get_paca(); @@ -29,28 +30,6 @@ static inline int tlb1_next(void) tcd->esel_next = next; return this; } -#else -static inline int tlb1_next(void) -{ - int index, ncams; - - ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - index = this_cpu_read(next_tlbcam_idx); - - /* Just round-robin the entries and wrap when we hit the end */ - if (unlikely(index == ncams - 1)) - __this_cpu_write(next_tlbcam_idx, tlbcam_index); - else - __this_cpu_inc(next_tlbcam_idx); - - return index; -} -#endif /* !PPC64 */ -#endif /* FSL */ - -#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64) -#include static inline void book3e_tlb_lock(void) { @@ -93,6 +72,23 @@ static inline void book3e_tlb_unlock(void) paca->tcd_ptr->lock = 0; } #else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = this_cpu_read(next_tlbcam_idx); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __this_cpu_write(next_tlbcam_idx, tlbcam_index); + else + __this_cpu_inc(next_tlbcam_idx); + + return index; +} + static inline void book3e_tlb_lock(void) { } @@ -134,10 +130,7 @@ void 
book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, unsigned long psize, tsize, shift; unsigned long flags; struct mm_struct *mm; - -#ifdef CONFIG_PPC_FSL_BOOK3E int index; -#endif if (unlikely(is_kernel_addr(ea))) return; @@ -161,11 +154,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, return; } -#ifdef CONFIG_PPC_FSL_BOOK3E /* We have to use the CAM(TLB1) on FSL parts for hugepages */ index = tlb1_next(); mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); -#endif mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); mas2 = ea & ~((1UL << shift) - 1); -- cgit v1.2.3-59-g8ed1b From 0caed4de502c7699b7faeaea0a93b39e4f19e11a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:41 +0000 Subject: powerpc/mm: move __find_linux_pte() out of hugetlbpage.c __find_linux_pte() is the only function in hugetlbpage.c which is compiled in regardless on CONFIG_HUGETLBPAGE This patch moves it in pgtable.c. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 103 ----------------------------------------- arch/powerpc/mm/pgtable.c | 104 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 103 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 5f67e7a4d1cc..17915fc389ff 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -756,109 +756,6 @@ void flush_dcache_icache_hugepage(struct page *page) #endif /* CONFIG_HUGETLB_PAGE */ -/* - * We have 4 cases for pgds and pmds: - * (1) invalid (all zeroes) - * (2) pointer to next table, as normal; bottom 6 bits == 0 - * (3) leaf pte for huge page _PAGE_PTE set - * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table - * - * So long as we atomically load page table pointers we are safe against teardown, - * we can follow the address down to the the page and take a ref on it. - * This function need to be called with interrupts disabled. We use this variant - * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED - */ -pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *hpage_shift) -{ - pgd_t pgd, *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t *ret_pte; - hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; - - if (hpage_shift) - *hpage_shift = 0; - - if (is_thp) - *is_thp = false; - - pgdp = pgdir + pgd_index(ea); - pgd = READ_ONCE(*pgdp); - /* - * Always operate on the local stack value. This make sure the - * value don't get updated by a parallel THP split/collapse, - * page fault or a page unmap. The return pte_t * is still not - * stable. So should be checked there for above conditions. 
- */ - if (pgd_none(pgd)) - return NULL; - else if (pgd_huge(pgd)) { - ret_pte = (pte_t *) pgdp; - goto out; - } else if (is_hugepd(__hugepd(pgd_val(pgd)))) - hpdp = (hugepd_t *)&pgd; - else { - /* - * Even if we end up with an unmap, the pgtable will not - * be freed, because we do an rcu free and here we are - * irq disabled - */ - pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); - pud = READ_ONCE(*pudp); - - if (pud_none(pud)) - return NULL; - else if (pud_huge(pud)) { - ret_pte = (pte_t *) pudp; - goto out; - } else if (is_hugepd(__hugepd(pud_val(pud)))) - hpdp = (hugepd_t *)&pud; - else { - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = READ_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate. - */ - if (pmd_none(pmd)) - return NULL; - - if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { - if (is_thp) - *is_thp = true; - ret_pte = (pte_t *) pmdp; - goto out; - } - /* - * pmd_large check below will handle the swap pmd pte - * we need to do both the check because they are config - * dependent. - */ - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *) pmdp; - goto out; - } else if (is_hugepd(__hugepd(pmd_val(pmd)))) - hpdp = (hugepd_t *)&pmd; - else - return pte_offset_kernel(&pmd, ea); - } - } - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(*hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); -out: - if (hpage_shift) - *hpage_shift = pdshift; - return ret_pte; -} -EXPORT_SYMBOL_GPL(__find_linux_pte); - int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index d3d61d29b4f1..9f4ccd15849f 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,7 @@ #include #include #include +#include static inline int is_exec_fault(void) { @@ -299,3 +300,106 @@ unsigned long vmalloc_to_phys(void *va) return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); } EXPORT_SYMBOL_GPL(vmalloc_to_phys); + +/* + * We have 4 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page _PAGE_PTE set + * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table + * + * So long as we atomically load page table pointers we are safe against teardown, + * we can follow the address down to the the page and take a ref on it. + * This function need to be called with interrupts disabled. We use this variant + * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED + */ +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) +{ + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ret_pte; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (hpage_shift) + *hpage_shift = 0; + + if (is_thp) + *is_thp = false; + + pgdp = pgdir + pgd_index(ea); + pgd = READ_ONCE(*pgdp); + /* + * Always operate on the local stack value. This make sure the + * value don't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The return pte_t * is still not + * stable. So should be checked there for above conditions. 
+ */ + if (pgd_none(pgd)) + return NULL; + else if (pgd_huge(pgd)) { + ret_pte = (pte_t *) pgdp; + goto out; + } else if (is_hugepd(__hugepd(pgd_val(pgd)))) + hpdp = (hugepd_t *)&pgd; + else { + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&pgd, ea); + pud = READ_ONCE(*pudp); + + if (pud_none(pud)) + return NULL; + else if (pud_huge(pud)) { + ret_pte = (pte_t *) pudp; + goto out; + } else if (is_hugepd(__hugepd(pud_val(pud)))) + hpdp = (hugepd_t *)&pud; + else { + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *) pmdp; + goto out; + } + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *) pmdp; + goto out; + } else if (is_hugepd(__hugepd(pmd_val(pmd)))) + hpdp = (hugepd_t *)&pmd; + else + return pte_offset_kernel(&pmd, ea); + } + } + if (!hpdp) + return NULL; + + ret_pte = hugepte_offset(*hpdp, ea, pdshift); + pdshift = hugepd_shift(*hpdp); +out: + if (hpage_shift) + *hpage_shift = pdshift; + return ret_pte; +} +EXPORT_SYMBOL_GPL(__find_linux_pte); -- cgit v1.2.3-59-g8ed1b From b7dcf96ce03e2cab7eb6cda2ca8c66e1529e9bc3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:42 +0000 Subject: powerpc/mm: make hugetlbpage.c depend on CONFIG_HUGETLB_PAGE The only function in hugetlbpage.c which doesn't depend on CONFIG_HUGETLB_PAGE is gup_hugepte(), and this function is only called from gup_huge_pd() which depends on CONFIG_HUGETLB_PAGE so all the content of hugetlbpage.c depends on CONFIG_HUGETLB_PAGE. This patch modifies Makefile to only compile hugetlbpage.c when CONFIG_HUGETLB_PAGE is set. 
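The mechanism is the standard kbuild conditional-object idiom, as the Makefile hunk below shows:

obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o

When the option is y the line expands to obj-y and the file is compiled; when it is unset it expands to obj- and kbuild skips it, so the #ifdef CONFIG_HUGETLB_PAGE wrapper around the whole file becomes redundant.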
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/hugetlbpage.c | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 08557bae6fa1..3daea8da0c7f 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o -obj-y += hugetlbpage.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 17915fc389ff..9f69594f5d09 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -26,9 +26,6 @@ #include #include - -#ifdef CONFIG_HUGETLB_PAGE - #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_512K 19 #define PAGE_SHIFT_8M 23 @@ -754,8 +751,6 @@ void flush_dcache_icache_hugepage(struct page *page) } } -#endif /* CONFIG_HUGETLB_PAGE */ - int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { -- cgit v1.2.3-59-g8ed1b From 0001e5aa5c028c11570f2e641f0198287f4808ba Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:43 +0000 Subject: powerpc/mm: make gup_hugepte() static gup_huge_pd() is the only user of gup_hugepte() and it is located in the same file. This patch moves gup_huge_pd() after gup_hugepte() and makes gup_hugepte() static. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pgtable.h | 3 --- arch/powerpc/mm/hugetlbpage.c | 38 +++++++++++++++++++------------------- 2 files changed, 19 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 505550fb2935..c51846da41a7 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -89,9 +89,6 @@ extern void paging_init(void); */ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); -extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, - struct page **pages, int *nr); #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9f69594f5d09..95cc9f3d97e2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -539,23 +539,6 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, return (__boundary - 1 < end - 1) ? 
__boundary : end; } -int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift, - unsigned long end, int write, struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} - #ifdef CONFIG_PPC_MM_SLICES unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -751,8 +734,8 @@ void flush_dcache_icache_hugepage(struct page *page) } } -int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) +static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) { unsigned long pte_end; struct page *head, *page; @@ -798,3 +781,20 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 1; } + +int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, + unsigned long end, int write, struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(hugepd); + unsigned long next; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr = next, addr != end); + + return 1; +} -- cgit v1.2.3-59-g8ed1b From 8197af22be01e7c9ab476138652e0dc8cd22a207 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:44 +0000 Subject: powerpc/mm: split asm/hugetlb.h into dedicated subarch files Three subarches support hugepages: - fsl book3e - book3s/64 - 8xx This patch splits asm/hugetlb.h to reduce the #ifdef mess. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 40 +++++++++++ arch/powerpc/include/asm/hugetlb.h | 87 ++---------------------- arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 31 +++++++++ arch/powerpc/include/asm/nohash/hugetlb-book3e.h | 31 +++++++++ 4 files changed, 106 insertions(+), 83 deletions(-) create mode 100644 arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h create mode 100644 arch/powerpc/include/asm/nohash/hugetlb-book3e.h (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index ec2a55a553c7..cbc8153d6e0e 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -62,4 +62,44 @@ extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); +/* + * This should work for other subarchs too. 
But right now we use the + * new format only for 64bit book3s + */ +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + BUG_ON(!hugepd_ok(hpd)); + /* + * We have only four bits to encode, MMU page size + */ + BUILD_BUG_ON((MMU_PAGE_COUNT - 1) > 0xf); + return __va(hpd_val(hpd) & HUGEPD_ADDR_MASK); +} + +static inline unsigned int hugepd_mmu_psize(hugepd_t hpd) +{ + return (hpd_val(hpd) & HUGEPD_SHIFT_MASK) >> 2; +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); +} +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_hugetlb_page(vma, vmaddr); +} + +static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, + unsigned int pdshift) +{ + unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(hpd); + + return hugepd_page(hpd) + idx; +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); + #endif diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 8d40565ad0c3..fd5c0873a57d 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -6,83 +6,13 @@ #include #ifdef CONFIG_PPC_BOOK3S_64 - #include -/* - * This should work for other subarchs too. But right now we use the - * new format only for 64bit book3s - */ -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); - /* - * We have only four bits to encode, MMU page size - */ - BUILD_BUG_ON((MMU_PAGE_COUNT - 1) > 0xf); - return __va(hpd_val(hpd) & HUGEPD_ADDR_MASK); -} - -static inline unsigned int hugepd_mmu_psize(hugepd_t hpd) -{ - return (hpd_val(hpd) & HUGEPD_SHIFT_MASK) >> 2; -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); -} -static inline void flush_hugetlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ - if (radix_enabled()) - return radix__flush_hugetlb_page(vma, vmaddr); -} - -#else - -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); -#ifdef CONFIG_PPC_8xx - return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK); -#else - return (pte_t *)((hpd_val(hpd) & - ~HUGEPD_SHIFT_MASK) | PD_HUGE); -#endif -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ -#ifdef CONFIG_PPC_8xx - return ((hpd_val(hpd) & _PMD_PAGE_MASK) >> 1) + 17; -#else - return hpd_val(hpd) & HUGEPD_SHIFT_MASK; -#endif -} - +#elif defined(CONFIG_PPC_FSL_BOOK3E) +#include +#elif defined(CONFIG_PPC_8xx) +#include #endif /* CONFIG_PPC_BOOK3S_64 */ - -static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, - unsigned pdshift) -{ - /* - * On FSL BookE, we have multiple higher-level table entries that - * point to the same hugepte. Just use the first one since they're all - * identical. So for that case, idx=0. 
- */ - unsigned long idx = 0; - - pte_t *dir = hugepd_page(hpd); -#ifdef CONFIG_PPC_8xx - idx = (addr & ((1UL << pdshift) - 1)) >> PAGE_SHIFT; -#elif !defined(CONFIG_PPC_FSL_BOOK3E) - idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(hpd); -#endif - - return dir + idx; -} - void flush_dcache_icache_hugepage(struct page *page); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, @@ -99,15 +29,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte); -#ifdef CONFIG_PPC_8xx -static inline void flush_hugetlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ - flush_tlb_page(vma, vmaddr); -} -#else -void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -#endif #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h new file mode 100644 index 000000000000..997f5b3d6b99 --- /dev/null +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H +#define _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H + +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + BUG_ON(!hugepd_ok(hpd)); + + return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK); +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return ((hpd_val(hpd) & _PMD_PAGE_MASK) >> 1) + 17; +} + +static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, + unsigned int pdshift) +{ + unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> PAGE_SHIFT; + + return hugepd_page(hpd) + idx; +} + +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + flush_tlb_page(vma, vmaddr); +} + +#endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h new file mode 100644 index 000000000000..e94f1cd048ee --- /dev/null +++ b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H +#define _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H + +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + if (WARN_ON(!hugepd_ok(hpd))) + return NULL; + + return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE); +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return hpd_val(hpd) & HUGEPD_SHIFT_MASK; +} + +static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, + unsigned int pdshift) +{ + /* + * On FSL BookE, we have multiple higher-level table entries that + * point to the same hugepte. Just use the first one since they're all + * identical. So for that case, idx=0. + */ + return hugepd_page(hpd); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); + +#endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ -- cgit v1.2.3-59-g8ed1b From 5fb84fec46015758271fcd2a746633fd4d48e619 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:45 +0000 Subject: powerpc/mm: add a helper to populate hugepd This patchs adds a subarch helper to populate hugepd. 
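The contract a subarch has to meet is small. A hypothetical extra implementation shows the shape; the helper name and arguments are the real interface, but the entry encoding here is invented:

static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift)
{
	/* hypothetical subarch: only the hugepd entry encoding differs */
	*hpdp = __hugepd(__pa(new) | pshift);
}

With each subarch supplying its own helper, the allocation loop in __hugepte_alloc() collapses to a single hugepd_populate(hpdp, new, pshift) call, as the hunk below shows.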
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 5 +++++ arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 8 ++++++++ arch/powerpc/include/asm/nohash/hugetlb-book3e.h | 6 ++++++ arch/powerpc/mm/hugetlbpage.c | 20 +------------------- 4 files changed, 20 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index cbc8153d6e0e..def77a45e905 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -100,6 +100,11 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, return hugepd_page(hpd) + idx; } +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | (shift_to_mmu_psize(pshift) << 2)); +} + void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); #endif diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 997f5b3d6b99..75676885bec2 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H #define _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H +#define PAGE_SHIFT_8M 23 + static inline pte_t *hugepd_page(hugepd_t hpd) { BUG_ON(!hugepd_ok(hpd)); @@ -28,4 +30,10 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, flush_tlb_page(vma, vmaddr); } +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | + (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M : _PMD_PAGE_512K)); +} + #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h index e94f1cd048ee..51439bcfe313 100644 --- a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h +++ b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h @@ -28,4 +28,10 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + /* We use the old format for PPC_FSL_BOOK3E */ + *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); +} + #endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 95cc9f3d97e2..036f408cfcb0 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -26,12 +26,6 @@ #include #include -#define PAGE_SHIFT_64K 16 -#define PAGE_SHIFT_512K 19 -#define PAGE_SHIFT_8M 23 -#define PAGE_SHIFT_16M 24 -#define PAGE_SHIFT_16G 34 - bool hugetlb_disabled = false; unsigned int HPAGE_SHIFT; @@ -95,19 +89,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, for (i = 0; i < num_hugepd; i++, hpdp++) { if (unlikely(!hugepd_none(*hpdp))) break; - else { -#ifdef CONFIG_PPC_BOOK3S_64 - *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | - (shift_to_mmu_psize(pshift) << 2)); -#elif defined(CONFIG_PPC_8xx) - *hpdp = __hugepd(__pa(new) | _PMD_USER | - (pshift == PAGE_SHIFT_8M ? 
_PMD_PAGE_8M : - _PMD_PAGE_512K) | _PMD_PRESENT); -#else - /* We use the old format for PPC_FSL_BOOK3E */ - *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); -#endif - } + hugepd_populate(hpdp, new, pshift); } /* If we bailed from the for loop early, an error occurred, clean up */ if (i < num_hugepd) { -- cgit v1.2.3-59-g8ed1b From 723f268f19daddba56a987b934f3e34a04b6499d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:46 +0000 Subject: powerpc/mm: cleanup ifdef mess in add_huge_page_size() Introduce a subarch specific helper check_and_get_huge_psize() to check the huge page sizes and cleanup the ifdef mess in add_huge_page_size() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 27 +++++++++++++++++ arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 5 ++++ arch/powerpc/include/asm/nohash/hugetlb-book3e.h | 8 +++++ arch/powerpc/mm/hugetlbpage.c | 37 ++---------------------- 4 files changed, 43 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index def77a45e905..56140d19c85f 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -107,4 +107,31 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +static inline int check_and_get_huge_psize(int shift) +{ + int mmu_psize; + + if (shift > SLICE_HIGH_SHIFT) + return -EINVAL; + + mmu_psize = shift_to_mmu_psize(shift); + + /* + * We need to make sure that for different page sizes reported by + * firmware we only add hugetlb support for page sizes that can be + * supported by linux page table layout. + * For now we have + * Radix: 2M and 1G + * Hash: 16M and 16G + */ + if (radix_enabled()) { + if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G) + return -EINVAL; + } else { + if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) + return -EINVAL; + } + return mmu_psize; +} + #endif diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 75676885bec2..a46616937d20 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -36,4 +36,9 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi (pshift == PAGE_SHIFT_8M ? 
_PMD_PAGE_8M : _PMD_PAGE_512K)); } +static inline int check_and_get_huge_psize(int shift) +{ + return shift_to_mmu_psize(shift); +} + #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h index 51439bcfe313..ecd8694cb229 100644 --- a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h +++ b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h @@ -34,4 +34,12 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); } +static inline int check_and_get_huge_psize(int shift) +{ + if (shift & 1) /* Not a power of 4 */ + return -EINVAL; + + return shift_to_mmu_psize(shift); +} + #endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 036f408cfcb0..7b7027aae73f 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -549,13 +549,6 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return vma_kernel_pagesize(vma); } -static inline bool is_power_of_4(unsigned long x) -{ - if (is_power_of_2(x)) - return (__ilog2(x) % 2) ? false : true; - return false; -} - static int __init add_huge_page_size(unsigned long long size) { int shift = __ffs(size); @@ -563,37 +556,13 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. */ - if (size <= PAGE_SIZE) - return -EINVAL; -#if defined(CONFIG_PPC_FSL_BOOK3E) - if (!is_power_of_4(size)) + if (size <= PAGE_SIZE || !is_power_of_2(size)) return -EINVAL; -#elif !defined(CONFIG_PPC_8xx) - if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT)) - return -EINVAL; -#endif - if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) + mmu_psize = check_and_get_huge_psize(shift); + if (mmu_psize < 0) return -EINVAL; -#ifdef CONFIG_PPC_BOOK3S_64 - /* - * We need to make sure that for different page sizes reported by - * firmware we only add hugetlb support for page sizes that can be - * supported by linux page table layout.
- * For now we have - * Radix: 2M and 1G - * Hash: 16M and 16G - */ - if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G) - return -EINVAL; - } else { - if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) - return -EINVAL; - } -#endif - BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); /* Return if huge page size has already been setup */ -- cgit v1.2.3-59-g8ed1b From 45d0ba527b575d47b2be75dd517b57cceda04bfe Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:47 +0000 Subject: powerpc/mm: move hugetlb_disabled into asm/hugetlb.h No need to have this in asm/page.h, move it into asm/hugetlb.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hugetlb.h | 2 ++ arch/powerpc/include/asm/page.h | 1 - arch/powerpc/kernel/fadump.c | 1 + arch/powerpc/mm/book3s64/hash_utils.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index fd5c0873a57d..84598c6b0959 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -13,6 +13,8 @@ #include <asm/nohash/32/hugetlb-8xx.h> #endif /* CONFIG_PPC_BOOK3S_64 */ +extern bool hugetlb_disabled; + void flush_dcache_icache_hugepage(struct page *page); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 748f5db2e2b7..6b508420d92b 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -29,7 +29,6 @@ #ifndef __ASSEMBLY__ #ifdef CONFIG_HUGETLB_PAGE -extern bool hugetlb_disabled; extern unsigned int HPAGE_SHIFT; #else #define HPAGE_SHIFT PAGE_SHIFT diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 45a8d0be1c96..25f063f56ec5 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -36,6 +36,7 @@ #include #include #include +#include <asm/hugetlb.h> #include #include diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 23ed8db645ad..f0ce860a69ac 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -37,6 +37,7 @@ #include #include #include +#include <asm/hugetlb.h> #include #include -- cgit v1.2.3-59-g8ed1b From c5710cd20735037ba9be0e95530f0d3795ce07e6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:48 +0000 Subject: powerpc/mm: cleanup HPAGE_SHIFT setup Only book3s/64 may select the default among several HPAGE_SHIFT values at runtime. 8xx always defines 512K pages as the default; FSL_BOOK3E always defines 4M pages as the default. This patch limits HUGETLB_PAGE_SIZE_VARIABLE to book3s/64 and moves the definitions into the subarch files.
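The defaults that replace the old HPAGE_SHIFT selection are easy to sanity-check outside the kernel (a trivial standalone program, nothing kernel-specific):

#include <stdio.h>

int main(void)
{
	/* 8xx default: shift 19 is 512k pages */
	printf("shift 19 -> %lu KiB\n", (1UL << 19) >> 10);
	/* FSL_BOOK3E default: shift 22 is 4M pages */
	printf("shift 22 -> %lu MiB\n", (1UL << 22) >> 20);
	return 0;
}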
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/hugetlb.h | 2 ++ arch/powerpc/include/asm/page.h | 11 ++++++++--- arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 16 ++++++++++++++++ arch/powerpc/mm/hugetlbpage.c | 23 +++-------------------- 5 files changed, 30 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 5d8e692d6470..7815eb0cc2a5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -390,7 +390,7 @@ source "kernel/Kconfig.hz" config HUGETLB_PAGE_SIZE_VARIABLE bool - depends on HUGETLB_PAGE + depends on HUGETLB_PAGE && PPC_BOOK3S_64 default y config MATH_EMULATION diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 84598c6b0959..20a101046cff 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -15,6 +15,8 @@ extern bool hugetlb_disabled; +void hugetlbpage_init_default(void); + void flush_dcache_icache_hugepage(struct page *page); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 6b508420d92b..dbc8c0679480 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -28,10 +28,15 @@ #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #ifndef __ASSEMBLY__ -#ifdef CONFIG_HUGETLB_PAGE -extern unsigned int HPAGE_SHIFT; -#else +#ifndef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PAGE_SHIFT +#elif defined(CONFIG_PPC_BOOK3S_64) +extern unsigned int hpage_shift; +#define HPAGE_SHIFT hpage_shift +#elif defined(CONFIG_PPC_8xx) +#define HPAGE_SHIFT 19 /* 512k pages */ +#elif defined(CONFIG_PPC_FSL_BOOK3E) +#define HPAGE_SHIFT 22 /* 4M pages */ #endif #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c index 2d4e02aa15a3..eefa89c6117b 100644 --- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c @@ -15,6 +15,9 @@ #include #include +unsigned int hpage_shift; +EXPORT_SYMBOL(hpage_shift); + extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, unsigned long pa, unsigned long rlags, unsigned long vflags, int psize, int ssize); @@ -150,3 +153,16 @@ void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr old_pte, pte); set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } + +void hugetlbpage_init_default(void) +{ + /* Set default large page size. 
Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift; +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7b7027aae73f..847fb495a628 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -28,9 +28,6 @@ bool hugetlb_disabled = false; -unsigned int HPAGE_SHIFT; -EXPORT_SYMBOL(HPAGE_SHIFT); - #define hugepd_none(hpd) (hpd_val(hpd) == 0) #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *))) @@ -645,23 +642,9 @@ static int __init hugetlbpage_init(void) #endif } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) - /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */ - if (mmu_psize_defs[MMU_PAGE_4M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; - else if (mmu_psize_defs[MMU_PAGE_512K].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift; -#else - /* Set default large page size. Currently, we pick 16M or 1M - * depending on what is available - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; - else if (mmu_psize_defs[MMU_PAGE_2M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; -#endif + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) + hugetlbpage_init_default(); + return 0; } -- cgit v1.2.3-59-g8ed1b From 4df4b27585227c8ba66fdf0dd7531d1e23a37194 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:49 +0000 Subject: powerpc/mm: cleanup remaining ifdef mess in hugetlbpage.c Only 3 subarches support huge pages. So when a test covers two of them, it simply means it is not the third one.
And mmu_has_feature() is known by all subarches, so IS_ENABLED() can be used instead of #ifdef. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 847fb495a628..98db5ec6a1dd 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -226,7 +226,7 @@ int __init alloc_bootmem_huge_page(struct hstate *h) return __alloc_bootmem_huge_page(h); } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) +#ifndef CONFIG_PPC_BOOK3S_64 #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) @@ -595,10 +595,10 @@ static int __init hugetlbpage_init(void) return 0; } -#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx) - if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() && + !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; -#endif + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { unsigned shift; unsigned pdshift; @@ -636,10 +636,8 @@ static int __init hugetlbpage_init(void) pgtable_cache_add(PTE_INDEX_SIZE); else if (pdshift > shift) pgtable_cache_add(pdshift - shift); -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) - else + else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx)) pgtable_cache_add(PTE_T_ORDER); -#endif } if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) -- cgit v1.2.3-59-g8ed1b From fab9a1165bcda99682e3319d1c83980fd4e72365 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:51 +0000 Subject: powerpc/mm: flatten function __find_linux_pte() step 1 __find_linux_pte() is full of if/else which is hard to follow although the handling is pretty simple. This patch flattens the function by getting rid of as much if/else as possible. In order to ease the review, this is done in three steps.
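The pattern applied over the three steps is the classic early-exit flattening shown below on a toy function (illustrative code, not taken from the kernel): every "else if" that ends in a jump becomes a plain "if", so the final "else" block can lose its braces and one level of indentation.

/* before: nested if/else, long tail body indented */
static int classify_nested(int a, int b)
{
	if (a)
		return 1;
	else if (b)
		return 2;
	else {
		/* long tail body, one level deep */
		return 0;
	}
}

/* after: flat early exits, identical behaviour */
static int classify_flat(int a, int b)
{
	if (a)
		return 1;
	if (b)
		return 2;
	/* long tail body, now at top level */
	return 0;
}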
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 9f4ccd15849f..d332abeedf0a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -339,12 +339,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, */ if (pgd_none(pgd)) return NULL; - else if (pgd_huge(pgd)) { - ret_pte = (pte_t *) pgdp; + + if (pgd_huge(pgd)) { + ret_pte = (pte_t *)pgdp; goto out; - } else if (is_hugepd(__hugepd(pgd_val(pgd)))) + } + if (is_hugepd(__hugepd(pgd_val(pgd)))) { hpdp = (hugepd_t *)&pgd; - else { + goto out_huge; + } + { /* * Even if we end up with an unmap, the pgtable will not * be freed, because we do an rcu free and here we are @@ -356,12 +360,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (pud_none(pud)) return NULL; - else if (pud_huge(pud)) { + + if (pud_huge(pud)) { ret_pte = (pte_t *) pudp; goto out; - } else if (is_hugepd(__hugepd(pud_val(pud)))) + } + if (is_hugepd(__hugepd(pud_val(pud)))) { hpdp = (hugepd_t *)&pud; - else { + goto out_huge; + } + { pdshift = PMD_SHIFT; pmdp = pmd_offset(&pud, ea); pmd = READ_ONCE(*pmdp); @@ -386,12 +394,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (pmd_huge(pmd) || pmd_large(pmd)) { ret_pte = (pte_t *) pmdp; goto out; - } else if (is_hugepd(__hugepd(pmd_val(pmd)))) + } + if (is_hugepd(__hugepd(pmd_val(pmd)))) { hpdp = (hugepd_t *)&pmd; - else - return pte_offset_kernel(&pmd, ea); + goto out_huge; + } + + return pte_offset_kernel(&pmd, ea); } } +out_huge: if (!hpdp) return NULL; -- cgit v1.2.3-59-g8ed1b From e2fb2511888b3f7768835de0768c24d1e0d74590 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:52 +0000 Subject: powerpc/mm: flatten function __find_linux_pte() step 2 __find_linux_pte() is full of if/else which is hard to follow although the handling is pretty simple. Previous patch left { } blocks. This patch removes the first one by shifting its content to the left. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable.c | 62 +++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index d332abeedf0a..c1c6d0b79baa 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -369,39 +369,37 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, hpdp = (hugepd_t *)&pud; goto out_huge; } - { - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = READ_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate. - */ - if (pmd_none(pmd)) - return NULL; - - if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { - if (is_thp) - *is_thp = true; - ret_pte = (pte_t *) pmdp; - goto out; - } - /* - * pmd_large check below will handle the swap pmd pte - * we need to do both the check because they are config - * dependent.
- */ - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *) pmdp; - goto out; - } - if (is_hugepd(__hugepd(pmd_val(pmd)))) { - hpdp = (hugepd_t *)&pmd; - goto out_huge; - } - - return pte_offset_kernel(&pmd, ea); + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *)pmdp; + goto out; + } + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *)pmdp; + goto out; } + if (is_hugepd(__hugepd(pmd_val(pmd)))) { + hpdp = (hugepd_t *)&pmd; + goto out_huge; + } + + return pte_offset_kernel(&pmd, ea); } out_huge: if (!hpdp) -- cgit v1.2.3-59-g8ed1b From 26e66b08c3376b6fb4ad4508d48a4e74d61f0b9b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:53 +0000 Subject: powerpc/mm: flatten function __find_linux_pte() step 3 __find_linux_pte() is full of if/else which is hard to follow although the handling is pretty simple. Previous patches left a { } block. This patch removes it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable.c | 98 +++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 49 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index c1c6d0b79baa..db4a6253df92 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -348,59 +348,59 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, hpdp = (hugepd_t *)&pgd; goto out_huge; } - { - /* - * Even if we end up with an unmap, the pgtable will not - * be freed, because we do an rcu free and here we are - * irq disabled - */ - pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); - pud = READ_ONCE(*pudp); - if (pud_none(pud)) - return NULL; + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&pgd, ea); + pud = READ_ONCE(*pudp); - if (pud_huge(pud)) { - ret_pte = (pte_t *) pudp; - goto out; - } - if (is_hugepd(__hugepd(pud_val(pud)))) { - hpdp = (hugepd_t *)&pud; - goto out_huge; - } - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = READ_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate. - */ - if (pmd_none(pmd)) - return NULL; - - if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { - if (is_thp) - *is_thp = true; - ret_pte = (pte_t *)pmdp; - goto out; - } - /* - * pmd_large check below will handle the swap pmd pte - * we need to do both the check because they are config - * dependent.
- */ - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *)pmdp; - goto out; - } - if (is_hugepd(__hugepd(pmd_val(pmd)))) { - hpdp = (hugepd_t *)&pmd; - goto out_huge; - } + if (pud_none(pud)) + return NULL; - return pte_offset_kernel(&pmd, ea); + if (pud_huge(pud)) { + ret_pte = (pte_t *)pudp; + goto out; } + if (is_hugepd(__hugepd(pud_val(pud)))) { + hpdp = (hugepd_t *)&pud; + goto out_huge; + } + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *)pmdp; + goto out; + } + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *)pmdp; + goto out; + } + if (is_hugepd(__hugepd(pmd_val(pmd)))) { + hpdp = (hugepd_t *)&pmd; + goto out_huge; + } + + return pte_offset_kernel(&pmd, ea); + out_huge: if (!hpdp) return NULL; -- cgit v1.2.3-59-g8ed1b From 447def3b06adab60b999417b316bd2352d7e643e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:57:59 +0000 Subject: powerpc/mm: drop __bad_pte() This has never been called (since Kernel has been in git at least), drop it. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 2 -- arch/powerpc/include/asm/nohash/32/pgalloc.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 3633502e102c..645af86cd072 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -22,8 +22,6 @@ */ #define MAX_PGTABLE_INDEX_SIZE 0xf -extern void __bad_pte(pmd_t *pmd); - extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index bd186e85b4f7..ea265a578eb0 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -22,8 +22,6 @@ */ #define MAX_PGTABLE_INDEX_SIZE 0xf -extern void __bad_pte(pmd_t *pmd); - extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] -- cgit v1.2.3-59-g8ed1b From 737b434d3d55c0b3c23df4eab1ea5b33f8850f30 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:01 +0000 Subject: powerpc/mm: convert Book3E 64 to pte_fragment Book3E 64 is the only subarch not using pte_fragment. In order to allow refactorisation, this patch converts it to pte_fragment. 
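What pte_fragment buys can be sketched with plain arithmetic (the sizes below are assumptions for illustration; the kernel derives the real fragment size from PAGE_SIZE and PTE_INDEX_SIZE): with 64k pages and 4k PTE tables, dedicating a full page to each table wastes 15/16 of every page, while pte_fragment_alloc() hands out 16 tables per page.

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 64 * 1024; /* assumed PAGE_SIZE */
	unsigned long frag_size = 4 * 1024;  /* assumed PTE table size */
	unsigned long tables = 1000;
	unsigned long per_page = page_size / frag_size;

	printf("pages without fragments: %lu\n", tables);
	printf("pages with fragments:    %lu\n",
	       (tables + per_page - 1) / per_page);
	return 0;
}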
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 6 ----- arch/powerpc/include/asm/nohash/64/mmu.h | 4 +++- arch/powerpc/include/asm/nohash/64/pgalloc.h | 33 ++++++++++------------------ arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mmu_context.c | 2 +- 5 files changed, 17 insertions(+), 30 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 6ee8195a2ffb..66a3805dc935 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -228,13 +228,7 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, #endif } -#ifdef CONFIG_PPC_BOOK3E_64 -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} -#else extern void arch_exit_mmap(struct mm_struct *mm); -#endif static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h index 81cf30c370e5..26e05ce8f5aa 100644 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ b/arch/powerpc/include/asm/nohash/64/mmu.h @@ -4,11 +4,13 @@ #define MAX_PHYSMEM_BITS 44 +#include + /* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ #include #ifndef __ASSEMBLY__ -typedef struct page *pgtable_t; +typedef pte_t *pgtable_t; #endif #endif /* _ASM_POWERPC_NOHASH_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index ded453f9b5a8..7fb87235f845 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -76,10 +76,10 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, (unsigned long)page_address(pte_page)); + pmd_set(pmd, (unsigned long)pte_page); } -#define pmd_pgtable(pmd) pmd_page(pmd) +#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -92,44 +92,35 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); } +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + return (pte_t *)pte_fragment_alloc(mm, 1); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *page; - pte_t *pte; - - pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT); - if (!pte) - return NULL; - page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; + return (pgtable_t)pte_fragment_alloc(mm, 0); } +void pte_frag_destroy(void *pte_frag); +void pte_fragment_free(unsigned long *table, int kernel); + static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - free_page((unsigned long)pte); + pte_fragment_free((unsigned long *)pte, 1); } static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) { - pgtable_page_dtor(ptepage); - __free_page(ptepage); + pte_fragment_free((unsigned long *)ptepage, 0); } static inline void pgtable_free(void *table, int shift) { if (!shift) { - pgtable_page_dtor(virt_to_page(table)); - free_page((unsigned long)table); + pte_fragment_free((unsigned long *)table, 0); } else { BUG_ON(shift > 
MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); @@ -166,7 +157,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { tlb_flush_pgtable(tlb, address); - pgtable_free_tlb(tlb, page_address(table), 0); + pgtable_free_tlb(tlb, table, 0); } #define __pmd_free_tlb(tlb, pmd, addr) \ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3daea8da0c7f..c7d5f37f7c52 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -7,12 +7,12 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ + pgtable-frag.o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o -obj-$(CONFIG_PPC32) += pgtable-frag.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index bb52320b7369..6b049d82b98a 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -98,7 +98,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_mmu_context(prev, next, tsk); } -#ifdef CONFIG_PPC32 +#ifndef CONFIG_PPC_BOOK3S_64 void arch_exit_mmap(struct mm_struct *mm) { void *frag = pte_frag_get(&mm->context); -- cgit v1.2.3-59-g8ed1b From 696dffa24bd0e17c8bccb18467555c17cc15e62c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:02 +0000 Subject: powerpc/mm: move pgtable_t in asm/mmu.h pgtable_t is now identical for all subarches, move it to the top level asm/mmu.h Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 4 ---- arch/powerpc/include/asm/book3s/64/mmu.h | 8 -------- arch/powerpc/include/asm/mmu.h | 3 +++ arch/powerpc/include/asm/nohash/32/mmu.h | 6 ------ arch/powerpc/include/asm/nohash/64/mmu.h | 6 ------ 5 files changed, 3 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index f9eae105a9f4..2e277ca0170f 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -10,8 +10,6 @@ * BATs */ -#include - /* Block size masks */ #define BL_128K 0x000 #define BL_256K 0x001 @@ -49,8 +47,6 @@ struct ppc_bat { u32 batu; u32 batl; }; - -typedef pte_t *pgtable_t; #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 51b2d60efc1b..74d24201fc4f 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -25,14 +25,6 @@ struct mmu_psize_def { }; }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; - -/* - * For BOOK3s 64 with 4k and 64K linux page size - * we want to use pointers, because the page table - * actually store pfn - */ -typedef pte_t *pgtable_t; - #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index d86c5641bd97..ba94ce8c22d7 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -129,6 +129,9 @@ #ifndef __ASSEMBLY__ #include #include +#include + +typedef pte_t *pgtable_t; #ifdef CONFIG_PPC_FSL_BOOK3E #include diff --git 
a/arch/powerpc/include/asm/nohash/32/mmu.h b/arch/powerpc/include/asm/nohash/32/mmu.h index 7d94a36d57d2..af0e8b54876a 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu.h +++ b/arch/powerpc/include/asm/nohash/32/mmu.h @@ -2,8 +2,6 @@ #ifndef _ASM_POWERPC_NOHASH_32_MMU_H_ #define _ASM_POWERPC_NOHASH_32_MMU_H_ -#include - #if defined(CONFIG_40x) /* 40x-style software loaded TLB */ #include @@ -18,8 +16,4 @@ #include #endif -#ifndef __ASSEMBLY__ -typedef pte_t *pgtable_t; -#endif - #endif /* _ASM_POWERPC_NOHASH_32_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h index 26e05ce8f5aa..e490ecdac012 100644 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ b/arch/powerpc/include/asm/nohash/64/mmu.h @@ -4,13 +4,7 @@ #define MAX_PHYSMEM_BITS 44 -#include - /* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ #include -#ifndef __ASSEMBLY__ -typedef pte_t *pgtable_t; -#endif - #endif /* _ASM_POWERPC_NOHASH_64_MMU_H_ */ -- cgit v1.2.3-59-g8ed1b From 7a792d5da27f8407c5fe1b3c976106229e0d8bbd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:03 +0000 Subject: powerpc/mm: get rid of nohash/32/mmu.h and nohash/64/mmu.h Those files have no real added values, especially the 64 bit which only includes the common book3e mmu.h which is also included from 32 bits side. So lets do the final inclusion directly from nohash/mmu.h Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu.h | 19 ------------------- arch/powerpc/include/asm/nohash/64/mmu.h | 10 ---------- arch/powerpc/include/asm/nohash/mmu-book3e.h | 2 ++ arch/powerpc/include/asm/nohash/mmu.h | 16 ++++++++++++---- 4 files changed, 14 insertions(+), 33 deletions(-) delete mode 100644 arch/powerpc/include/asm/nohash/32/mmu.h delete mode 100644 arch/powerpc/include/asm/nohash/64/mmu.h (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/32/mmu.h b/arch/powerpc/include/asm/nohash/32/mmu.h deleted file mode 100644 index af0e8b54876a..000000000000 --- a/arch/powerpc/include/asm/nohash/32/mmu.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_32_MMU_H_ -#define _ASM_POWERPC_NOHASH_32_MMU_H_ - -#if defined(CONFIG_40x) -/* 40x-style software loaded TLB */ -#include -#elif defined(CONFIG_44x) -/* 44x-style software loaded TLB */ -#include -#elif defined(CONFIG_PPC_BOOK3E_MMU) -/* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ -#include -#elif defined (CONFIG_PPC_8xx) -/* Motorola/Freescale 8xx software loaded TLB */ -#include -#endif - -#endif /* _ASM_POWERPC_NOHASH_32_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h deleted file mode 100644 index e490ecdac012..000000000000 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_64_MMU_H_ -#define _ASM_POWERPC_NOHASH_64_MMU_H_ - -#define MAX_PHYSMEM_BITS 44 - -/* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ -#include - -#endif /* _ASM_POWERPC_NOHASH_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/mmu-book3e.h b/arch/powerpc/include/asm/nohash/mmu-book3e.h index e20072972e35..4c9777d256fb 100644 --- a/arch/powerpc/include/asm/nohash/mmu-book3e.h +++ b/arch/powerpc/include/asm/nohash/mmu-book3e.h @@ -306,6 +306,8 @@ extern int book3e_htw_mode; #define mmu_cleanup_all NULL 
+#define MAX_PHYSMEM_BITS 44 + #endif #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/nohash/mmu.h b/arch/powerpc/include/asm/nohash/mmu.h index a037cb1efb57..edc793e5f08f 100644 --- a/arch/powerpc/include/asm/nohash/mmu.h +++ b/arch/powerpc/include/asm/nohash/mmu.h @@ -2,10 +2,18 @@ #ifndef _ASM_POWERPC_NOHASH_MMU_H_ #define _ASM_POWERPC_NOHASH_MMU_H_ -#ifdef CONFIG_PPC64 -#include <asm/nohash/64/mmu.h> -#else -#include <asm/nohash/32/mmu.h> +#if defined(CONFIG_40x) +/* 40x-style software loaded TLB */ +#include <asm/nohash/32/mmu-40x.h> +#elif defined(CONFIG_44x) +/* 44x-style software loaded TLB */ +#include <asm/nohash/32/mmu-44x.h> +#elif defined(CONFIG_PPC_BOOK3E_MMU) +/* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ +#include <asm/nohash/mmu-book3e.h> +#elif defined (CONFIG_PPC_8xx) +/* Motorola/Freescale 8xx software loaded TLB */ +#include <asm/nohash/32/mmu-8xx.h> #endif #endif /* _ASM_POWERPC_NOHASH_MMU_H_ */ -- cgit v1.2.3-59-g8ed1b From e7a7be5679a5c5f1817226d8253d971520038b67 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:04 +0000 Subject: powerpc/Kconfig: select PPC_MM_SLICES from subarch type Let's select PPC_MM_SLICES from the subarch config item instead of doing it via defaults declaration in the PPC_MM_SLICES item itself. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/Kconfig.cputype | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index cd28b045c0f3..fa6b03205ae1 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -38,6 +38,7 @@ config PPC_8xx select SYS_SUPPORTS_HUGETLBFS select PPC_HAVE_KUEP select PPC_HAVE_KUAP + select PPC_MM_SLICES if HUGETLB_PAGE config 40x bool "AMCC 40x" @@ -79,6 +80,7 @@ config PPC_BOOK3S_64 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK + select PPC_MM_SLICES config PPC_BOOK3E_64 bool "Embedded processors" @@ -401,8 +403,6 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if PPC_BOOK3S_64 - default y if PPC_8xx && HUGETLB_PAGE config PPC_HAVE_PMU_SUPPORT bool -- cgit v1.2.3-59-g8ed1b From 627f06c6f51e6af6ca3f7d1e82154b59583abc15 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:05 +0000 Subject: powerpc/book3e: move early_alloc_pgtable() to init section early_alloc_pgtable() is only used during init.
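The __init/__ref relationship this relies on can be modelled with empty macros (a sketch of the calling discipline only; in the kernel these annotations place functions in sections that are freed after boot, and modpost warns about stray references into them):

#define __init /* freed-after-boot text in the kernel */
#define __ref  /* allowed to reference __init text */

static __init int early_only(void)
{
	return 42; /* must never run once init memory is gone */
}

static __ref int map_something(int slab_available)
{
	/* a __ref caller must guarantee the __init callee is
	 * unreachable after boot, e.g. by testing slab_is_available() */
	return slab_available ? 0 : early_only();
}

int main(void)
{
	return map_something(1);
}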
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/book3e_pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index f296c2e88b09..75e9e2c35fe2 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -55,7 +55,7 @@ void vmemmap_remove_mapping(unsigned long start, #endif #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static __ref void *early_alloc_pgtable(unsigned long size) +static void __init *early_alloc_pgtable(unsigned long size) { void *ptr; @@ -74,7 +74,7 @@ static __ref void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; -- cgit v1.2.3-59-g8ed1b From 4a6d8cf90017019f3b2829b38157cd1a74c64856 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:06 +0000 Subject: powerpc/mm: don't use pte_alloc_kernel() until slab is available on PPC32 In the same way as PPC64, implement early allocation functions and avoid calling pte_alloc_kernel() before slab is available. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_32.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index c9cdbb84d31f..e54b612cbc98 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -43,11 +43,8 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - if (!slab_is_available()) - return memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); - return (pte_t *)pte_fragment_alloc(mm, 1); } @@ -205,7 +202,29 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) +static void __init *early_alloc_pgtable(unsigned long size) +{ + void *ptr = memblock_alloc(size, size); + + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, size); + + return ptr; +} + +static pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) +{ + if (pmd_none(*pmdp)) { + pte_t *ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + return pte_offset_kernel(pmdp, va); +} + + +int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) { pmd_t *pd; pte_t *pg; @@ -214,7 +233,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) /* Use upper 10 bits of VA to index the first level map */ pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va); /* Use middle 10 bits of VA to index the second-level map */ - pg = pte_alloc_kernel(pd, va); + if (likely(slab_is_available())) + pg = pte_alloc_kernel(pd, va); + else + pg = early_pte_alloc_kernel(pd, va); if (pg != 0) { err = 0; /* The PTE should never be already set nor present in the -- cgit v1.2.3-59-g8ed1b From b0124ff57e9405725b4dfeffbdfa929bb973ad2c Mon Sep 17 00:00:00 2001 From: Christophe 
Leroy Date: Fri, 26 Apr 2019 15:58:07 +0000 Subject: powerpc/mm: inline pte_alloc_one_kernel() and pte_alloc_one() on PPC32 pte_alloc_one_kernel() and pte_alloc_one() are simple calls to pte_fragment_alloc(), so they are good candidates for inlining as already done on PPC64. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 15 ++++++++++++--- arch/powerpc/include/asm/nohash/32/pgalloc.h | 15 ++++++++++++--- arch/powerpc/mm/pgtable_32.c | 10 ---------- 3 files changed, 24 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 645af86cd072..0ed856068bb8 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -59,10 +59,19 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -extern pgtable_t pte_alloc_one(struct mm_struct *mm); -void pte_frag_destroy(void *pte_frag); pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +{ + return (pte_t *)pte_fragment_alloc(mm, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return (pgtable_t)pte_fragment_alloc(mm, 0); +} + +void pte_frag_destroy(void *pte_frag); void pte_fragment_free(unsigned long *table, int kernel); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index ea265a578eb0..1d41508f0676 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -77,10 +77,19 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) #endif -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -extern pgtable_t pte_alloc_one(struct mm_struct *mm); -void pte_frag_destroy(void *pte_frag); pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +{ + return (pte_t *)pte_fragment_alloc(mm, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return (pgtable_t)pte_fragment_alloc(mm, 0); +} + +void pte_frag_destroy(void *pte_frag); void pte_fragment_free(unsigned long *table, int kernel); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index e54b612cbc98..2e67f9a1430b 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -43,16 +43,6 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)pte_fragment_alloc(mm, 1); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - return (pgtable_t)pte_fragment_alloc(mm, 0); -} - void __iomem * ioremap(phys_addr_t addr, unsigned long size) { -- cgit v1.2.3-59-g8ed1b From dc096864ba784c2d3d10480d71f14a53f40f997c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:08 +0000 Subject: powerpc/mm: refactor pte_alloc_one() and pte_free() families definition. 
Functions pte_alloc_one(), pte_alloc_one_kernel(), pte_free(), pte_free_kernel() are identical for the four subarches. This patch moves their definition in a common place. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 25 ------------------------- arch/powerpc/include/asm/book3s/64/pgalloc.h | 22 ---------------------- arch/powerpc/include/asm/nohash/32/pgalloc.h | 25 ------------------------- arch/powerpc/include/asm/nohash/64/pgalloc.h | 25 ------------------------- arch/powerpc/include/asm/pgalloc.h | 25 +++++++++++++++++++++++++ 5 files changed, 25 insertions(+), 97 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 0ed856068bb8..46422309d6e0 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -59,31 +59,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)pte_fragment_alloc(mm, 1); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - return (pgtable_t)pte_fragment_alloc(mm, 0); -} - -void pte_frag_destroy(void *pte_frag); -void pte_fragment_free(unsigned long *table, int kernel); - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - pte_fragment_free((unsigned long *)pte, 1); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pte_fragment_free((unsigned long *)ptepage, 0); -} - static inline void pgtable_free(void *table, unsigned index_size) { if (!index_size) { diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 138bc2ecc0c4..cfd48d8cc055 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -39,9 +39,7 @@ extern struct vmemmap_backing *vmemmap_list; extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] -extern pte_t *pte_fragment_alloc(struct mm_struct *, int); extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long); -extern void pte_fragment_free(unsigned long *, int); extern void pmd_fragment_free(unsigned long *); extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); #ifdef CONFIG_SMP @@ -190,26 +188,6 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)pte_fragment_alloc(mm, 1); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - return (pgtable_t)pte_fragment_alloc(mm, 0); -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - pte_fragment_free((unsigned long *)pte, 1); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pte_fragment_free((unsigned long *)ptepage, 0); -} - static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 1d41508f0676..e96ef2fde2ca 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -77,31 +77,6 @@ static inline void 
pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) #endif -pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)pte_fragment_alloc(mm, 1); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - return (pgtable_t)pte_fragment_alloc(mm, 0); -} - -void pte_frag_destroy(void *pte_frag); -void pte_fragment_free(unsigned long *table, int kernel); - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - pte_fragment_free((unsigned long *)pte, 1); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pte_fragment_free((unsigned long *)ptepage, 0); -} - static inline void pgtable_free(void *table, unsigned index_size) { if (!index_size) { diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 7fb87235f845..98de4f3b0306 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -92,31 +92,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); } -pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)pte_fragment_alloc(mm, 1); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - return (pgtable_t)pte_fragment_alloc(mm, 0); -} - -void pte_frag_destroy(void *pte_frag); -void pte_fragment_free(unsigned long *table, int kernel); - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - pte_fragment_free((unsigned long *)pte, 1); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pte_fragment_free((unsigned long *)ptepage, 0); -} - static inline void pgtable_free(void *table, int shift) { if (!shift) { diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index e11f03007b57..c2c6fd438840 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -20,6 +20,31 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) #define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +{ + return (pte_t *)pte_fragment_alloc(mm, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return (pgtable_t)pte_fragment_alloc(mm, 0); +} + +void pte_frag_destroy(void *pte_frag); +void pte_fragment_free(unsigned long *table, int kernel); + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + pte_fragment_free((unsigned long *)pte, 1); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pte_fragment_free((unsigned long *)ptepage, 0); +} + #ifdef CONFIG_PPC_BOOK3S #include #else -- cgit v1.2.3-59-g8ed1b From e80789a3c13f9fbc8f361a988868f9b68a8cf134 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:09 +0000 Subject: powerpc/mm: refactor definition of pgtable_cache[] pgtable_cache[] is the same for the 4 subarches, lets make it common. 
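The MAX_PGTABLE_INDEX_SIZE comment consolidated here also explains the pointer-tagging trick used by the pgtable_free_tlb()/__tlb_remove_table() pair appearing below: since the mask is 0xf and page tables are at least 16-byte aligned, the index size can travel in the low bits of the table pointer. A standalone sketch of the same trick:

#include <stdio.h>
#include <stdint.h>

#define MAX_PGTABLE_INDEX_SIZE 0xfUL

int main(void)
{
	_Alignas(16) char table[64];
	unsigned long shift = 9; /* example index size, must be <= 0xf */

	/* pack, as pgtable_free_tlb() does before tlb_remove_table() */
	uintptr_t pgf = (uintptr_t)table | shift;

	/* unpack, as __tlb_remove_table() does */
	void *ptr = (void *)(pgf & ~MAX_PGTABLE_INDEX_SIZE);
	unsigned long s = pgf & MAX_PGTABLE_INDEX_SIZE;

	printf("shift=%lu pointer_ok=%d\n", s, ptr == (void *)table);
	return 0;
}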
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 21 --------------------- arch/powerpc/include/asm/book3s/64/pgalloc.h | 22 ---------------------- arch/powerpc/include/asm/nohash/32/pgalloc.h | 21 --------------------- arch/powerpc/include/asm/nohash/64/pgalloc.h | 22 ---------------------- arch/powerpc/include/asm/pgalloc.h | 21 +++++++++++++++++++++ 5 files changed, 21 insertions(+), 86 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 46422309d6e0..1b9b5c228230 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -5,26 +5,6 @@ #include #include -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. - */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) pgtable_cache[shift] - static inline pgd_t *pgd_alloc(struct mm_struct *mm) { return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), @@ -69,7 +49,6 @@ static inline void pgtable_free(void *table, unsigned index_size) } } -#define check_pgt_cache() do { } while (0) #define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index cfd48d8cc055..df2dce6afe14 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -19,26 +19,6 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. 
- */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) pgtable_cache[shift] - extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long); extern void pmd_fragment_free(unsigned long *); extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); @@ -199,8 +179,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, pgtable_free_tlb(tlb, table, PTE_INDEX); } -#define check_pgt_cache() do { } while (0) - extern atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; static inline void update_page_count(int psize, long count) { diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index e96ef2fde2ca..4615801aa953 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -5,26 +5,6 @@ #include #include -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. - */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) pgtable_cache[shift] - static inline pgd_t *pgd_alloc(struct mm_struct *mm) { return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), @@ -87,7 +67,6 @@ static inline void pgtable_free(void *table, unsigned index_size) } } -#define check_pgt_cache() do { } while (0) #define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 98de4f3b0306..ffc86d42816d 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -18,26 +18,6 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. 
- */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) pgtable_cache[shift] - static inline pgd_t *pgd_alloc(struct mm_struct *mm) { return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), @@ -140,6 +120,4 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) -#define check_pgt_cache() do { } while (0) - #endif /* _ASM_POWERPC_PGALLOC_64_H */ diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index c2c6fd438840..5761bee0f004 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -45,6 +45,27 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) pte_fragment_free((unsigned long *)ptepage, 0); } +/* + * Functions that deal with pagetables that could be at any level of + * the table need to be passed an "index_size" so they know how to + * handle allocation. For PTE pages, the allocation size will be + * (2^index_size * sizeof(pointer)) and allocations are drawn from + * the kmem_cache in PGT_CACHE(index_size). + * + * The maximum index size needs to be big enough to allow any + * pagetable sizes we need, but small enough to fit in the low bits of + * any page table pointer. In other words all pagetables, even tiny + * ones, must be aligned to allow at least enough low 0 bits to + * contain this value. This value is also used as a mask, so it must + * be one less than a power of two. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf + +extern struct kmem_cache *pgtable_cache[]; +#define PGT_CACHE(shift) pgtable_cache[shift] + +static inline void check_pgt_cache(void) { } + #ifdef CONFIG_PPC_BOOK3S #include #else -- cgit v1.2.3-59-g8ed1b From bf8156c5aef12621e20afa470ae41f92cdca377b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:10 +0000 Subject: powerpc/mm: Only keep one version of pmd_populate() functions on nohash/32 Use IS_ENABLED(CONFIG_BOOKE) to make single versions of pmd_populate() and pmd_populate_kernel() Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgalloc.h | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 4615801aa953..7ee8e27070f4 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -25,37 +25,25 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pmd_free_tlb(tlb,x,a) do { } while (0) /* #define pgd_populate(mm, pmd, pte) BUG() */ -#ifndef CONFIG_BOOKE - static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *pte) { - *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); + if (IS_ENABLED(CONFIG_BOOKE)) + *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT); + else + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pte_page) { - *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT); + if (IS_ENABLED(CONFIG_BOOKE)) + *pmdp = __pmd((unsigned long)pte_page | _PMD_PRESENT); + else + *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT); } #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -#else - -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, - pte_t *pte) -{ - *pmdp = 
__pmd((unsigned long)pte | _PMD_PRESENT); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pte_page) -{ - *pmdp = __pmd((unsigned long)pte_page | _PMD_PRESENT); -} - -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -#endif static inline void pgtable_free(void *table, unsigned index_size) { -- cgit v1.2.3-59-g8ed1b From 7cec90e9499c25c31b539f8a35d949c8e9043c14 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:11 +0000 Subject: powerpc/mm: refactor pgtable freeing functions on nohash pgtable_free() and others are identical on nohash/32 and 64, so move them into asm/nohash/pgalloc.h Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgalloc.h | 43 --------------------------- arch/powerpc/include/asm/nohash/64/pgalloc.h | 43 --------------------------- arch/powerpc/include/asm/nohash/pgalloc.h | 44 ++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 86 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 7ee8e27070f4..6c0f5151dc1d 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -45,47 +45,4 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -static inline void pgtable_free(void *table, unsigned index_size) -{ - if (!index_size) { - pte_fragment_free((unsigned long *)table, 0); - } else { - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(index_size), table); - } -} - -#define get_hugepd_cache_index(x) (x) - -#ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} -#else -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_free_tlb(tlb, table, 0); -} #endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index ffc86d42816d..c636feced1ff 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -72,49 +72,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); } -static inline void pgtable_free(void *table, int shift) -{ - if (!shift) { - pte_fragment_free((unsigned long *)table, 0); - } else { - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(shift), table); - } -} - -#define get_hugepd_cache_index(x) (x) -#ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - 
void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} - -#else -static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_free_tlb(tlb, table, 0); -} - #define __pmd_free_tlb(tlb, pmd, addr) \ pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) #define __pud_free_tlb(tlb, pud, addr) \ diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h index 0634f2949438..4fccac6af3ad 100644 --- a/arch/powerpc/include/asm/nohash/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -21,4 +21,48 @@ static inline void tlb_flush_pgtable(struct mmu_gather *tlb, #else #include #endif + +static inline void pgtable_free(void *table, int shift) +{ + if (!shift) { + pte_fragment_free((unsigned long *)table, 0); + } else { + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(shift), table); + } +} + +#define get_hugepd_cache_index(x) (x) + +#ifdef CONFIG_SMP +static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +static inline void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); +} + +#else +static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) +{ + pgtable_free(table, shift); +} +#endif + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_free_tlb(tlb, table, 0); +} #endif /* _ASM_POWERPC_NOHASH_PGALLOC_H */ -- cgit v1.2.3-59-g8ed1b From 8a2cc87a24e8c0a823c2e4ec8702c90d743a69d4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:12 +0000 Subject: powerpc/mm: refactor pmd_pgtable() pmd_pgtable() is identical on the 4 subarches, refactor it. 
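For reference, the single definition that replaces the four per-subarch copies is the one added to asm/pgalloc.h below:

static inline pgtable_t pmd_pgtable(pmd_t pmd)
{
	return (pgtable_t)pmd_page_vaddr(pmd);
}

A hypothetical caller (illustrative only, not part of this patch) would use it to recover the PTE page that pmd_populate() installed, for example to free it:

static void example_free_pte(struct mm_struct *mm, pmd_t *pmdp)
{
	/* pmd_pgtable() undoes what pmd_populate() set up */
	pgtable_t pte_page = pmd_pgtable(*pmdp);

	pte_free(mm, pte_page);
}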
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 2 -- arch/powerpc/include/asm/book3s/64/pgalloc.h | 5 ----- arch/powerpc/include/asm/nohash/32/pgalloc.h | 2 -- arch/powerpc/include/asm/nohash/64/pgalloc.h | 2 -- arch/powerpc/include/asm/pgalloc.h | 5 +++++ 5 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 1b9b5c228230..998317702630 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -37,8 +37,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, *pmdp = __pmd(__pa(pte_page) | _PMD_PRESENT); } -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) - static inline void pgtable_free(void *table, unsigned index_size) { if (!index_size) { diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index df2dce6afe14..053a7940504e 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -163,11 +163,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, *pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS); } -static inline pgtable_t pmd_pgtable(pmd_t pmd) -{ - return (pgtable_t)pmd_page_vaddr(pmd); -} - static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 6c0f5151dc1d..137761b01588 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -43,6 +43,4 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT); } -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) - #endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index c636feced1ff..5a0ea63c77c7 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -59,8 +59,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pmd_set(pmd, (unsigned long)pte_page); } -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) - static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 5761bee0f004..2b2c60a1a66d 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -72,4 +72,9 @@ static inline void check_pgt_cache(void) { } #include #endif +static inline pgtable_t pmd_pgtable(pmd_t pmd) +{ + return (pgtable_t)pmd_page_vaddr(pmd); +} + #endif /* _ASM_POWERPC_PGALLOC_H */ -- cgit v1.2.3-59-g8ed1b From 069239169ab060da4236a59d35aec91084cc694d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:13 +0000 Subject: powerpc/mm: refactor pgd_alloc() and pgd_free() on nohash pgd_alloc() and pgd_free() are identical on nohash 32 and 64. 
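Both copies rely on the PGT_CACHE() machinery consolidated into asm/pgalloc.h earlier in this series: a page table of 2^shift pointers is allocated from the kmem_cache indexed by that shift. The shared helper being moved (reproduced from the hunks below) is simply:

static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
	/* PGT_CACHE(PGD_INDEX_SIZE) is the kmem_cache sized for a full PGD */
	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
				pgtable_gfp_flags(mm, GFP_KERNEL));
}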
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgalloc.h | 11 ----------- arch/powerpc/include/asm/nohash/64/pgalloc.h | 11 ----------- arch/powerpc/include/asm/nohash/pgalloc.h | 12 ++++++++++++ 3 files changed, 12 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 137761b01588..11eac371e7e0 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -5,17 +5,6 @@ #include #include -static inline pgd_t *pgd_alloc(struct mm_struct *mm) -{ - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), - pgtable_gfp_flags(mm, GFP_KERNEL)); -} - -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); -} - /* * We don't have any real pmd's, and this code never triggers because * the pgd will always be present.. diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 5a0ea63c77c7..62321cd12da9 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -18,17 +18,6 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -static inline pgd_t *pgd_alloc(struct mm_struct *mm) -{ - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), - pgtable_gfp_flags(mm, GFP_KERNEL)); -} - -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); -} - #define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h index 4fccac6af3ad..332b13b4ecdb 100644 --- a/arch/powerpc/include/asm/nohash/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -3,6 +3,7 @@ #define _ASM_POWERPC_NOHASH_PGALLOC_H #include +#include extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #ifdef CONFIG_PPC64 @@ -16,6 +17,17 @@ static inline void tlb_flush_pgtable(struct mmu_gather *tlb, } #endif /* !CONFIG_PPC_BOOK3E */ +static inline pgd_t *pgd_alloc(struct mm_struct *mm) +{ + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), + pgtable_gfp_flags(mm, GFP_KERNEL)); +} + +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); +} + #ifdef CONFIG_PPC64 #include #else -- cgit v1.2.3-59-g8ed1b From d69ca6bab39e84a84781535b977c7e62c8f84d37 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:25 +0000 Subject: powerpc/32: Move early_init() into a separate file In preparation for KASAN, move early_init() into a separate file in order to allow deactivation of KASAN for that function.
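The point of the split is that kbuild can then switch instrumentation off for just this object file; the opt-out applied to early_32.o later in this series (in the "disable KASAN instrumentation on early/critical files" patch) is:

KASAN_SANITIZE_early_32.o := n

ifdef CONFIG_KASAN
CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING
endif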
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/early_32.c | 36 ++++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/setup_32.c | 28 ---------------------------- 3 files changed, 37 insertions(+), 29 deletions(-) create mode 100644 arch/powerpc/kernel/early_32.c (limited to 'arch') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index cddadccf551d..45e47752b692 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -93,7 +93,7 @@ extra-y += vmlinux.lds obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o -obj-$(CONFIG_PPC32) += entry_32.o setup_32.o +obj-$(CONFIG_PPC32) += entry_32.o setup_32.o early_32.o obj-$(CONFIG_PPC64) += dma-iommu.o iommu.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_BOOTX_TEXT) += btext.o diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c new file mode 100644 index 000000000000..cf3cdd81dc47 --- /dev/null +++ b/arch/powerpc/kernel/early_32.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Early init before relocation + */ + +#include +#include +#include +#include +#include + +/* + * We're called here very early in the boot. + * + * Note that the kernel may be running at an address which is different + * from the address that it was linked at, so we must use RELOC/PTRRELOC + * to access static data (including strings). -- paulus + */ +notrace unsigned long __init early_init(unsigned long dt_ptr) +{ + unsigned long offset = reloc_offset(); + + /* First zero the BSS -- use memset_io, some platforms don't have caches on yet */ + memset_io((void __iomem *)PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start); + + /* + * Identify the CPU type and fix up code sections + * that depend on which cpu we have. + */ + identify_cpu(offset, mfspr(SPRN_PVR)); + + apply_feature_fixups(); + + return KERNELBASE + offset; +} diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 4a65e08a6042..3fb9f64f88fd 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -63,34 +63,6 @@ unsigned int DMA_MODE_WRITE; EXPORT_SYMBOL(DMA_MODE_READ); EXPORT_SYMBOL(DMA_MODE_WRITE); -/* - * We're called here very early in the boot. - * - * Note that the kernel may be running at an address which is different - * from the address that it was linked at, so we must use RELOC/PTRRELOC - * to access static data (including strings). -- paulus - */ -notrace unsigned long __init early_init(unsigned long dt_ptr) -{ - unsigned long offset = reloc_offset(); - - /* First zero the BSS -- use memset_io, some platforms don't have - * caches on yet */ - memset_io((void __iomem *)PTRRELOC(&__bss_start), 0, - __bss_stop - __bss_start); - - /* - * Identify the CPU type and fix up code sections - * that depend on which cpu we have. - */ - identify_cpu(offset, mfspr(SPRN_PVR)); - - apply_feature_fixups(); - - return KERNELBASE + offset; -} - - /* * This is run before start_kernel(), the kernel has been relocated * and we are running with enough of the MMU enabled to have our -- cgit v1.2.3-59-g8ed1b From 26deb04342e343ac58ab05bc7d2345ff0be9b667 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:26 +0000 Subject: powerpc: prepare string/mem functions for KASAN CONFIG_KASAN implements wrappers for memcpy(), memmove() and memset(). Those wrappers do the verification, then call __memcpy(), __memmove() and __memset() respectively.
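For illustration, a simplified sketch of such a wrapper, modelled on the generic code in mm/kasan/common.c of that era (simplified here, not part of this patch):

#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
	/* check both buffers against the shadow before doing the copy */
	check_memory_region((unsigned long)src, len, false, _RET_IP_);
	check_memory_region((unsigned long)dest, len, true, _RET_IP_);

	return __memcpy(dest, src, len);
}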
The arches are therefore expected to rename their optimised functions that way. For files on which KASAN is inhibited, #defines are used to allow them to directly call optimised versions of the functions without going through the KASAN wrappers. See commit 393f203f5fd5 ("x86_64: kasan: add interceptors for memset/memmove/memcpy functions") for details. Other string/mem functions do not (yet) have KASAN wrappers, so we have to fall back to the generic versions when KASAN is active, otherwise KASAN checks would be skipped. Signed-off-by: Christophe Leroy [mpe: Fixups to keep selftests working] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kasan.h | 15 ++++++++++ arch/powerpc/include/asm/string.h | 32 ++++++++++++++++++++-- arch/powerpc/kernel/prom_init_check.sh | 10 ++++++- arch/powerpc/lib/Makefile | 11 ++++++-- arch/powerpc/lib/copy_32.S | 12 ++++++-- arch/powerpc/lib/mem_64.S | 9 ++++-- arch/powerpc/lib/memcpy_64.S | 4 ++- .../selftests/powerpc/copyloops/asm/export.h | 1 + .../selftests/powerpc/copyloops/asm/kasan.h | 0 .../selftests/powerpc/copyloops/asm/ppc_asm.h | 1 + 10 files changed, 82 insertions(+), 13 deletions(-) create mode 100644 arch/powerpc/include/asm/kasan.h create mode 100644 tools/testing/selftests/powerpc/copyloops/asm/kasan.h (limited to 'arch') diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h new file mode 100644 index 000000000000..2c179a39d4ba --- /dev/null +++ b/arch/powerpc/include/asm/kasan.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_KASAN_H +#define __ASM_KASAN_H + +#ifdef CONFIG_KASAN +#define _GLOBAL_KASAN(fn) _GLOBAL(__##fn) +#define _GLOBAL_TOC_KASAN(fn) _GLOBAL_TOC(__##fn) +#define EXPORT_SYMBOL_KASAN(fn) EXPORT_SYMBOL(__##fn) +#else +#define _GLOBAL_KASAN(fn) _GLOBAL(fn) +#define _GLOBAL_TOC_KASAN(fn) _GLOBAL_TOC(fn) +#define EXPORT_SYMBOL_KASAN(fn) +#endif + +#endif diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 1647de15a31e..9bf6dffb4090 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -4,14 +4,17 @@ #ifdef __KERNEL__ +#ifndef CONFIG_KASAN #define __HAVE_ARCH_STRNCPY #define __HAVE_ARCH_STRNCMP +#define __HAVE_ARCH_MEMCHR +#define __HAVE_ARCH_MEMCMP +#define __HAVE_ARCH_MEMSET16 +#endif + #define __HAVE_ARCH_MEMSET #define __HAVE_ARCH_MEMCPY #define __HAVE_ARCH_MEMMOVE -#define __HAVE_ARCH_MEMCMP -#define __HAVE_ARCH_MEMCHR -#define __HAVE_ARCH_MEMSET16 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE extern char * strcpy(char *,const char *); @@ -27,7 +30,27 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); extern void * memcpy_flushcache(void *,const void *,__kernel_size_t); +void *__memset(void *s, int c, __kernel_size_t count); +void *__memcpy(void *to, const void *from, __kernel_size_t n); +void *__memmove(void *to, const void *from, __kernel_size_t n); + +#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) +/* + * For files that are not instrumented (e.g. mm/slub.c) we + * should use not instrumented version of mem* functions. + */ +#define memcpy(dst, src, len) __memcpy(dst, src, len) +#define memmove(dst, src, len) __memmove(dst, src, len) +#define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc.
*/ +#endif + +#endif + #ifdef CONFIG_PPC64 +#ifndef CONFIG_KASAN #define __HAVE_ARCH_MEMSET32 #define __HAVE_ARCH_MEMSET64 @@ -49,8 +72,11 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) { return __memset64(p, v, n * 8); } +#endif #else +#ifndef CONFIG_KASAN #define __HAVE_ARCH_STRLEN +#endif extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); #endif diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 667df97d2595..181fd10008ef 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -16,8 +16,16 @@ # If you really need to reference something from prom_init.o add # it to the list below: +grep "^CONFIG_KASAN=y$" .config >/dev/null +if [ $? -eq 0 ] +then + MEM_FUNCS="__memcpy __memset" +else + MEM_FUNCS="memcpy memset" +fi + WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush -_end enter_prom memcpy memset reloc_offset __secondary_hold +_end enter_prom $MEM_FUNCS reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224 reloc_got2 kernstart_addr memstart_addr linux_banner _stext diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 79396e184bca..47a4de434c22 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -8,9 +8,14 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) -obj-y += string.o alloc.o code-patching.o feature-fixups.o +obj-y += alloc.o code-patching.o feature-fixups.o -obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o +ifndef CONFIG_KASAN +obj-y += string.o memcmp_$(BITS).o +obj-$(CONFIG_PPC32) += strlen_32.o +endif + +obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o @@ -34,7 +39,7 @@ obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \ test_emulate_step_exec_instr.o obj-y += checksum_$(BITS).o checksum_wrappers.o \ - string_$(BITS).o memcmp_$(BITS).o + string_$(BITS).o obj-y += sstep.o ldstfp.o quad.o obj64-y += quad.o diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index ba66846fe973..d5642481fb98 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -14,6 +14,7 @@ #include #include #include +#include #define COPY_16_BYTES \ lwz r7,4(r4); \ @@ -68,6 +69,7 @@ CACHELINE_BYTES = L1_CACHE_BYTES LG_CACHELINE_BYTES = L1_CACHE_SHIFT CACHELINE_MASK = (L1_CACHE_BYTES-1) +#ifndef CONFIG_KASAN _GLOBAL(memset16) rlwinm. r0 ,r5, 31, 1, 31 addi r6, r3, -4 @@ -81,6 +83,7 @@ _GLOBAL(memset16) sth r4, 4(r6) blr EXPORT_SYMBOL(memset16) +#endif /* * Use dcbz on the complete cache lines in the destination @@ -91,7 +94,7 @@ EXPORT_SYMBOL(memset16) * We therefore skip the optimised bloc that uses dcbz. This jump is * replaced by a nop once cache is active. This is done in machine_init() */ -_GLOBAL(memset) +_GLOBAL_KASAN(memset) cmplwi 0,r5,4 blt 7f @@ -151,6 +154,7 @@ _GLOBAL(memset) bdnz 9b blr EXPORT_SYMBOL(memset) +EXPORT_SYMBOL_KASAN(memset) /* * This version uses dcbz on the complete cache lines in the @@ -163,12 +167,12 @@ EXPORT_SYMBOL(memset) * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is * replaced by a nop once cache is active. 
This is done in machine_init() */ -_GLOBAL(memmove) +_GLOBAL_KASAN(memmove) cmplw 0,r3,r4 bgt backwards_memcpy /* fall through */ -_GLOBAL(memcpy) +_GLOBAL_KASAN(memcpy) 1: b generic_memcpy patch_site 1b, patch__memcpy_nocache @@ -244,6 +248,8 @@ _GLOBAL(memcpy) 65: blr EXPORT_SYMBOL(memcpy) EXPORT_SYMBOL(memmove) +EXPORT_SYMBOL_KASAN(memcpy) +EXPORT_SYMBOL_KASAN(memmove) generic_memcpy: srwi. r7,r5,3 diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S index 3c3be02f33b7..7f6bd031c306 100644 --- a/arch/powerpc/lib/mem_64.S +++ b/arch/powerpc/lib/mem_64.S @@ -12,7 +12,9 @@ #include #include #include +#include +#ifndef CONFIG_KASAN _GLOBAL(__memset16) rlwimi r4,r4,16,0,15 /* fall through */ @@ -29,8 +31,9 @@ _GLOBAL(__memset64) EXPORT_SYMBOL(__memset16) EXPORT_SYMBOL(__memset32) EXPORT_SYMBOL(__memset64) +#endif -_GLOBAL(memset) +_GLOBAL_KASAN(memset) neg r0,r3 rlwimi r4,r4,8,16,23 andi. r0,r0,7 /* # bytes to be 8-byte aligned */ @@ -96,8 +99,9 @@ _GLOBAL(memset) stb r4,0(r6) blr EXPORT_SYMBOL(memset) +EXPORT_SYMBOL_KASAN(memset) -_GLOBAL_TOC(memmove) +_GLOBAL_TOC_KASAN(memmove) cmplw 0,r3,r4 bgt backwards_memcpy b memcpy @@ -139,3 +143,4 @@ _GLOBAL(backwards_memcpy) mtctr r7 b 1b EXPORT_SYMBOL(memmove) +EXPORT_SYMBOL_KASAN(memmove) diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S index 273ea67e60a1..25c3772c1dfb 100644 --- a/arch/powerpc/lib/memcpy_64.S +++ b/arch/powerpc/lib/memcpy_64.S @@ -11,6 +11,7 @@ #include #include #include +#include #ifndef SELFTEST_CASE /* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */ @@ -18,7 +19,7 @@ #endif .align 7 -_GLOBAL_TOC(memcpy) +_GLOBAL_TOC_KASAN(memcpy) BEGIN_FTR_SECTION #ifdef __LITTLE_ENDIAN__ cmpdi cr7,r5,0 @@ -230,3 +231,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) blr #endif EXPORT_SYMBOL(memcpy) +EXPORT_SYMBOL_KASAN(memcpy) diff --git a/tools/testing/selftests/powerpc/copyloops/asm/export.h b/tools/testing/selftests/powerpc/copyloops/asm/export.h index 0bab35f6777a..05c1663c89b0 100644 --- a/tools/testing/selftests/powerpc/copyloops/asm/export.h +++ b/tools/testing/selftests/powerpc/copyloops/asm/export.h @@ -1,2 +1,3 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define EXPORT_SYMBOL(x) +#define EXPORT_SYMBOL_KASAN(x) diff --git a/tools/testing/selftests/powerpc/copyloops/asm/kasan.h b/tools/testing/selftests/powerpc/copyloops/asm/kasan.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h index 0605df807593..58c1cef3e399 100644 --- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h +++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h @@ -25,6 +25,7 @@ #define _GLOBAL(A) FUNC_START(test_ ## A) #define _GLOBAL_TOC(A) _GLOBAL(A) +#define _GLOBAL_TOC_KASAN(A) _GLOBAL(A) #define PPC_MTOCRF(A, B) mtocrf A, B -- cgit v1.2.3-59-g8ed1b From cbe46bd4f5104552b612505b73d366f66efc2341 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:27 +0000 Subject: powerpc: remove CONFIG_CMDLINE #ifdef mess This patch makes CONFIG_CMDLINE defined at all times.
It avoids having to enclose related code inside #ifdef CONFIG_CMDLINE Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 6 +++--- arch/powerpc/kernel/prom_init.c | 9 +++------ 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7815eb0cc2a5..515bd10d32f6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -831,9 +831,9 @@ config CMDLINE_BOOL bool "Default bootloader kernel arguments" config CMDLINE - string "Initial kernel command string" - depends on CMDLINE_BOOL - default "console=ttyS0,9600 console=tty0 root=/dev/sda2" + string "Initial kernel command string" if CMDLINE_BOOL + default "console=ttyS0,9600 console=tty0 root=/dev/sda2" if CMDLINE_BOOL + default "" help On some platforms, there is currently no way for the boot loader to pass arguments to the kernel. For these platforms, you can supply diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index f33ff4163a51..ecf083c46bdb 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -631,17 +631,14 @@ static void __init early_cmdline_parse(void) const char *opt; char *p; - int l __maybe_unused = 0; + int l = 0; prom_cmd_line[0] = 0; p = prom_cmd_line; if ((long)prom.chosen > 0) l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1); -#ifdef CONFIG_CMDLINE - if (l <= 0 || p[0] == '\0') /* dbl check */ - strlcpy(prom_cmd_line, - CONFIG_CMDLINE, sizeof(prom_cmd_line)); -#endif /* CONFIG_CMDLINE */ + if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && (l <= 0 || p[0] == '\0')) /* dbl check */ + strlcpy(prom_cmd_line, CONFIG_CMDLINE, sizeof(prom_cmd_line)); prom_printf("command line: %s\n", prom_cmd_line); #ifdef CONFIG_PPC64 -- cgit v1.2.3-59-g8ed1b From 450e7dd4001f22f796e22422dd1d2cbd5bda21fc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:28 +0000 Subject: powerpc/prom_init: don't use string functions from lib/ When KASAN is active, the string functions in lib/ are doing the KASAN checks. This is too early for prom_init. This patch implements dedicated string functions for prom_init, which will be compiled in with KASAN disabled. Size of prom_init before the patch: text data bss dec hex filename 12060 488 6960 19508 4c34 arch/powerpc/kernel/prom_init.o Size of prom_init after the patch: text data bss dec hex filename 12460 488 6960 19908 4dc4 arch/powerpc/kernel/prom_init.o This increases the size of prom_init a bit, but as prom_init is in __init section, it is freed after boot anyway. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 211 ++++++++++++++++++++++++++------- arch/powerpc/kernel/prom_init_check.sh | 2 +- 2 files changed, 171 insertions(+), 42 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ecf083c46bdb..7017156168e8 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -224,6 +224,135 @@ static bool __prombss rtas_has_query_cpu_stopped; #define PHANDLE_VALID(p) ((p) != 0 && (p) != PROM_ERROR) #define IHANDLE_VALID(i) ((i) != 0 && (i) != PROM_ERROR) +/* Copied from lib/string.c and lib/kstrtox.c */ + +static int __init prom_strcmp(const char *cs, const char *ct) +{ + unsigned char c1, c2; + + while (1) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? 
-1 : 1; + if (!c1) + break; + } + return 0; +} + +static char __init *prom_strcpy(char *dest, const char *src) +{ + char *tmp = dest; + + while ((*dest++ = *src++) != '\0') + /* nothing */; + return tmp; +} + +static int __init prom_strncmp(const char *cs, const char *ct, size_t count) +{ + unsigned char c1, c2; + + while (count) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (!c1) + break; + count--; + } + return 0; +} + +static size_t __init prom_strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + /* nothing */; + return sc - s; +} + +static int __init prom_memcmp(const void *cs, const void *ct, size_t count) +{ + const unsigned char *su1, *su2; + int res = 0; + + for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) + if ((res = *su1 - *su2) != 0) + break; + return res; +} + +static char __init *prom_strstr(const char *s1, const char *s2) +{ + size_t l1, l2; + + l2 = prom_strlen(s2); + if (!l2) + return (char *)s1; + l1 = prom_strlen(s1); + while (l1 >= l2) { + l1--; + if (!prom_memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} + +static size_t __init prom_strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = prom_strlen(src); + + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + memcpy(dest, src, len); + dest[len] = '\0'; + } + return ret; +} + +#ifdef CONFIG_PPC_PSERIES +static int __init prom_strtobool(const char *s, bool *res) +{ + if (!s) + return -EINVAL; + + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + return 0; + case 'n': + case 'N': + case '0': + *res = false; + return 0; + case 'o': + case 'O': + switch (s[1]) { + case 'n': + case 'N': + *res = true; + return 0; + case 'f': + case 'F': + *res = false; + return 0; + default: + break; + } + default: + break; + } + + return -EINVAL; +} +#endif /* This is the one and *ONLY* place where we actually call open * firmware. 
@@ -555,7 +684,7 @@ static int __init prom_setprop(phandle node, const char *nodename, add_string(&p, tohex((u32)(unsigned long) value)); add_string(&p, tohex(valuelen)); add_string(&p, tohex(ADDR(pname))); - add_string(&p, tohex(strlen(pname))); + add_string(&p, tohex(prom_strlen(pname))); add_string(&p, "property"); *p = 0; return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd); @@ -638,23 +767,23 @@ static void __init early_cmdline_parse(void) if ((long)prom.chosen > 0) l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1); if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && (l <= 0 || p[0] == '\0')) /* dbl check */ - strlcpy(prom_cmd_line, CONFIG_CMDLINE, sizeof(prom_cmd_line)); + prom_strlcpy(prom_cmd_line, CONFIG_CMDLINE, sizeof(prom_cmd_line)); prom_printf("command line: %s\n", prom_cmd_line); #ifdef CONFIG_PPC64 - opt = strstr(prom_cmd_line, "iommu="); + opt = prom_strstr(prom_cmd_line, "iommu="); if (opt) { prom_printf("iommu opt is: %s\n", opt); opt += 6; while (*opt && *opt == ' ') opt++; - if (!strncmp(opt, "off", 3)) + if (!prom_strncmp(opt, "off", 3)) prom_iommu_off = 1; - else if (!strncmp(opt, "force", 5)) + else if (!prom_strncmp(opt, "force", 5)) prom_iommu_force_on = 1; } #endif - opt = strstr(prom_cmd_line, "mem="); + opt = prom_strstr(prom_cmd_line, "mem="); if (opt) { opt += 4; prom_memory_limit = prom_memparse(opt, (const char **)&opt); @@ -666,13 +795,13 @@ static void __init early_cmdline_parse(void) #ifdef CONFIG_PPC_PSERIES prom_radix_disable = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); - opt = strstr(prom_cmd_line, "disable_radix"); + opt = prom_strstr(prom_cmd_line, "disable_radix"); if (opt) { opt += 13; if (*opt && *opt == '=') { bool val; - if (kstrtobool(++opt, &val)) + if (prom_strtobool(++opt, &val)) prom_radix_disable = false; else prom_radix_disable = val; @@ -1025,7 +1154,7 @@ static int __init prom_count_smt_threads(void) type[0] = 0; prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, "cpu")) + if (prom_strcmp(type, "cpu")) continue; /* * There is an entry for each smt thread, each entry being @@ -1472,7 +1601,7 @@ static void __init prom_init_mem(void) */ prom_getprop(node, "name", type, sizeof(type)); } - if (strcmp(type, "memory")) + if (prom_strcmp(type, "memory")) continue; plen = prom_getprop(node, "reg", regbuf, sizeof(regbuf)); @@ -1753,19 +1882,19 @@ static void __init prom_initialize_tce_table(void) prom_getprop(node, "device_type", type, sizeof(type)); prom_getprop(node, "model", model, sizeof(model)); - if ((type[0] == 0) || (strstr(type, "pci") == NULL)) + if ((type[0] == 0) || (prom_strstr(type, "pci") == NULL)) continue; /* Keep the old logic intact to avoid regression. 
*/ if (compatible[0] != 0) { - if ((strstr(compatible, "python") == NULL) && - (strstr(compatible, "Speedwagon") == NULL) && - (strstr(compatible, "Winnipeg") == NULL)) + if ((prom_strstr(compatible, "python") == NULL) && + (prom_strstr(compatible, "Speedwagon") == NULL) && + (prom_strstr(compatible, "Winnipeg") == NULL)) continue; } else if (model[0] != 0) { - if ((strstr(model, "ython") == NULL) && - (strstr(model, "peedwagon") == NULL) && - (strstr(model, "innipeg") == NULL)) + if ((prom_strstr(model, "ython") == NULL) && + (prom_strstr(model, "peedwagon") == NULL) && + (prom_strstr(model, "innipeg") == NULL)) continue; } @@ -1914,12 +2043,12 @@ static void __init prom_hold_cpus(void) type[0] = 0; prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, "cpu") != 0) + if (prom_strcmp(type, "cpu") != 0) continue; /* Skip non-configured cpus. */ if (prom_getprop(node, "status", type, sizeof(type)) > 0) - if (strcmp(type, "okay") != 0) + if (prom_strcmp(type, "okay") != 0) continue; reg = cpu_to_be32(-1); /* make sparse happy */ @@ -1995,9 +2124,9 @@ static void __init prom_find_mmu(void) return; version[sizeof(version) - 1] = 0; /* XXX might need to add other versions here */ - if (strcmp(version, "Open Firmware, 1.0.5") == 0) + if (prom_strcmp(version, "Open Firmware, 1.0.5") == 0) of_workarounds = OF_WA_CLAIM; - else if (strncmp(version, "FirmWorks,3.", 12) == 0) { + else if (prom_strncmp(version, "FirmWorks,3.", 12) == 0) { of_workarounds = OF_WA_CLAIM | OF_WA_LONGTRAIL; call_prom("interpret", 1, 1, "dev /memory 0 to allow-reclaim"); } else @@ -2030,7 +2159,7 @@ static void __init prom_init_stdout(void) call_prom("instance-to-path", 3, 1, prom.stdout, path, 255); prom_printf("OF stdout device is: %s\n", of_stdout_device); prom_setprop(prom.chosen, "/chosen", "linux,stdout-path", - path, strlen(path) + 1); + path, prom_strlen(path) + 1); /* instance-to-package fails on PA-Semi */ stdout_node = call_prom("instance-to-package", 1, 1, prom.stdout); @@ -2040,7 +2169,7 @@ static void __init prom_init_stdout(void) /* If it's a display, note it */ memset(type, 0, sizeof(type)); prom_getprop(stdout_node, "device_type", type, sizeof(type)); - if (strcmp(type, "display") == 0) + if (prom_strcmp(type, "display") == 0) prom_setprop(stdout_node, path, "linux,boot-display", NULL, 0); } } @@ -2061,19 +2190,19 @@ static int __init prom_find_machine_type(void) compat[len] = 0; while (i < len) { char *p = &compat[i]; - int sl = strlen(p); + int sl = prom_strlen(p); if (sl == 0) break; - if (strstr(p, "Power Macintosh") || - strstr(p, "MacRISC")) + if (prom_strstr(p, "Power Macintosh") || + prom_strstr(p, "MacRISC")) return PLATFORM_POWERMAC; #ifdef CONFIG_PPC64 /* We must make sure we don't detect the IBM Cell * blades as pSeries due to some firmware issues, * so we do it here. */ - if (strstr(p, "IBM,CBEA") || - strstr(p, "IBM,CPBW-1.0")) + if (prom_strstr(p, "IBM,CBEA") || + prom_strstr(p, "IBM,CPBW-1.0")) return PLATFORM_GENERIC; #endif /* CONFIG_PPC64 */ i += sl + 1; @@ -2090,7 +2219,7 @@ static int __init prom_find_machine_type(void) compat, sizeof(compat)-1); if (len <= 0) return PLATFORM_GENERIC; - if (strcmp(compat, "chrp")) + if (prom_strcmp(compat, "chrp")) return PLATFORM_GENERIC; /* Default to pSeries. 
We need to know if we are running LPAR */ @@ -2152,7 +2281,7 @@ static void __init prom_check_displays(void) for (node = 0; prom_next_node(&node); ) { memset(type, 0, sizeof(type)); prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, "display") != 0) + if (prom_strcmp(type, "display") != 0) continue; /* It seems OF doesn't null-terminate the path :-( */ @@ -2256,9 +2385,9 @@ static unsigned long __init dt_find_string(char *str) s = os = (char *)dt_string_start; s += 4; while (s < (char *)dt_string_end) { - if (strcmp(s, str) == 0) + if (prom_strcmp(s, str) == 0) return s - os; - s += strlen(s) + 1; + s += prom_strlen(s) + 1; } return 0; } @@ -2291,7 +2420,7 @@ static void __init scan_dt_build_strings(phandle node, } /* skip "name" */ - if (strcmp(namep, "name") == 0) { + if (prom_strcmp(namep, "name") == 0) { *mem_start = (unsigned long)namep; prev_name = "name"; continue; @@ -2303,7 +2432,7 @@ static void __init scan_dt_build_strings(phandle node, namep = sstart + soff; } else { /* Trim off some if we can */ - *mem_start = (unsigned long)namep + strlen(namep) + 1; + *mem_start = (unsigned long)namep + prom_strlen(namep) + 1; dt_string_end = *mem_start; } prev_name = namep; @@ -2372,7 +2501,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, break; /* skip "name" */ - if (strcmp(pname, "name") == 0) { + if (prom_strcmp(pname, "name") == 0) { prev_name = "name"; continue; } @@ -2403,7 +2532,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, call_prom("getprop", 4, 1, node, pname, valp, l); *mem_start = _ALIGN(*mem_start, 4); - if (!strcmp(pname, "phandle")) + if (!prom_strcmp(pname, "phandle")) has_phandle = 1; } @@ -2473,8 +2602,8 @@ static void __init flatten_device_tree(void) /* Add "phandle" in there, we'll need it */ namep = make_room(&mem_start, &mem_end, 16, 1); - strcpy(namep, "phandle"); - mem_start = (unsigned long)namep + strlen(namep) + 1; + prom_strcpy(namep, "phandle"); + mem_start = (unsigned long)namep + prom_strlen(namep) + 1; /* Build string array */ prom_printf("Building dt strings...\n"); @@ -2796,7 +2925,7 @@ static void __init fixup_device_tree_efika(void) rv = prom_getprop(node, "model", prop, sizeof(prop)); if (rv == PROM_ERROR) return; - if (strcmp(prop, "EFIKA5K2")) + if (prom_strcmp(prop, "EFIKA5K2")) return; prom_printf("Applying EFIKA device tree fixups\n"); @@ -2804,13 +2933,13 @@ static void __init fixup_device_tree_efika(void) /* Claiming to be 'chrp' is death */ node = call_prom("finddevice", 1, 1, ADDR("/")); rv = prom_getprop(node, "device_type", prop, sizeof(prop)); - if (rv != PROM_ERROR && (strcmp(prop, "chrp") == 0)) + if (rv != PROM_ERROR && (prom_strcmp(prop, "chrp") == 0)) prom_setprop(node, "/", "device_type", "efika", sizeof("efika")); /* CODEGEN,description is exposed in /proc/cpuinfo so fix that too */ rv = prom_getprop(node, "CODEGEN,description", prop, sizeof(prop)); - if (rv != PROM_ERROR && (strstr(prop, "CHRP"))) + if (rv != PROM_ERROR && (prom_strstr(prop, "CHRP"))) prom_setprop(node, "/", "CODEGEN,description", "Efika 5200B PowerPC System", sizeof("Efika 5200B PowerPC System")); diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 181fd10008ef..4cac45cb5de5 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -27,7 +27,7 @@ fi WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush _end enter_prom $MEM_FUNCS reloc_offset __secondary_hold 
__secondary_hold_acknowledge __secondary_hold_spinloop __start -strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224 +logo_linux_clut224 reloc_got2 kernstart_addr memstart_addr linux_banner _stext __prom_init_toc_start __prom_init_toc_end btext_setup_display TOC." -- cgit v1.2.3-59-g8ed1b From adcf59187e2705721ccf23733a5fa2fb20d91415 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:29 +0000 Subject: powerpc: don't use direct assignment during early boot. In kernel/cputable.c, explicitly use memcpy() instead of *y = *x; this will allow GCC to replace it with __memcpy() when KASAN is selected. Acked-by: Dmitry Vyukov Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/cputable.c | 13 ++++++++++--- arch/powerpc/kernel/prom_init.c | 10 ++++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 1eab54bc6ee9..cd12f362b61f 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -2147,7 +2147,11 @@ void __init set_cur_cpu_spec(struct cpu_spec *s) struct cpu_spec *t = &the_cpu_spec; t = PTRRELOC(t); - *t = *s; + /* + * use memcpy() instead of *t = *s so that GCC replaces it + * by __memcpy() when KASAN is active + */ + memcpy(t, s, sizeof(*t)); *PTRRELOC(&cur_cpu_spec) = &the_cpu_spec; } @@ -2161,8 +2165,11 @@ static struct cpu_spec * __init setup_cpu_spec(unsigned long offset, t = PTRRELOC(t); old = *t; - /* Copy everything, then do fixups */ - *t = *s; + /* + * Copy everything, then do fixups. Use memcpy() instead of *t = *s + * so that GCC replaces it by __memcpy() when KASAN is active + */ + memcpy(t, s, sizeof(*t)); /* * If we are overriding a previous value derived from the real diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 7017156168e8..d3b0d543d924 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1264,8 +1264,14 @@ static void __init prom_check_platform_support(void) int prop_len = prom_getproplen(prom.chosen, "ibm,arch-vec-5-platform-support"); - /* First copy the architecture vec template */ - ibm_architecture_vec = ibm_architecture_vec_template; + /* + * First copy the architecture vec template + * + * use memcpy() instead of *vec = *vec_template so that GCC replaces it + * by __memcpy() when KASAN is active + */ + memcpy(&ibm_architecture_vec, &ibm_architecture_vec_template, + sizeof(ibm_architecture_vec)); if (prop_len > 1) { int i; -- cgit v1.2.3-59-g8ed1b From 7934cea7f0b93fcfdb3b175df94f539e4af86c9b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:30 +0000 Subject: powerpc/32: use memset() instead of memset_io() to zero BSS Since commit 400c47d81ca38 ("powerpc32: memset: only use dcbz once cache is enabled"), memset() can be used before activation of the cache, so no need to use memset_io() for zeroing the BSS.
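The patch site mentioned in the copy_32.S comments above is what makes this safe: memset() initially branches around its dcbz-based fast path, and once the data cache is up machine_init() patches the branch away. A rough sketch of the mechanism (simplified; the actual calls live in arch/powerpc/kernel/setup_32.c):

/* in machine_init(), once the cache is usable, the branch that made
 * the copy loop avoid dcbz is overwritten with a nop; memset()'s
 * equivalent site is patched similarly */
patch_instruction_site(&patch__memcpy_nocache, PPC_INST_NOP);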
Acked-by: Dmitry Vyukov Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/early_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c index cf3cdd81dc47..3482118ffe76 100644 --- a/arch/powerpc/kernel/early_32.c +++ b/arch/powerpc/kernel/early_32.c @@ -21,8 +21,8 @@ notrace unsigned long __init early_init(unsigned long dt_ptr) { unsigned long offset = reloc_offset(); - /* First zero the BSS -- use memset_io, some platforms don't have caches on yet */ - memset_io((void __iomem *)PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start); + /* First zero the BSS */ + memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start); /* * Identify the CPU type and fix up code sections -- cgit v1.2.3-59-g8ed1b From a67beca077ef79e971443aa6af6b14d4b3fb3bd6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:31 +0000 Subject: powerpc/32: make KVIRT_TOP dependent on FIXMAP_START When we add the KASAN shadow area, KVIRT_TOP can no longer be fixed at 0xfe000000. This patch uses FIXADDR_START to define KVIRT_TOP. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgtable.h | 13 ++++++++++--- arch/powerpc/include/asm/nohash/32/pgtable.h | 13 ++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index aa8406b8f7ba..838de59f6754 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -134,15 +134,24 @@ static inline bool pte_user(pte_t pte) #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) + +#ifndef __ASSEMBLY__ + +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); + +#endif /* !__ASSEMBLY__ */ + /* * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary * value (for now) on others, from where we can start layout kernel * virtual space that goes below PKMAP and FIXMAP */ +#include + #ifdef CONFIG_HIGHMEM #define KVIRT_TOP PKMAP_BASE #else -#define KVIRT_TOP (0xfe000000UL) /* for now, could be FIXMAP_BASE ?
*/ +#define KVIRT_TOP FIXADDR_START #endif /* @@ -373,8 +382,6 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); - /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} static inline int pte_read(pte_t pte) { return 1; } diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index bed433358260..0284f8f5305f 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -64,15 +64,24 @@ extern int icache_44x_need_flush; #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) +#ifndef __ASSEMBLY__ + +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); + +#endif /* !__ASSEMBLY__ */ + + /* * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary * value (for now) on others, from where we can start layout kernel * virtual space that goes below PKMAP and FIXMAP */ +#include + #ifdef CONFIG_HIGHMEM #define KVIRT_TOP PKMAP_BASE #else -#define KVIRT_TOP (0xfe000000UL) /* for now, could be FIXMAP_BASE ? */ +#define KVIRT_TOP FIXADDR_START #endif /* @@ -379,8 +388,6 @@ static inline int pte_young(pte_t pte) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ -- cgit v1.2.3-59-g8ed1b From b4abe38fd698ace6942edeeb79a5b8a60a7af4fa Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:32 +0000 Subject: powerpc/32: prepare shadow area for KASAN This patch prepares a shadow area for KASAN. The shadow area will be at the top of the kernel virtual memory space above the fixmap area and will occupy one eighth of the total kernel virtual memory space. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 5 +++++ arch/powerpc/include/asm/fixmap.h | 5 +++++ arch/powerpc/include/asm/kasan.h | 16 ++++++++++++++++ arch/powerpc/mm/mem.c | 4 ++++ arch/powerpc/mm/ptdump/ptdump.c | 8 ++++++++ 5 files changed, 38 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 4e00cb0a5464..61febbbdd02b 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -366,3 +366,8 @@ config PPC_FAST_ENDIAN_SWITCH depends on DEBUG_KERNEL && PPC_BOOK3S_64 help If you're unsure what this is, say N. 
+ +config KASAN_SHADOW_OFFSET hex depends on KASAN default 0xe0000000 diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index b9fbed84ddca..0cfc365d814b 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -22,7 +22,12 @@ #include #endif +#ifdef CONFIG_KASAN +#include +#define FIXADDR_TOP (KASAN_SHADOW_START - PAGE_SIZE) +#else #define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) +#endif /* * Here we define all the compile-time 'special' virtual diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 2c179a39d4ba..05274dea3109 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -12,4 +12,20 @@ #define EXPORT_SYMBOL_KASAN(fn) #endif +#ifndef __ASSEMBLY__ + +#include + +#define KASAN_SHADOW_SCALE_SHIFT 3 + +#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ + (PAGE_OFFSET >> KASAN_SHADOW_SCALE_SHIFT)) + +#define KASAN_SHADOW_OFFSET ASM_CONST(CONFIG_KASAN_SHADOW_OFFSET) + +#define KASAN_SHADOW_END 0UL + +#define KASAN_SHADOW_SIZE (KASAN_SHADOW_END - KASAN_SHADOW_START) + +#endif /* __ASSEMBLY */ #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 105c58f8900a..cd525d709072 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -310,6 +310,10 @@ void __init mem_init(void) mem_init_print_info(NULL); #ifdef CONFIG_PPC32 pr_info("Kernel virtual memory layout:\n"); +#ifdef CONFIG_KASAN + pr_info(" * 0x%08lx..0x%08lx : kasan shadow mem\n", + KASAN_SHADOW_START, KASAN_SHADOW_END); +#endif pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP); #ifdef CONFIG_HIGHMEM pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 63fc56feea15..48135ba6fa74 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -100,6 +100,10 @@ static struct addr_marker address_markers[] = { #endif { 0, "Fixmap start" }, { 0, "Fixmap end" }, +#endif +#ifdef CONFIG_KASAN + { 0, "kasan shadow mem start" }, + { 0, "kasan shadow mem end" }, #endif { -1, NULL }, }; @@ -323,6 +327,10 @@ static void populate_markers(void) #endif address_markers[i++].start_address = FIXADDR_START; address_markers[i++].start_address = FIXADDR_TOP; +#ifdef CONFIG_KASAN + address_markers[i++].start_address = KASAN_SHADOW_START; + address_markers[i++].start_address = KASAN_SHADOW_END; +#endif #endif /* CONFIG_PPC64 */ } -- cgit v1.2.3-59-g8ed1b From f072015c7b742c42aa5649d22f43163cd0eb7024 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:33 +0000 Subject: powerpc: disable KASAN instrumentation on early/critical files. All files containing functions run before kasan_early_init() is called must have KASAN instrumentation disabled. For those files, branch profiling also has to be disabled, otherwise each if () generates a call to ftrace_likely_update().
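The ftrace_likely_update() calls come from the branch-profiling macros in include/linux/compiler.h; roughly (a simplified sketch), when branch profiling is enabled and DISABLE_BRANCH_PROFILING is not defined:

/* likely()/unlikely() (and, with CONFIG_PROFILE_ALL_BRANCHES, plain
 * if ()) expand to a check that records each branch outcome: */
#define likely(x)	(__branch_check__(x, 1, __builtin_constant_p(x)))

/* ...and __branch_check__ ends up calling: */
void ftrace_likely_update(struct ftrace_likely_data *f, int val,
			  int expect, int is_constant);

Such a call must not happen before the kernel is sufficiently initialised, hence -DDISABLE_BRANCH_PROFILING on these early objects.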
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 12 ++++++++++++ arch/powerpc/lib/Makefile | 8 ++++++++ arch/powerpc/mm/Makefile | 6 ++++++ arch/powerpc/platforms/powermac/Makefile | 6 ++++++ arch/powerpc/purgatory/Makefile | 3 +++ arch/powerpc/xmon/Makefile | 1 + 6 files changed, 36 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 45e47752b692..0ea6c4aa3a20 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -31,6 +31,18 @@ CFLAGS_REMOVE_btext.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom.o = $(CC_FLAGS_FTRACE) endif +KASAN_SANITIZE_early_32.o := n +KASAN_SANITIZE_cputable.o := n +KASAN_SANITIZE_prom_init.o := n +KASAN_SANITIZE_btext.o := n + +ifdef CONFIG_KASAN +CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_cputable.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_prom_init.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ process.o systbl.o idle.o \ diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 47a4de434c22..c55f9c27bf79 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -8,6 +8,14 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) +KASAN_SANITIZE_code-patching.o := n +KASAN_SANITIZE_feature-fixups.o := n + +ifdef CONFIG_KASAN +CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += alloc.o code-patching.o feature-fixups.o ifndef CONFIG_KASAN diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index c7d5f37f7c52..62735f335bce 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,6 +5,12 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) +KASAN_SANITIZE_ppc_mmu_32.o := n + +ifdef CONFIG_KASAN +CFLAGS_ppc_mmu_32.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o \ diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index 20ebf35d7913..f4247ade71ca 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -2,6 +2,12 @@ CFLAGS_bootx_init.o += -fPIC CFLAGS_bootx_init.o += $(call cc-option, -fno-stack-protector) +KASAN_SANITIZE_bootx_init.o := n + +ifdef CONFIG_KASAN +CFLAGS_bootx_init.o += -DDISABLE_BRANCH_PROFILING +endif + ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_bootx_init.o = $(CC_FLAGS_FTRACE) diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile index 4314ba5baf43..7c6d8b14f440 100644 --- a/arch/powerpc/purgatory/Makefile +++ b/arch/powerpc/purgatory/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 + +KASAN_SANITIZE := n + targets += trampoline.o purgatory.ro kexec-purgatory.c LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 3050f9323254..f142570ad860 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -7,6 +7,7 @@ subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) GCOV_PROFILE := n KCOV_INSTRUMENT := n UBSAN_SANITIZE := n +KASAN_SANITIZE := n # Disable ftrace for the entire directory ORIG_CFLAGS := $(KBUILD_CFLAGS) -- 
cgit v1.2.3-59-g8ed1b
From 2edb16efc899f9c232e2d880930b855e4cf55df4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:34 +0000
Subject: powerpc/32: Add KASAN support

This patch adds KASAN support for PPC32. The following patch will add an early activation of the hash table for book3s. Until then, a warning will be raised if trying to use KASAN on a hash 6xx.

To support KASAN, this patch initialises the MMU mappings for accessing the KASAN shadow area defined in a previous patch. An early mapping is set up as soon as the kernel code has been relocated to its final place. The final mapping is then set up once paging is initialised.

For modules, the shadow area is allocated at module_alloc().

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/Kconfig | 1 +
 arch/powerpc/include/asm/kasan.h | 9 ++
 arch/powerpc/kernel/head_32.S | 3 +
 arch/powerpc/kernel/head_40x.S | 3 +
 arch/powerpc/kernel/head_44x.S | 3 +
 arch/powerpc/kernel/head_8xx.S | 3 +
 arch/powerpc/kernel/head_fsl_booke.S | 3 +
 arch/powerpc/kernel/setup-common.c | 3 +
 arch/powerpc/mm/Makefile | 1 +
 arch/powerpc/mm/init_32.c | 3 +
 arch/powerpc/mm/kasan/kasan_init_32.c | 156 ++++++++++++++++++++++++++++++++++
 11 files changed, 188 insertions(+)
 create mode 100644 arch/powerpc/mm/kasan/kasan_init_32.c
(limited to 'arch')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 515bd10d32f6..2711aac24621 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -173,6 +173,7 @@ config PPC
 	select GENERIC_TIME_VSYSCALL
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_ARCH_KASAN if PPC32
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index 05274dea3109..296e51c2f066 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -27,5 +27,14 @@
 #define KASAN_SHADOW_SIZE (KASAN_SHADOW_END - KASAN_SHADOW_START)
+#ifdef CONFIG_KASAN
+void kasan_early_init(void);
+void kasan_mmu_init(void);
+void kasan_init(void);
+#else
+static inline void kasan_init(void) { }
+static inline void kasan_mmu_init(void) { }
+#endif
+
 #endif /* __ASSEMBLY */
 #endif
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 40aec3f00a05..6e85171e513c 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -969,6 +969,9 @@ start_here:
 * Do early platform-specific initialization,
 * and set up the MMU.
 */
+#ifdef CONFIG_KASAN
+	bl kasan_early_init
+#endif
 	li r3,0
 	mr r4,r31
 	bl machine_init
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index a9c934f2319b..efa219d2136e 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -848,6 +848,9 @@ start_here:
 /*
 * Decide what sort of machine this is and initialize the MMU.
 */
+#ifdef CONFIG_KASAN
+	bl kasan_early_init
+#endif
 	li r3,0
 	mr r4,r31
 	bl machine_init
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 37117ab11584..34a5df827b38 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -203,6 +203,9 @@ _ENTRY(_start);
 /*
 * Decide what sort of machine this is and initialize the MMU.
*/ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif li r3,0 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 03c73b4c6435..d25adb6ef235 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -853,6 +853,9 @@ start_here: /* * Decide what sort of machine this is and initialize the MMU. */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif li r3,0 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 32332e24e421..567e0ed45ca8 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -268,6 +268,9 @@ set_ivor: /* * Decide what sort of machine this is and initialize the MMU. */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif mr r3,r30 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 19d68a9b5f37..91d2c6970bdb 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -67,6 +67,7 @@ #include #include #include +#include #include "setup.h" @@ -871,6 +872,8 @@ static void smp_setup_pacas(void) */ void __init setup_arch(char **cmdline_p) { + kasan_init(); + *cmdline_p = boot_command_line; /* Set a half-reasonable default so udelay does something sensible */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 62735f335bce..d8c0ce9b2557 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ +obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3eb4cb09749c..c3121b6c8cbd 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -179,6 +180,8 @@ void __init MMU_init(void) btext_unmap(); #endif + kasan_mmu_init(); + setup_kup(); /* Shortly after that, the entire linear mapping will be available */ diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c new file mode 100644 index 000000000000..42617fcad828 --- /dev/null +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include +#include +#include +#include +#include +#include + +static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) +{ + unsigned long va = (unsigned long)kasan_early_shadow_page; + phys_addr_t pa = __pa(kasan_early_shadow_page); + int i; + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++) + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); +} + +static int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) +{ + pmd_t *pmd; + unsigned long k_cur, k_next; + + pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); + + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { + pte_t *new; + + k_next = pgd_addr_end(k_cur, k_end); + if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) + continue; + + new = pte_alloc_one_kernel(&init_mm); + + if (!new) + return -ENOMEM; + kasan_populate_pte(new, PAGE_KERNEL_RO); + pmd_populate_kernel(&init_mm, pmd, new); + } + return 0; +} + +static void __ref *kasan_get_one_page(void) +{ + if (slab_is_available()) + return (void *)__get_free_page(GFP_KERNEL | 
__GFP_ZERO);
+
+	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static int __ref kasan_init_region(void *start, size_t size)
+{
+	unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
+	unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
+	unsigned long k_cur;
+	int ret;
+	void *block = NULL;
+
+	ret = kasan_init_shadow_page_tables(k_start, k_end);
+	if (ret)
+		return ret;
+
+	if (!slab_is_available())
+		block = memblock_alloc(k_end - k_start, PAGE_SIZE);
+
+	for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) {
+		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+		void *va = block ? block + k_cur - k_start : kasan_get_one_page();
+		pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
+
+		if (!va)
+			return -ENOMEM;
+
+		__set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
+	}
+	flush_tlb_kernel_range(k_start, k_end);
+	return 0;
+}
+
+static void __init kasan_remap_early_shadow_ro(void)
+{
+	kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO);
+
+	flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
+}
+
+void __init kasan_mmu_init(void)
+{
+	int ret;
+	struct memblock_region *reg;
+
+	for_each_memblock(memory, reg) {
+		phys_addr_t base = reg->base;
+		phys_addr_t top = min(base + reg->size, total_lowmem);
+
+		if (base >= top)
+			continue;
+
+		ret = kasan_init_region(__va(base), top - base);
+		if (ret)
+			panic("kasan: kasan_init_region() failed");
+	}
+}
+
+void __init kasan_init(void)
+{
+	kasan_remap_early_shadow_ro();
+
+	clear_page(kasan_early_shadow_page);
+
+	/* At this point kasan is fully initialized. Enable error messages */
+	init_task.kasan_depth = 0;
+	pr_info("KASAN init done\n");
+}
+
+#ifdef CONFIG_MODULES
+void *module_alloc(unsigned long size)
+{
+	void *base = vmalloc_exec(size);
+
+	if (!base)
+		return NULL;
+
+	if (!kasan_init_region(base, size))
+		return base;
+
+	vfree(base);
+
+	return NULL;
+}
+#endif
+
+void __init kasan_early_init(void)
+{
+	unsigned long addr = KASAN_SHADOW_START;
+	unsigned long end = KASAN_SHADOW_END;
+	unsigned long next;
+	pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);
+
+	BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
+
+	kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte);
+	} while (pmd++, addr = next, addr != end);
+
+	if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+		WARN(true, "KASAN not supported on hash 6xx");
+}
--
cgit v1.2.3-59-g8ed1b
From 72f208c6a8f7bc78ef5248babd9e6ed6302bd2a0 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:35 +0000
Subject: powerpc/32s: move hash code patching out of MMU_init_hw()

For KASAN, hash table handling will be activated early for accessing the KASAN shadow areas. In order to avoid any modification of the hash functions while they are still used with the early hash table, the code patching is moved out of MMU_init_hw() and put close to the big-bang switch to the final hash table.
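To make the ordering constraint concrete, the resulting split can be sketched in condensed C. The function and patch-site names below are taken from the diff that follows; the bodies are abridged, not the complete kernel code:

	/* Sketch only: runs while the early hash table may still be in use,
	 * so it computes sizes and masks but patches nothing. */
	void __init MMU_init_hw(void)
	{
		/* ... allocate the final hash table, set Hash, Hash_mask,
		 * hash_mb and hash_mb2 ... */
	}

	/* Sketch only: called from head_32.S immediately before the switch
	 * to the final hash table, when create_hpte()/flush_hash_pages()
	 * are guaranteed not to be walking the early table. */
	void __init MMU_init_hw_patch(void)
	{
		unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);

		/* Safe now: rewrite the masks/offsets embedded in hashtable.S. */
		modify_instruction_site(&patch__hash_page_A0, 0xffff,
					((unsigned int)Hash - PAGE_OFFSET) >> 16);
		modify_instruction_site(&patch__hash_page_B, 0xffff, hmask);
		/* ... remaining patch sites as in the diff below ... */
	}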
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 3 +++ arch/powerpc/mm/book3s32/mmu.c | 36 ++++++++++++++++++++++-------------- arch/powerpc/mm/mmu_decl.h | 1 + 3 files changed, 26 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 6e85171e513c..5958ea685968 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -977,6 +977,9 @@ start_here: bl machine_init bl __save_cpu_setup bl MMU_init +BEGIN_MMU_FTR_SECTION + bl MMU_init_hw_patch +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) /* * Go back to running unmapped so we can load up new values diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 1db55159031c..165529cc9087 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -39,6 +39,7 @@ struct hash_pte *Hash, *Hash_end; unsigned long Hash_size, Hash_mask; unsigned long _SDR1; +static unsigned int hash_mb, hash_mb2; struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ @@ -308,7 +309,6 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, */ void __init MMU_init_hw(void) { - unsigned int hmask, mb, mb2; unsigned int n_hpteg, lg_n_hpteg; if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) @@ -351,20 +351,30 @@ void __init MMU_init_hw(void) (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); - /* - * Patch up the instructions in hashtable.S:create_hpte - */ - if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); Hash_mask = n_hpteg - 1; - hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); - mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; if (lg_n_hpteg > 16) - mb2 = 16 - LG_HPTEG_SIZE; + hash_mb2 = 16 - LG_HPTEG_SIZE; +} + +void __init MMU_init_hw_patch(void) +{ + unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + if (ppc_md.progress) + ppc_md.progress("hash:patch", 0x345); + if (ppc_md.progress) + ppc_md.progress("hash:done", 0x205); + + /* WARNING: Make sure nothing can trigger a KASAN check past this point */ + + /* + * Patch up the instructions in hashtable.S:create_hpte + */ modify_instruction_site(&patch__hash_page_A0, 0xffff, ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); @@ -373,11 +383,9 @@ void __init MMU_init_hw(void) */ modify_instruction_site(&patch__flush_hash_A0, 0xffff, ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); - - if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); } void setup_initial_memory_limit(phys_addr_t first_memblock_base, diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 74ff61dabcb1..d726ff776054 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -130,6 +130,7 @@ extern void wii_memory_fixups(void); */ #ifdef 
CONFIG_PPC32 extern void MMU_init_hw(void); +void MMU_init_hw_patch(void); unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif -- cgit v1.2.3-59-g8ed1b From 215b823707ce4e8e52b106915f70357fa474c669 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:36 +0000 Subject: powerpc/32s: set up an early static hash table for KASAN. KASAN requires early activation of hash table, before memblock() functions are available. This patch implements an early hash_table statically defined in __initdata. During early boot, a single page table is used. For hash32, when doing the final init, one page table is allocated for each PGD entry because of the _PAGE_HASHPTE flag which can't be common to several virt pages. This is done after memblock get available but before switching to the final hash table, otherwise there are issues with TLB flushing due to the shared entries. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 70 ++++++++++++++++++++++------------- arch/powerpc/mm/kasan/kasan_init_32.c | 23 +++++++++++- arch/powerpc/mm/mmu_decl.h | 1 + 3 files changed, 68 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 5958ea685968..73288df1c5d6 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -160,6 +160,10 @@ __after_mmu_off: bl flush_tlbs bl initial_bats + bl load_segment_registers +#ifdef CONFIG_KASAN + bl early_hash_table +#endif #if defined(CONFIG_BOOTX_TEXT) bl setup_disp_bat #endif @@ -205,7 +209,7 @@ __after_mmu_off: */ turn_on_mmu: mfmsr r0 - ori r0,r0,MSR_DR|MSR_IR + ori r0,r0,MSR_DR|MSR_IR|MSR_RI mtspr SPRN_SRR1,r0 lis r0,start_here@h ori r0,r0,start_here@l @@ -884,11 +888,24 @@ _ENTRY(__restore_cpu_setup) blr #endif /* !defined(CONFIG_PPC_BOOK3S_32) */ - /* * Load stuff into the MMU. Intended to be called with * IR=0 and DR=0. */ +#ifdef CONFIG_KASAN +early_hash_table: + sync /* Force all PTE updates to finish */ + isync + tlbia /* Clear all TLB entries */ + sync /* wait for tlbia/tlbie to finish */ + TLBSYNC /* ... on all CPUs */ + /* Load the SDR1 register (hash table base & size) */ + lis r6, early_hash - PAGE_OFFSET@h + ori r6, r6, 3 /* 256kB table */ + mtspr SPRN_SDR1, r6 + blr +#endif + load_up_mmu: sync /* Force all PTE updates to finish */ isync @@ -900,29 +917,6 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0, NUM_USER_SEGMENTS /* load up user segment register values */ - mtctr r0 /* for context 0 */ - li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ -#ifdef CONFIG_PPC_KUEP - oris r3, r3, SR_NX@h /* Set Nx */ -#endif -#ifdef CONFIG_PPC_KUAP - oris r3, r3, SR_KS@h /* Set Ks */ -#endif - li r4,0 -3: mtsrin r3,r4 - addi r3,r3,0x111 /* increment VSID */ - addis r4,r4,0x1000 /* address of next segment */ - bdnz 3b - li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ - mtctr r0 /* for context 0 */ - rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ - rlwinm r3, r3, 0, ~SR_KS /* Ks = 0 */ - oris r3, r3, SR_KP@h /* Kp = 1 */ -3: mtsrin r3, r4 - addi r3, r3, 0x111 /* increment VSID */ - addis r4, r4, 0x1000 /* address of next segment */ - bdnz 3b /* Load the BAT registers with the values set up by MMU_init. MMU_init takes care of whether we're on a 601 or not. 
*/ @@ -944,6 +938,32 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +load_segment_registers: + li r0, NUM_USER_SEGMENTS /* load up user segment register values */ + mtctr r0 /* for context 0 */ + li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ +#endif + li r4, 0 +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b + li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ + mtctr r0 /* for context 0 */ + rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + rlwinm r3, r3, 0, ~SR_KS /* Ks = 0 */ + oris r3, r3, SR_KP@h /* Kp = 1 */ +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b + blr + /* * This is where the main kernel code starts. */ diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 42617fcad828..ba8361487075 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -94,6 +94,13 @@ void __init kasan_mmu_init(void) int ret; struct memblock_region *reg; + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + for_each_memblock(memory, reg) { phys_addr_t base = reg->base; phys_addr_t top = min(base + reg->size, total_lowmem); @@ -135,6 +142,20 @@ void *module_alloc(unsigned long size) } #endif +#ifdef CONFIG_PPC_BOOK3S_32 +u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0}; + +static void __init kasan_early_hash_table(void) +{ + modify_instruction_site(&patch__hash_page_A0, 0xffff, __pa(early_hash) >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, __pa(early_hash) >> 16); + + Hash = (struct hash_pte *)early_hash; +} +#else +static void __init kasan_early_hash_table(void) {} +#endif + void __init kasan_early_init(void) { unsigned long addr = KASAN_SHADOW_START; @@ -152,5 +173,5 @@ void __init kasan_early_init(void) } while (pmd++, addr = next, addr != end); if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - WARN(true, "KASAN not supported on hash 6xx"); + kasan_early_hash_table(); } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d726ff776054..31fce3914ddc 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -106,6 +106,7 @@ extern unsigned int rtas_data, rtas_size; struct hash_pte; extern struct hash_pte *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; +extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ -- cgit v1.2.3-59-g8ed1b From da3a3b0a0e38377c98946420acdc7d4ca38cff47 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:37 +0000 Subject: powerpc/32s: map kasan zero shadow with PAGE_READONLY instead of PAGE_KERNEL_RO For hash32, the zero shadow page gets mapped with PAGE_READONLY instead of PAGE_KERNEL_RO, because the PP bits don't provide a RO kernel, so PAGE_KERNEL_RO is equivalent to PAGE_KERNEL. By using PAGE_READONLY, the page is RO for both kernel and user, but this is not a security issue as it contains only zeroes. 
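The protection choice described above could be condensed into a single helper; the following is a hypothetical refactoring sketch for illustration (the actual patch below open-codes the feature test at each of the two call sites):

	/* Hypothetical helper, not part of the patch. */
	static pgprot_t __init kasan_prot_ro(void)
	{
		/*
		 * On hash (book3s/32) the PP bits cannot express "read-only
		 * for the kernel, no user access", so PAGE_KERNEL_RO is
		 * equivalent to PAGE_KERNEL.  Fall back to PAGE_READONLY,
		 * which is read-only for kernel and user alike -- harmless
		 * for a page that only contains zeroes.
		 */
		if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
			return PAGE_READONLY;
		return PAGE_KERNEL_RO;
	}

Callers would then simply do kasan_populate_pte(new, kasan_prot_ro()).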
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/kasan/kasan_init_32.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
(limited to 'arch')

diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index ba8361487075..0d62be3cba47 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -39,7 +39,10 @@ static int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_
 		if (!new)
 			return -ENOMEM;
-		kasan_populate_pte(new, PAGE_KERNEL_RO);
+		if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+			kasan_populate_pte(new, PAGE_READONLY);
+		else
+			kasan_populate_pte(new, PAGE_KERNEL_RO);
 		pmd_populate_kernel(&init_mm, pmd, new);
 	}
 	return 0;
@@ -84,7 +87,10 @@ static int __ref kasan_init_region(void *start, size_t size)
 static void __init kasan_remap_early_shadow_ro(void)
 {
-	kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO);
+	if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+		kasan_populate_pte(kasan_early_shadow_pte, PAGE_READONLY);
+	else
+		kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO);
 	flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
 }
--
cgit v1.2.3-59-g8ed1b
From 57e0491b58fa2a217029b696511499008852a642 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:36:36 +0000
Subject: powerpc/32s: drop Hash_end

Hash_end has never been used, drop it.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/book3s32/mmu.c | 4 +---
 arch/powerpc/mm/mmu_decl.h | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)
(limited to 'arch')

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 165529cc9087..03265de05637 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -36,7 +36,7 @@
 #include
-struct hash_pte *Hash, *Hash_end;
+struct hash_pte *Hash;
 unsigned long Hash_size, Hash_mask;
 unsigned long _SDR1;
 static unsigned int hash_mb, hash_mb2;
@@ -345,8 +345,6 @@ void __init MMU_init_hw(void)
 		__func__, Hash_size, Hash_size);
 	_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
-	Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
-
 	printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
 	       (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 31fce3914ddc..7b8833d695d1 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -104,7 +104,7 @@ extern int __map_without_bats;
 extern unsigned int rtas_data, rtas_size;
 struct hash_pte;
-extern struct hash_pte *Hash, *Hash_end;
+extern struct hash_pte *Hash;
 extern unsigned long Hash_size, Hash_mask;
 extern u8 early_hash[];
--
cgit v1.2.3-59-g8ed1b
From 8f156c23f4c04ca51961cd1f6a0edbc80caa2683 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:36:37 +0000
Subject: powerpc/32s: don't try to print hash table address.

Due to %p, (ptrval) is printed in lieu of the hash table address. Showing the hash table address isn't an operational need, so just don't print it.
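For background, this is the printk behaviour at play (illustrative lines, not part of the patch): since pointer hashing was introduced, %p prints a hashed value rather than the real address, and emits "(ptrval)" until the hashing key is available early in boot:

	pr_info("Hash = 0x%p\n", Hash);		/* hashed value, or "(ptrval)" early in boot */
	pr_info("Hash = 0x%px\n", Hash);	/* raw address: would leak kernel layout,
						 * so the patch drops the print instead */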
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/book3s32/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'arch')

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 03265de05637..131cd3acb6b8 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -345,8 +345,8 @@ void __init MMU_init_hw(void)
 		__func__, Hash_size, Hash_size);
 	_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
-	printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
-	       (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
+	pr_info("Total memory = %lldMB; using %ldkB for hash table\n",
+		(unsigned long long)(total_memory >> 20), Hash_size >> 10);
 	Hash_mask = n_hpteg - 1;
--
cgit v1.2.3-59-g8ed1b
From e4dccf9092ab48a6f902003b9558c0e45d0e849a Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:36:39 +0000
Subject: powerpc/mm: print hash info in a helper

Reduce #ifdef mess by defining a helper to print hash info at startup. At the same time, remove the display of the hash table address to avoid leaking unnecessary information.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/setup-common.c | 22 +---------------------
 arch/powerpc/mm/book3s32/mmu.c | 9 ++++++++-
 arch/powerpc/mm/book3s64/hash_utils.c | 13 +++++++++++++
 arch/powerpc/mm/mmu_decl.h | 5 ++++-
 4 files changed, 26 insertions(+), 23 deletions(-)
(limited to 'arch')

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 91d2c6970bdb..3f8805d3c0c9 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -800,12 +800,6 @@ void arch_setup_pdev_archdata(struct platform_device *pdev)
 static __init void print_system_info(void)
 {
 	pr_info("-----------------------------------------------------\n");
-#ifdef CONFIG_PPC_BOOK3S_64
-	pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size);
-#endif
-#ifdef CONFIG_PPC_BOOK3S_32
-	pr_info("Hash_size = 0x%lx\n", Hash_size);
-#endif
 	pr_info("phys_mem_size = 0x%llx\n",
 		(unsigned long long)memblock_phys_mem_size());
@@ -827,21 +821,7 @@ static __init void print_system_info(void)
 	pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features);
 #endif
-#ifdef CONFIG_PPC_BOOK3S_64
-	if (htab_address)
-		pr_info("htab_address = 0x%p\n", htab_address);
-	if (htab_hash_mask)
-		pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
-	pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START);
-	pr_info("kernel IO start = 0x%lx\n", KERN_IO_START);
-	pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap);
-#endif
-#ifdef CONFIG_PPC_BOOK3S_32
-	if (Hash)
-		pr_info("Hash = 0x%p\n", Hash);
-	if (Hash_mask)
-		pr_info("Hash_mask = 0x%lx\n", Hash_mask);
-#endif
+	print_system_hash_info();
 	if (PHYSICAL_START > 0)
 		pr_info("physical_start = 0x%llx\n",
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 131cd3acb6b8..615f78d35536 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -37,7 +37,7 @@
 #include
 struct hash_pte *Hash;
-unsigned long Hash_size, Hash_mask;
+static unsigned long Hash_size, Hash_mask;
 unsigned long _SDR1;
 static unsigned int hash_mb, hash_mb2;
@@ -401,6 +401,13 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 	memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
 }
+void __init print_system_hash_info(void)
+{
+	pr_info("Hash_size = 0x%lx\n", Hash_size);
+	if (Hash_mask)
+		pr_info("Hash_mask = 0x%lx\n", Hash_mask);
+}
+
 #ifdef CONFIG_PPC_KUEP
 void __init setup_kuep(bool disabled)
 {
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index f0ce860a69ac..919a861a8ec0 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -66,6 +66,8 @@
 #include
 #include
+#include
+
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
 #else
@@ -1945,3 +1947,14 @@ static int __init hash64_debugfs(void)
 }
 machine_device_initcall(pseries, hash64_debugfs);
 #endif /* CONFIG_DEBUG_FS */
+
+void __init print_system_hash_info(void)
+{
+	pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size);
+
+	if (htab_hash_mask)
+		pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
+	pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START);
+	pr_info("kernel IO start = 0x%lx\n", KERN_IO_START);
+	pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap);
+}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7b8833d695d1..7bac0aa2026a 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -83,6 +83,8 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
 }
 #endif
+static inline void print_system_hash_info(void) {}
+
 #else /* CONFIG_PPC_MMU_NOHASH */
 extern void hash_preload(struct mm_struct *mm, unsigned long ea,
@@ -92,6 +94,8 @@ extern void hash_preload(struct mm_struct *mm, unsigned long ea,
 extern void _tlbie(unsigned long address);
 extern void _tlbia(void);
+void print_system_hash_info(void);
+
 #endif /* CONFIG_PPC_MMU_NOHASH */
 #ifdef CONFIG_PPC32
@@ -105,7 +109,6 @@ extern unsigned int rtas_data, rtas_size;
 struct hash_pte;
 extern struct hash_pte *Hash;
-extern unsigned long Hash_size, Hash_mask;
 extern u8 early_hash[];
 #endif /* CONFIG_PPC32 */
--
cgit v1.2.3-59-g8ed1b
From 8a23fdec3dbdc8bfde6f806d36e773236dab6663 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 30 Apr 2019 12:38:50 +0000
Subject: powerpc/32: Refactor EXCEPTION entry macros for head_8xx.S and head_32.S

EXCEPTION_PROLOG is similar in head_8xx.S and head_32.S. This patch creates head_32.h and moves the EXCEPTION_PROLOG macro into it. It also converts it from a GCC macro to a GAS macro in order to ease refactoring with 40x later, since GAS macros allow the use of #ifdef/#else/#endif inside them. It also has the advantage of not requiring the ugly "; \" at the end of each line.

This patch also moves the EXCEPTION() and EXC_XFER_XXXX() macros, which are also similar, and splits START_EXCEPTION() out of EXCEPTION().

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/head_32.S | 99 +---------------------------------
 arch/powerpc/kernel/head_32.h | 118 +++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/head_8xx.S | 98 +---------------------------------
 3 files changed, 122 insertions(+), 193 deletions(-)
 create mode 100644 arch/powerpc/kernel/head_32.h
(limited to 'arch')

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 73288df1c5d6..f98e6b461238 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -37,6 +37,8 @@
 #include
 #include
+#include "head_32.h"
+
 /* 601 only have IBAT; cr0.eq is set on 601 when using this macro */
 #define LOAD_BAT(n, reg, RA, RB) \
 /* see the comment for clear_bats() -- Cort */ \
@@ -246,103 +248,6 @@ __secondary_hold_spinloop:
 __secondary_hold_acknowledge:
 	.long -1
-/*
- * Exception entry code.
This code runs with address translation - * turned off, i.e. using physical addresses. - * We assume sprg3 has the physical address of the current - * task's thread_struct. - */ -#define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG_SCRATCH0,r10; \ - mtspr SPRN_SPRG_SCRATCH1,r11; \ - mfcr r10; \ - EXCEPTION_PROLOG_1; \ - EXCEPTION_PROLOG_2 - -#define EXCEPTION_PROLOG_1 \ - mfspr r11,SPRN_SRR1; /* check whether user or kernel */ \ - andi. r11,r11,MSR_PR; \ - tophys(r11,r1); /* use tophys(r1) if kernel */ \ - beq 1f; \ - mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,TASK_STACK-THREAD(r11); \ - addi r11,r11,THREAD_SIZE; \ - tophys(r11,r11); \ -1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ - - -#define EXCEPTION_PROLOG_2 \ - stw r10,_CCR(r11); /* save registers */ \ - stw r12,GPR12(r11); \ - stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG_SCRATCH0; \ - stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG_SCRATCH1; \ - stw r12,GPR11(r11); \ - mflr r10; \ - stw r10,_LINK(r11); \ - mfspr r12,SPRN_SRR0; \ - mfspr r9,SPRN_SRR1; \ - stw r1,GPR1(r11); \ - stw r1,0(r11); \ - tovirt(r1,r11); /* set new kernel sp */ \ - li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \ - MTMSRD(r10); /* (except for mach check in rtas) */ \ - stw r0,GPR0(r11); \ - lis r10,STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */ \ - addi r10,r10,STACK_FRAME_REGS_MARKER@l; \ - stw r10,8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) - -/* - * Note: code which follows this uses cr0.eq (set if from kernel), - * r11, r12 (SRR0), and r9 (SRR1). - * - * Note2: once we have set r1 we are in a position to take exceptions - * again, and we could thus set MSR:RI at that point. - */ - -/* - * Exception vectors. - */ -#define EXCEPTION(n, label, hdlr, xfer) \ - . = n; \ - DO_KVM n; \ -label: \ - EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - xfer(n, hdlr) - -#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ - li r10,trap; \ - stw r10,_TRAP(r11); \ - li r10,MSR_KERNEL; \ - copyee(r10, r9); \ - bl tfer; \ -i##n: \ - .long hdlr; \ - .long ret - -#define COPY_EE(d, s) rlwimi d,s,0,16,16 -#define NOCOPY(d, s) - -#define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \ - ret_from_except) - -#define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \ - ret_from_except) - /* System reset */ /* core99 pmac starts the seconary here by changing the vector, and putting it back to what it was (unknown_exception) when done. */ diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h new file mode 100644 index 000000000000..7456e2a45acc --- /dev/null +++ b/arch/powerpc/kernel/head_32.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __HEAD_32_H__ +#define __HEAD_32_H__ + +#include /* for STACK_FRAME_REGS_MARKER */ + +/* + * Exception entry code. This code runs with address translation + * turned off, i.e. using physical addresses. + * We assume sprg3 has the physical address of the current + * task's thread_struct. + */ + +.macro EXCEPTION_PROLOG + mtspr SPRN_SPRG_SCRATCH0,r10 + mtspr SPRN_SPRG_SCRATCH1,r11 + mfcr r10 + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 +.endm + +.macro EXCEPTION_PROLOG_1 + mfspr r11,SPRN_SRR1 /* check whether user or kernel */ + andi. 
r11,r11,MSR_PR + tophys(r11,r1) /* use tophys(r1) if kernel */ + beq 1f + mfspr r11,SPRN_SPRG_THREAD + lwz r11,TASK_STACK-THREAD(r11) + addi r11,r11,THREAD_SIZE + tophys(r11,r11) +1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ +.endm + +.macro EXCEPTION_PROLOG_2 + stw r10,_CCR(r11) /* save registers */ + stw r12,GPR12(r11) + stw r9,GPR9(r11) + mfspr r10,SPRN_SPRG_SCRATCH0 + stw r10,GPR10(r11) + mfspr r12,SPRN_SPRG_SCRATCH1 + stw r12,GPR11(r11) + mflr r10 + stw r10,_LINK(r11) + mfspr r12,SPRN_SRR0 + mfspr r9,SPRN_SRR1 + stw r1,GPR1(r11) + stw r1,0(r11) + tovirt(r1,r11) /* set new kernel sp */ + li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ + MTMSRD(r10) /* (except for mach check in rtas) */ + stw r0,GPR0(r11) + lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + addi r10,r10,STACK_FRAME_REGS_MARKER@l + stw r10,8(r11) + SAVE_4GPRS(3, r11) + SAVE_2GPRS(7, r11) +.endm + +/* + * Note: code which follows this uses cr0.eq (set if from kernel), + * r11, r12 (SRR0), and r9 (SRR1). + * + * Note2: once we have set r1 we are in a position to take exceptions + * again, and we could thus set MSR:RI at that point. + */ + +/* + * Exception vectors. + */ +#ifdef CONFIG_PPC_BOOK3S +#define START_EXCEPTION(n, label) \ + . = n; \ + DO_KVM n; \ +label: + +#else +#define START_EXCEPTION(n, label) \ + . = n; \ +label: + +#endif + +#define EXCEPTION(n, label, hdlr, xfer) \ + START_EXCEPTION(n, label) \ + EXCEPTION_PROLOG; \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + xfer(n, hdlr) + +#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ + li r10,trap; \ + stw r10,_TRAP(r11); \ + li r10,MSR_KERNEL; \ + copyee(r10, r9); \ + bl tfer; \ +i##n: \ + .long hdlr; \ + .long ret + +#define COPY_EE(d, s) rlwimi d,s,0,MSR_EE +#define NOCOPY(d, s) + +#define EXC_XFER_STD(n, hdlr) \ + EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full, \ + ret_from_except_full) + +#define EXC_XFER_LITE(n, hdlr) \ + EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \ + ret_from_except) + +#define EXC_XFER_EE(n, hdlr) \ + EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \ + ret_from_except_full) + +#define EXC_XFER_EE_LITE(n, hdlr) \ + EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \ + ret_from_except) + +#endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index d25adb6ef235..14c3eb3267b8 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -33,6 +33,8 @@ #include #include +#include "head_32.h" + #if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000 /* By simply checking Address >= 0x80000000, we know if its a kernel address */ #define SIMPLE_KERNEL_ADDRESS 1 @@ -123,102 +125,6 @@ instruction_counter: .space 4 #endif -/* - * Exception entry code. This code runs with address translation - * turned off, i.e. using physical addresses. - * We assume sprg3 has the physical address of the current - * task's thread_struct. - */ -#define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG_SCRATCH0, r10; \ - mtspr SPRN_SPRG_SCRATCH1, r11; \ - mfcr r10; \ - EXCEPTION_PROLOG_1; \ - EXCEPTION_PROLOG_2 - -#define EXCEPTION_PROLOG_1 \ - mfspr r11,SPRN_SRR1; /* check whether user or kernel */ \ - andi. r11,r11,MSR_PR; \ - tophys(r11,r1); /* use tophys(r1) if kernel */ \ - beq 1f; \ - mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,TASK_STACK-THREAD(r11); \ - addi r11,r11,THREAD_SIZE; \ - tophys(r11,r11); \ -1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. 
frame */ - - -#define EXCEPTION_PROLOG_2 \ - stw r10,_CCR(r11); /* save registers */ \ - stw r12,GPR12(r11); \ - stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG_SCRATCH0; \ - stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG_SCRATCH1; \ - stw r12,GPR11(r11); \ - mflr r10; \ - stw r10,_LINK(r11); \ - mfspr r12,SPRN_SRR0; \ - mfspr r9,SPRN_SRR1; \ - stw r1,GPR1(r11); \ - stw r1,0(r11); \ - tovirt(r1,r11); /* set new kernel sp */ \ - li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \ - mtmsr r10; \ - stw r0,GPR0(r11); \ - lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */ \ - addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ - stw r10, 8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) - -/* - * Note: code which follows this uses cr0.eq (set if from kernel), - * r11, r12 (SRR0), and r9 (SRR1). - * - * Note2: once we have set r1 we are in a position to take exceptions - * again, and we could thus set MSR:RI at that point. - */ - -/* - * Exception vectors. - */ -#define EXCEPTION(n, label, hdlr, xfer) \ - . = n; \ -label: \ - EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - xfer(n, hdlr) - -#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ - li r10,trap; \ - stw r10,_TRAP(r11); \ - li r10,MSR_KERNEL; \ - copyee(r10, r9); \ - bl tfer; \ -i##n: \ - .long hdlr; \ - .long ret - -#define COPY_EE(d, s) rlwimi d,s,0,16,16 -#define NOCOPY(d, s) - -#define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \ - ret_from_except) - -#define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \ - ret_from_except) - /* System reset */ EXCEPTION(0x100, Reset, system_reset_exception, EXC_XFER_STD) -- cgit v1.2.3-59-g8ed1b From 37737a2afd69c201d0dac334c84fd1f0d596dfc0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:51 +0000 Subject: powerpc/32: move LOAD_MSR_KERNEL() into head_32.h and use it As preparation for using head_32.h for head_40x.S, move LOAD_MSR_KERNEL() there and use it to load r10 with MSR_KERNEL value. In the mean time, this patch modifies it so that it takes into account the size of the passed value to determine if 'li' can be used or if 'lis/ori' is needed instead of using the size of MSR_KERNEL. This is done by using gas macro. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 9 +-------- arch/powerpc/kernel/head_32.h | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 2f3d159c11d7..d0cea3deb86c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -38,14 +38,7 @@ #include #include -/* - * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it include MSR_CE. 
- */ -#if MSR_KERNEL >= 0x10000 -#define LOAD_MSR_KERNEL(r, x) lis r,(x)@h; ori r,r,(x)@l -#else -#define LOAD_MSR_KERNEL(r, x) li r,(x) -#endif +#include "head_32.h" /* * Align to 4k in order to ensure that all functions modyfing srr0/srr1 diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 7456e2a45acc..cf3d00844597 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -4,6 +4,19 @@ #include /* for STACK_FRAME_REGS_MARKER */ +/* + * MSR_KERNEL is > 0x8000 on 4xx/Book-E since it include MSR_CE. + */ +.macro __LOAD_MSR_KERNEL r, x +.if \x >= 0x8000 + lis \r, (\x)@h + ori \r, \r, (\x)@l +.else + li \r, (\x) +.endif +.endm +#define LOAD_MSR_KERNEL(r, x) __LOAD_MSR_KERNEL r, x + /* * Exception entry code. This code runs with address translation * turned off, i.e. using physical addresses. @@ -89,7 +102,7 @@ label: #define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ - li r10,MSR_KERNEL; \ + LOAD_MSR_KERNEL(r10, MSR_KERNEL); \ copyee(r10, r9); \ bl tfer; \ i##n: \ -- cgit v1.2.3-59-g8ed1b From 1d3034aed4489ae96bc7eec5050096944fd181f6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:52 +0000 Subject: powerpc/32: make the 6xx/8xx EXC_XFER_TEMPLATE() similar to the 40x/booke one 6xx/8xx EXC_XFER_TEMPLATE() macro adds a i##n symbol which is unused and can be removed. 40x and booke EXC_XFER_TEMPLATE() macros takes msr from the caller while the 6xx/8xx version uses only MSR_KERNEL as msr value. This patch modifies the 6xx/8xx version to make it similar to the 40x and booke versions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index cf3d00844597..985758cbf577 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -99,13 +99,12 @@ label: addi r3,r1,STACK_FRAME_OVERHEAD; \ xfer(n, hdlr) -#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ +#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ - LOAD_MSR_KERNEL(r10, MSR_KERNEL); \ + LOAD_MSR_KERNEL(r10, msr); \ copyee(r10, r9); \ bl tfer; \ -i##n: \ .long hdlr; \ .long ret @@ -113,19 +112,19 @@ i##n: \ #define NOCOPY(d, s) #define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full, \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \ ret_from_except_full) #define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ ret_from_except) #define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ ret_from_except_full) #define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \ ret_from_except) #endif /* __HEAD_32_H__ */ -- cgit v1.2.3-59-g8ed1b From 57bc13acbe11b6d60d5dc4d574c34e1d981a8824 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:53 +0000 Subject: powerpc/40x: Don't use SPRN_SPRG_SCRATCH2 in EXCEPTION_PROLOG Unlike said in the comment, r1 is not reused by the critical exception handler, as it uses a 
dedicated critirq_ctx stack. Decrementing r1 early is then unneeded.

Even if the above were valid, the code would be buggy anyway, as r1 takes intermediate values that would jeopardise the whole process (for instance after mfspr r1,SPRN_SPRG_THREAD).

Using SPRN_SPRG_SCRATCH2 to save r1 is then not needed, r11 can be used instead. This avoids one mtspr and one mfspr and makes the prolog closer to what's done on 6xx and 8xx.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/head_40x.S | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)
(limited to 'arch')

diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index efa219d2136e..fa033203dcdb 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -102,23 +102,20 @@ _ENTRY(saved_ksp_limit)
 * Exception vector entry code. This code runs with address translation
 * turned off (i.e. using physical addresses). We assume SPRG_THREAD has
 * the physical address of the current task thread_struct.
- * Note that we have to have decremented r1 before we write to any fields
- * of the exception frame, since a critical interrupt could occur at any
- * time, and it will write to the area immediately below the current r1.
 */
 #define NORMAL_EXCEPTION_PROLOG \
	mtspr SPRN_SPRG_SCRATCH0,r10; /* save two registers to work with */\
	mtspr SPRN_SPRG_SCRATCH1,r11; \
-	mtspr SPRN_SPRG_SCRATCH2,r1; \
	mfcr r10; /* save CR in r10 for now */\
	mfspr r11,SPRN_SRR1; /* check whether user or kernel */\
	andi. r11,r11,MSR_PR; \
-	beq 1f; \
-	mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\
-	lwz r1,TASK_STACK-THREAD(r1); /* this thread's kernel stack */\
-	addi r1,r1,THREAD_SIZE; \
-1:	subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\
	tophys(r11,r1); \
+	beq 1f; \
+	mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\
+	lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\
+	addi r11,r11,THREAD_SIZE; \
+	tophys(r11,r11); \
+1:	subi r11,r11,INT_FRAME_SIZE; /* Allocate an exception frame */\
	stw r10,_CCR(r11); /* save various registers */\
	stw r12,GPR12(r11); \
	stw r9,GPR9(r11); \
@@ -128,11 +125,11 @@ _ENTRY(saved_ksp_limit)
	stw r12,GPR11(r11); \
	mflr r10; \
	stw r10,_LINK(r11); \
-	mfspr r10,SPRN_SPRG_SCRATCH2; \
	mfspr r12,SPRN_SRR0; \
-	stw r10,GPR1(r11); \
+	stw r1,GPR1(r11); \
	mfspr r9,SPRN_SRR1; \
-	stw r10,0(r11); \
+	stw r1,0(r11); \
+	tovirt(r1,r11); /* set new kernel sp */ \
	rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?)
*/\ stw r0,GPR0(r11); \ SAVE_4GPRS(3, r11); \ -- cgit v1.2.3-59-g8ed1b From bd82904d465c0655fdc40dfc753209ea54efdd23 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:54 +0000 Subject: powerpc/40x: add exception frame marker This patch adds STACK_FRAME_REGS_MARKER in the stack at exception entry in order to see interrupts in call traces as below: [ 0.013964] Call Trace: [ 0.014014] [c0745db0] [c007a9d4] tick_periodic.constprop.5+0xd8/0x104 (unreliable) [ 0.014086] [c0745dc0] [c007aa20] tick_handle_periodic+0x20/0x9c [ 0.014181] [c0745de0] [c0009cd0] timer_interrupt+0xa0/0x264 [ 0.014258] [c0745e10] [c000e484] ret_from_except+0x0/0x14 [ 0.014390] --- interrupt: 901 at console_unlock.part.7+0x3f4/0x528 [ 0.014390] LR = console_unlock.part.7+0x3f0/0x528 [ 0.014455] [c0745ee0] [c0050334] console_unlock.part.7+0x114/0x528 (unreliable) [ 0.014542] [c0745f30] [c00524e0] register_console+0x3d8/0x44c [ 0.014625] [c0745f60] [c0675aac] cpm_uart_console_init+0x18/0x2c [ 0.014709] [c0745f70] [c06614f4] console_init+0x114/0x1cc [ 0.014795] [c0745fb0] [c0658b68] start_kernel+0x300/0x3d8 [ 0.014864] [c0745ff0] [c00022cc] start_here+0x44/0x98 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_40x.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index fa033203dcdb..be1fcd6147c1 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -132,6 +132,9 @@ _ENTRY(saved_ksp_limit) tovirt(r1,r11); /* set new kernel sp */ \ rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ stw r0,GPR0(r11); \ + lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */\ + addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ + stw r10, 8(r11); \ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) @@ -174,6 +177,9 @@ _ENTRY(saved_ksp_limit) tovirt(r1,r11); \ rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ stw r0,GPR0(r11); \ + lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */\ + addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ + stw r10, 8(r11); \ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) -- cgit v1.2.3-59-g8ed1b From 7271fc960424a2fed3823a57358f67f650fd708d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:55 +0000 Subject: powerpc/40x: Split and rename NORMAL_EXCEPTION_PROLOG This patch splits NORMAL_EXCEPTION_PROLOG in the same way as in head_8xx.S and head_32.S and renames it EXCEPTION_PROLOG() as well to match head_32.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_40x.S | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index be1fcd6147c1..004ebe823bd4 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -103,10 +103,14 @@ _ENTRY(saved_ksp_limit) * turned off (i.e. using physical addresses). We assume SPRG_THREAD has * the physical address of the current task thread_struct. */ -#define NORMAL_EXCEPTION_PROLOG \ +#define EXCEPTION_PROLOG \ mtspr SPRN_SPRG_SCRATCH0,r10; /* save two registers to work with */\ mtspr SPRN_SPRG_SCRATCH1,r11; \ mfcr r10; /* save CR in r10 for now */\ + EXCEPTION_PROLOG_1; \ + EXCEPTION_PROLOG_2 + +#define EXCEPTION_PROLOG_1 \ mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ andi. 
r11,r11,MSR_PR; \ tophys(r11,r1); \ @@ -115,7 +119,9 @@ _ENTRY(saved_ksp_limit) lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ -1: subi r11,r11,INT_FRAME_SIZE; /* Allocate an exception frame */\ +1: subi r11,r11,INT_FRAME_SIZE /* Allocate an exception frame */ + +#define EXCEPTION_PROLOG_2 \ stw r10,_CCR(r11); /* save various registers */\ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ @@ -205,7 +211,7 @@ label: #define EXCEPTION(n, label, hdlr, xfer) \ START_EXCEPTION(n, label); \ - NORMAL_EXCEPTION_PROLOG; \ + EXCEPTION_PROLOG; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ xfer(n, hdlr) @@ -396,7 +402,7 @@ label: * This is caused by a fetch from non-execute or guarded pages. */ START_EXCEPTION(0x0400, InstructionAccess) - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG mr r4,r12 /* Pass SRR0 as arg2 */ li r5,0 /* Pass zero as arg3 */ EXC_XFER_LITE(0x400, handle_page_fault) @@ -406,7 +412,7 @@ label: /* 0x0600 - Alignment Exception */ START_EXCEPTION(0x0600, Alignment) - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG mfspr r4,SPRN_DEAR /* Grab the DEAR and save it */ stw r4,_DEAR(r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -414,7 +420,7 @@ label: /* 0x0700 - Program Exception */ START_EXCEPTION(0x0700, ProgramCheck) - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG mfspr r4,SPRN_ESR /* Grab the ESR and save it */ stw r4,_ESR(r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -427,7 +433,7 @@ label: /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG EXC_XFER_EE_LITE(0xc00, DoSyscall) EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_EE) @@ -733,7 +739,7 @@ label: /* Programmable Interval Timer (PIT) Exception. (from 0x1000) */ Decrementer: - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG lis r0,TSR_PIS@h mtspr SPRN_TSR,r0 /* Clear the PIT exception */ addi r3,r1,STACK_FRAME_OVERHEAD @@ -741,7 +747,7 @@ Decrementer: /* Fixed Interval Timer (FIT) Exception. (from 0x1010) */ FITException: - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG addi r3,r1,STACK_FRAME_OVERHEAD; EXC_XFER_EE(0x1010, unknown_exception) @@ -759,7 +765,7 @@ WDTException: * if they can't resolve the lightweight TLB fault. */ DataAccess: - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ stw r5,_ESR(r11) mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ -- cgit v1.2.3-59-g8ed1b From 90f204b9a1f2dc81904547a52ba976d3e84dcf59 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:56 +0000 Subject: powerpc/40x: Refactor exception entry macros by using head_32.h Refactor exception entry macros by using the ones defined in head_32.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.h | 4 ++ arch/powerpc/kernel/head_40x.S | 88 +----------------------------------------- 2 files changed, 6 insertions(+), 86 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 985758cbf577..aa0131bb09b5 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -59,8 +59,12 @@ stw r1,GPR1(r11) stw r1,0(r11) tovirt(r1,r11) /* set new kernel sp */ +#ifdef CONFIG_40x + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) 
*/ +#else li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ MTMSRD(r10) /* (except for mach check in rtas) */ +#endif stw r0,GPR0(r11) lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ addi r10,r10,STACK_FRAME_REGS_MARKER@l diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 004ebe823bd4..b3a2e55e1c15 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -44,6 +44,8 @@ #include #include +#include "head_32.h" + /* As with the other PowerPC ports, it is expected that when code * execution begins here, the following registers contain valid, yet * optional, information: @@ -98,52 +100,6 @@ _ENTRY(crit_srr1) _ENTRY(saved_ksp_limit) .space 4 -/* - * Exception vector entry code. This code runs with address translation - * turned off (i.e. using physical addresses). We assume SPRG_THREAD has - * the physical address of the current task thread_struct. - */ -#define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG_SCRATCH0,r10; /* save two registers to work with */\ - mtspr SPRN_SPRG_SCRATCH1,r11; \ - mfcr r10; /* save CR in r10 for now */\ - EXCEPTION_PROLOG_1; \ - EXCEPTION_PROLOG_2 - -#define EXCEPTION_PROLOG_1 \ - mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ - andi. r11,r11,MSR_PR; \ - tophys(r11,r1); \ - beq 1f; \ - mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ - addi r11,r11,THREAD_SIZE; \ - tophys(r11,r11); \ -1: subi r11,r11,INT_FRAME_SIZE /* Allocate an exception frame */ - -#define EXCEPTION_PROLOG_2 \ - stw r10,_CCR(r11); /* save various registers */\ - stw r12,GPR12(r11); \ - stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG_SCRATCH0; \ - stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG_SCRATCH1; \ - stw r12,GPR11(r11); \ - mflr r10; \ - stw r10,_LINK(r11); \ - mfspr r12,SPRN_SRR0; \ - stw r1,GPR1(r11); \ - mfspr r9,SPRN_SRR1; \ - stw r1,0(r11); \ - tovirt(r1,r11); /* set new kernel sp */ \ - rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ - stw r0,GPR0(r11); \ - lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */\ - addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ - stw r10, 8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) - /* * Exception prolog for critical exceptions. This is a little different * from the normal exception prolog above since a critical exception @@ -205,16 +161,6 @@ _ENTRY(saved_ksp_limit) /* * Exception vectors. */ -#define START_EXCEPTION(n, label) \ - . 
= n; \
label:
-#define EXCEPTION(n, label, hdlr, xfer) \
-	START_EXCEPTION(n, label); \
-	EXCEPTION_PROLOG; \
-	addi r3,r1,STACK_FRAME_OVERHEAD; \
-	xfer(n, hdlr)
-#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret) \
-	li r10,trap; \
-	stw r10,_TRAP(r11); \
-	lis r10,msr@h; \
-	ori r10,r10,msr@l; \
-	copyee(r10, r9); \
-	bl tfer; \
-	.long hdlr; \
-	.long ret
-
-#define COPY_EE(d, s) rlwimi d,s,0,16,16
-#define NOCOPY(d, s)
-
-#define EXC_XFER_STD(n, hdlr) \
-	EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \
-			  ret_from_except_full)
-
-#define EXC_XFER_LITE(n, hdlr) \
-	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \
-			  ret_from_except)
-
-#define EXC_XFER_EE(n, hdlr) \
-	EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \
-			  ret_from_except_full)
-
-#define EXC_XFER_EE_LITE(n, hdlr) \
-	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \
-			  ret_from_except)
-
 /*
 * 0x0100 - Critical Interrupt Exception
 */
--
cgit v1.2.3-59-g8ed1b
From ef4291243f51d0a69899ee2025de09578c0fcba8 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 30 Apr 2019 12:38:57 +0000
Subject: powerpc/fsl_booke: ensure SPEFloatingPointException() reenables interrupts

SPEFloatingPointException() is the only exception handler which 'forgets' to re-enable interrupts. This patch makes sure it does.

Suggested-by: Benjamin Herrenschmidt
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/traps.c | 8 ++++++++
 1 file changed, 8 insertions(+)
(limited to 'arch')

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 1fd45a8650e1..665f294725cb 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -2088,6 +2088,10 @@ void SPEFloatingPointException(struct pt_regs *regs)
 	int code = FPE_FLTUNK;
 	int err;
+	/* We restore the interrupt state now */
+	if (!arch_irq_disabled_regs(regs))
+		local_irq_enable();
+
 	flush_spe_to_thread(current);
 	spefscr = current->thread.spefscr;
@@ -2133,6 +2137,10 @@ void SPEFloatingPointRoundException(struct pt_regs *regs)
 	extern int speround_handler(struct pt_regs *regs);
 	int err;
+	/* We restore the interrupt state now */
+	if (!arch_irq_disabled_regs(regs))
+		local_irq_enable();
+
 	preempt_disable();
 	if (regs->msr & MSR_SPE)
 		giveup_spe(current);
--
cgit v1.2.3-59-g8ed1b
From f97dec21a306967edbc49ce46f3ecefa3cd16907 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 30 Apr 2019 12:38:58 +0000
Subject: powerpc/32: enter syscall with MSR_EE unconditionally set

Syscalls are expected to be entered with MSR_EE set. Let's make that unconditional by forcing MSR_EE on syscalls. This patch adds EXC_XFER_SYS for that.
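The difference is easiest to see by putting the new macro next to the one it replaces for syscalls, condensed here from head_32.h as modified by the diff below:

	/* Existing: the EE bit is copied from the interrupted context
	 * (COPY_EE), so entry inherits the caller's interrupt state. */
	#define EXC_XFER_EE_LITE(n, hdlr) \
		EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \
				  ret_from_except)

	/* New: MSR_EE is part of the target MSR and nothing overrides it
	 * (NOCOPY), so DoSyscall always starts with interrupts enabled. */
	#define EXC_XFER_SYS(n, hdlr) \
		EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \
				  ret_from_except)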
Suggested-by: Benjamin Herrenschmidt [split out from benh's RFC patch] Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 2 +- arch/powerpc/kernel/head_32.h | 4 ++++ arch/powerpc/kernel/head_40x.S | 2 +- arch/powerpc/kernel/head_44x.S | 2 +- arch/powerpc/kernel/head_8xx.S | 2 +- arch/powerpc/kernel/head_booke.h | 4 ++++ arch/powerpc/kernel/head_fsl_booke.S | 2 +- 7 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index f98e6b461238..c5fd76d0caf6 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -375,7 +375,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) DO_KVM 0xc00 SystemCall: EXCEPTION_PROLOG - EXC_XFER_EE_LITE(0xc00, DoSyscall) + EXC_XFER_SYS(0xc00, DoSyscall) /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index aa0131bb09b5..7221418a883f 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -123,6 +123,10 @@ label: EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ ret_from_except) +#define EXC_XFER_SYS(n, hdlr) \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ + ret_from_except) + #define EXC_XFER_EE(n, hdlr) \ EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ ret_from_except_full) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index b3a2e55e1c15..3e1b8a85cc0d 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -350,7 +350,7 @@ _ENTRY(saved_ksp_limit) /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) EXCEPTION_PROLOG - EXC_XFER_EE_LITE(0xc00, DoSyscall) + EXC_XFER_SYS(0xc00, DoSyscall) EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_EE) EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_EE) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 34a5df827b38..19268713b692 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -286,7 +286,7 @@ interrupt_base: /* System Call Interrupt */ START_EXCEPTION(SystemCall) NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL) - EXC_XFER_EE_LITE(0x0c00, DoSyscall) + EXC_XFER_SYS(0x0c00, DoSyscall) /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 14c3eb3267b8..aa8e629f7725 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -186,7 +186,7 @@ Alignment: .
= 0xc00 SystemCall: EXCEPTION_PROLOG - EXC_XFER_EE_LITE(0xc00, DoSyscall) + EXC_XFER_SYS(0xc00, DoSyscall) /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 1b22a8dea399..612f54ba1125 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -251,6 +251,10 @@ label: EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ ret_from_except) +#define EXC_XFER_SYS(n, hdlr) \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ + ret_from_except) + #define EXC_XFER_EE(n, hdlr) \ EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ ret_from_except_full) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 567e0ed45ca8..a7bebb996393 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -414,7 +414,7 @@ interrupt_base: /* System Call Interrupt */ START_EXCEPTION(SystemCall) NORMAL_EXCEPTION_PROLOG(SYSCALL) - EXC_XFER_EE_LITE(0x0c00, DoSyscall) + EXC_XFER_SYS(0x0c00, DoSyscall) /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ -- cgit v1.2.3-59-g8ed1b From 642770dd96cb04e7cf8f7677e35cd528cda0a97b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:59 +0000 Subject: powerpc/32: Enter exceptions with MSR_EE unset All exception handlers know when to re-enable interrupts, so it is safer to enter all of them with MSR_EE unset, except for syscalls. Suggested-by: Benjamin Herrenschmidt [split out from benh's RFC patch] Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 68 ++++++++++++++++++------------------ arch/powerpc/kernel/head_32.h | 8 ----- arch/powerpc/kernel/head_40x.S | 44 +++++++++++------------ arch/powerpc/kernel/head_44x.S | 6 ++-- arch/powerpc/kernel/head_8xx.S | 32 ++++++++--------- arch/powerpc/kernel/head_booke.h | 12 ++----- arch/powerpc/kernel/head_fsl_booke.S | 26 +++++++------- 7 files changed, 90 insertions(+), 106 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index c5fd76d0caf6..7f5555e362a1 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -341,7 +341,7 @@ Alignment: mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE(0x600, alignment_exception) + EXC_XFER_STD(0x600, alignment_exception) /* Program check exception */ EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) @@ -362,13 +362,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) bl load_up_fpu /* if from user, just load it up */ b fast_exception_return 1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0x800, kernel_fp_unavailable_exception) + EXC_XFER_LITE(0x800, kernel_fp_unavailable_exception) /* Decrementer */ EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_EE) + EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) /* System call */ .
= 0xc00 @@ -379,7 +379,7 @@ SystemCall: /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_EE) + EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) /* * The Altivec unavailable trap is at 0x0f20. Foo. @@ -611,35 +611,35 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) #define altivec_assist_exception unknown_exception #endif - EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception, EXC_XFER_EE) - EXCEPTION(0x1400, SMI, SMIException, EXC_XFER_EE) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1600, Trap_16, altivec_assist_exception, EXC_XFER_EE) + EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception, EXC_XFER_STD) + EXCEPTION(0x1400, SMI, SMIException, EXC_XFER_STD) + EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1600, Trap_16, altivec_assist_exception, EXC_XFER_STD) EXCEPTION(0x1700, Trap_17, TAUException, EXC_XFER_STD) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1c00, Trap_1c, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2000, RunMode, RunModeException, EXC_XFER_EE) - EXCEPTION(0x2100, Trap_21, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2200, Trap_22, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2300, Trap_23, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2400, Trap_24, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2500, Trap_25, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2600, Trap_26, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2700, Trap_27, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2800, Trap_28, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2900, Trap_29, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2a00, Trap_2a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2b00, Trap_2b, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1c00, Trap_1c, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2000, RunMode, RunModeException, EXC_XFER_STD) + EXCEPTION(0x2100, Trap_21, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2200, Trap_22, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2300, Trap_23, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2400, Trap_24, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2500, Trap_25, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2600, Trap_26, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2700, Trap_27, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2800, Trap_28, unknown_exception, EXC_XFER_STD) + 
EXCEPTION(0x2900, Trap_29, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2a00, Trap_2a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2b00, Trap_2b, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_STD) . = 0x3000 @@ -651,7 +651,7 @@ AltiVecUnavailable: b fast_exception_return #endif /* CONFIG_ALTIVEC */ 1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0xf20, altivec_unavailable_exception) + EXC_XFER_LITE(0xf20, altivec_unavailable_exception) PerformanceMonitor: EXCEPTION_PROLOG diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 7221418a883f..8881b6887841 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -127,12 +127,4 @@ label: EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ ret_from_except) -#define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \ - ret_from_except) - #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 3e1b8a85cc0d..0463f56fc7ab 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -332,7 +332,7 @@ _ENTRY(saved_ksp_limit) mfspr r4,SPRN_DEAR /* Grab the DEAR and save it */ stw r4,_DEAR(r11) addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE(0x600, alignment_exception) + EXC_XFER_STD(0x600, alignment_exception) /* 0x0700 - Program Exception */ START_EXCEPTION(0x0700, ProgramCheck) @@ -342,19 +342,19 @@ _ENTRY(saved_ksp_limit) addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x700, program_check_exception) - EXCEPTION(0x0800, Trap_08, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0900, Trap_09, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0A00, Trap_0A, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0B00, Trap_0B, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x0800, Trap_08, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0900, Trap_09, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0A00, Trap_0A, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0B00, Trap_0B, unknown_exception, EXC_XFER_STD) /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) EXCEPTION_PROLOG EXC_XFER_SYS(0xc00, DoSyscall) - EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_STD) /* 0x1000 - Programmable Interval Timer (PIT) Exception */ . 
= 0x1000 @@ -571,25 +571,25 @@ _ENTRY(saved_ksp_limit) mfspr r10, SPRN_SPRG_SCRATCH0 b InstructionAccess - EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1400, Trap_14, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1400, Trap_14, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) #ifdef CONFIG_IBM405_ERR51 /* 405GP errata 51 */ START_EXCEPTION(0x1700, Trap_17) b DTLBMiss #else - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) #endif - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1A00, Trap_1A, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1B00, Trap_1B, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1C00, Trap_1C, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1D00, Trap_1D, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1E00, Trap_1E, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1F00, Trap_1F, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1A00, Trap_1A, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1B00, Trap_1B, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1C00, Trap_1C, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1D00, Trap_1D, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1E00, Trap_1E, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1F00, Trap_1F, unknown_exception, EXC_XFER_STD) /* Check for a single step debug exception while in an exception * handler before state has been saved. This is to catch the case @@ -665,7 +665,7 @@ Decrementer: FITException: EXCEPTION_PROLOG addi r3,r1,STACK_FRAME_OVERHEAD; - EXC_XFER_EE(0x1010, unknown_exception) + EXC_XFER_STD(0x1010, unknown_exception) /* Watchdog Timer (WDT) Exception. 
(from 0x1020) */ WDTException: diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 19268713b692..0381fdb294a6 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -281,7 +281,7 @@ interrupt_base: FP_UNAVAILABLE_EXCEPTION #else EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \ - FloatingPointUnavailable, unknown_exception, EXC_XFER_EE) + FloatingPointUnavailable, unknown_exception, EXC_XFER_STD) #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) @@ -290,7 +290,7 @@ interrupt_base: /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ - AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE) + AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_STD) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION @@ -298,7 +298,7 @@ interrupt_base: /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) /* Watchdog Timer Interrupt */ /* TODO: Add watchdog support */ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index aa8e629f7725..16b4791a2a9f 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -167,7 +167,7 @@ Alignment: mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE(0x600, alignment_exception) + EXC_XFER_STD(0x600, alignment_exception) /* Program check exception */ EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) @@ -179,8 +179,8 @@ Alignment: /* Decrementer */ EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_EE) + EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) /* System call */ . = 0xc00 @@ -190,8 +190,8 @@ SystemCall: /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0xf00, Trap_0f, unknown_exception, EXC_XFER_EE) + EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) + EXCEPTION(0xf00, Trap_0f, unknown_exception, EXC_XFER_STD) /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. 
@@ -521,13 +521,13 @@ DARFixed:/* Return from dcbx instruction bug workaround */ /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) /* On the MPC8xx, these next four traps are used for development * support of breakpoints and such. Someday I will get around to @@ -549,7 +549,7 @@ DataBreakpoint: mfspr r4,SPRN_BAR stw r4,_DAR(r11) mfspr r5,SPRN_DSISR - EXC_XFER_EE(0x1c00, do_break) + EXC_XFER_STD(0x1c00, do_break) 11: mtcr r10 mfspr r10, SPRN_SPRG_SCRATCH0 @@ -569,10 +569,10 @@ InstructionBreakpoint: mfspr r10, SPRN_SPRG_SCRATCH0 rfi #else - EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_STD) #endif - EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) . = 0x2000 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 612f54ba1125..264976c43f34 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -255,14 +255,6 @@ label: EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ ret_from_except) -#define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \ - ret_from_except) - /* Check for a single step debug exception while in an exception * handler before state has been saved. 
This is to catch the case * where an instruction that we are trying to single step causes @@ -405,7 +397,7 @@ label: mfspr r4,SPRN_DEAR; /* Grab the DEAR and save it */ \ stw r4,_DEAR(r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_EE(0x0600, alignment_exception) + EXC_XFER_STD(0x0600, alignment_exception) #define PROGRAM_EXCEPTION \ START_EXCEPTION(Program) \ @@ -430,7 +422,7 @@ label: bl load_up_fpu; /* if from user, just load it up */ \ b fast_exception_return; \ 1: addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_EE_LITE(0x800, kernel_fp_unavailable_exception) + EXC_XFER_STD(0x800, kernel_fp_unavailable_exception) #ifndef __ASSEMBLY__ struct exception_regs { diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index a7bebb996393..df980ad0f95f 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -383,7 +383,7 @@ interrupt_base: EXC_XFER_LITE(0x0300, handle_page_fault) 1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0x0300, CacheLockingException) + EXC_XFER_LITE(0x0300, CacheLockingException) /* Instruction Storage Interrupt */ INSTRUCTION_STORAGE_EXCEPTION @@ -404,10 +404,10 @@ interrupt_base: #ifdef CONFIG_E200 /* E200 treats 'normal' floating point instructions as FP Unavail exception */ EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ - program_check_exception, EXC_XFER_EE) + program_check_exception, EXC_XFER_STD) #else EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) #endif #endif @@ -418,7 +418,7 @@ interrupt_base: /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION @@ -426,7 +426,7 @@ interrupt_base: /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ EXCEPTION(0x3100, FIT, FixedIntervalTimer, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) /* Watchdog Timer Interrupt */ #ifdef CONFIG_BOOKE_WDT @@ -636,25 +636,25 @@ END_BTB_FLUSH_SECTION bl load_up_spe b fast_exception_return 1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0x2010, KernelSPE) + EXC_XFER_LITE(0x2010, KernelSPE) #elif defined(CONFIG_SPE_POSSIBLE) EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) #endif /* CONFIG_SPE_POSSIBLE */ /* SPE Floating Point Data */ #ifdef CONFIG_SPE EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, - SPEFloatingPointException, EXC_XFER_EE) + SPEFloatingPointException, EXC_XFER_STD) /* SPE Floating Point Round */ EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ - SPEFloatingPointRoundException, EXC_XFER_EE) + SPEFloatingPointRoundException, EXC_XFER_STD) #elif defined(CONFIG_SPE_POSSIBLE) EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) #endif /* CONFIG_SPE_POSSIBLE */ @@ -677,10 +677,10 @@ END_BTB_FLUSH_SECTION unknown_exception) /* Hypercall */ - EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE) + EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_STD) /* Embedded Hypervisor Privilege */ - EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE) + EXCEPTION(0, HV_PRIV, Ehvpriv, 
unknown_exception, EXC_XFER_STD) interrupt_end: -- cgit v1.2.3-59-g8ed1b From 1ae99b4b924ab10452da653baed29d3883705519 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:00 +0000 Subject: powerpc/32: get rid of COPY_EE in exception entry EXC_XFER_TEMPLATE() is not called with COPY_EE anymore so we can get rid of copyee parameters and related COPY_EE and NOCOPY macros. Suggested-by: Benjamin Herrenschmidt [split out from benh's RFC patch] Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.h | 12 ++++-------- arch/powerpc/kernel/head_40x.S | 8 +++----- arch/powerpc/kernel/head_booke.h | 22 ++++++++--------------- 3 files changed, 15 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 8881b6887841..14cb0af2f494 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -103,28 +103,24 @@ label: addi r3,r1,STACK_FRAME_OVERHEAD; \ xfer(n, hdlr) -#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret) \ +#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ LOAD_MSR_KERNEL(r10, msr); \ - copyee(r10, r9); \ bl tfer; \ .long hdlr; \ .long ret -#define COPY_EE(d, s) rlwimi d,s,0,MSR_EE -#define NOCOPY(d, s) - #define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, transfer_to_handler_full, \ ret_from_except_full) #define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) #define EXC_XFER_SYS(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, transfer_to_handler, \ ret_from_except) #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 0463f56fc7ab..fc332d33112e 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -166,8 +166,7 @@ _ENTRY(saved_ksp_limit) CRITICAL_EXCEPTION_PROLOG; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - NOCOPY, crit_transfer_to_handler, \ - ret_from_crit_exc) + crit_transfer_to_handler, ret_from_crit_exc) /* * 0x0100 - Critical Interrupt Exception @@ -651,7 +650,7 @@ _ENTRY(saved_ksp_limit) addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_TEMPLATE(DebugException, 0x2002, \ (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - NOCOPY, crit_transfer_to_handler, ret_from_crit_exc) + crit_transfer_to_handler, ret_from_crit_exc) /* Programmable Interval Timer (PIT) Exception.
(from 0x1000) */ Decrementer: @@ -673,8 +672,7 @@ WDTException: addi r3,r1,STACK_FRAME_OVERHEAD; EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), - NOCOPY, crit_transfer_to_handler, - ret_from_crit_exc) + crit_transfer_to_handler, ret_from_crit_exc) /* * The other Data TLB exceptions bail out to this point diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 264976c43f34..56dd1341eb3d 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -217,8 +217,7 @@ label: CRITICAL_EXCEPTION_PROLOG(intno); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - NOCOPY, crit_transfer_to_handler, \ - ret_from_crit_exc) + crit_transfer_to_handler, ret_from_crit_exc) #define MCHECK_EXCEPTION(n, label, hdlr) \ START_EXCEPTION(label); \ @@ -227,32 +226,27 @@ label: stw r5,_ESR(r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_TEMPLATE(hdlr, n+4, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - NOCOPY, mcheck_transfer_to_handler, \ - ret_from_mcheck_exc) + mcheck_transfer_to_handler, ret_from_mcheck_exc) -#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret) \ +#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ lis r10,msr@h; \ ori r10,r10,msr@l; \ - copyee(r10, r9); \ bl tfer; \ .long hdlr; \ .long ret -#define COPY_EE(d, s) rlwimi d,s,0,16,16 -#define NOCOPY(d, s) - #define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, transfer_to_handler_full, \ ret_from_except_full) #define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) #define EXC_XFER_SYS(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, transfer_to_handler, \ ret_from_except) /* Check for a single step debug exception while in an exception @@ -319,7 +313,7 @@ label: /* continue normal handling for a debug exception... */ \ 2: mfspr r4,SPRN_DBSR; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), NOCOPY, debug_transfer_to_handler, ret_from_debug_exc) + EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), debug_transfer_to_handler, ret_from_debug_exc) #define DEBUG_CRIT_EXCEPTION \ START_EXCEPTION(DebugCrit); \ @@ -372,7 +366,7 @@ label: /* continue normal handling for a critical exception... */ \ 2: mfspr r4,SPRN_DBSR; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), NOCOPY, crit_transfer_to_handler, ret_from_crit_exc) + EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), crit_transfer_to_handler, ret_from_crit_exc) #define DATA_STORAGE_EXCEPTION \ START_EXCEPTION(DataStorage) \ -- cgit v1.2.3-59-g8ed1b From 40530db7c656119b1671aae5bc27811f66f5f424 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:01 +0000 Subject: powerpc: Fix 32-bit handling of MSR_EE on exceptions [text mostly copied from benh's RFC/WIP] ppc32 is still doing something rather gothic and wrong on 32-bit which we stopped doing on 64-bit a while ago.
We have that thing where some handlers "copy" the EE value from the original stack frame into the new MSR before transferring to the handler. Thus for a number of exceptions, we enter the handlers with interrupts enabled. This is rather fishy: some of the stuff that handlers might do early on, such as irq_enter/exit or user_exit, context tracking, etc., should be run with interrupts off as far as we know. Generally our handlers know when to re-enable interrupts if needed. The problem we were having is that we assumed these handlers would return with interrupts enabled. However that isn't the case. Instead, this patch changes things so that we always enter exception handlers with interrupts *off* with the notable exception of syscalls which are special (and get a fast path). Suggested-by: Benjamin Herrenschmidt Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 116 ++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 49 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index d0cea3deb86c..0c555f9f1543 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -37,6 +37,7 @@ #include #include #include +#include #include "head_32.h" @@ -206,19 +207,42 @@ transfer_to_handler_cont: mtspr SPRN_NRI, r0 #endif #ifdef CONFIG_TRACE_IRQFLAGS + /* + * When tracing IRQ state (lockdep) we enable the MMU before we call + * the IRQ tracing functions as they might access vmalloc space or + * perform IOs for console output. + * + * To speed up the syscall path where interrupts stay on, let's check + * first if we are changing the MSR value at all. + */ + tophys(r12, r1) + lwz r12,_MSR(r12) + xor r12,r10,r12 + andi. r12,r12,MSR_EE + bne 1f + + /* MSR isn't changing, just transition directly */ +#endif + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r10 + mtlr r9 + SYNC + RFI /* jump to handler, enable MMU */ + +#ifdef CONFIG_TRACE_IRQFLAGS +1: /* MSR is changing, re-enable MMU so we can notify lockdep. We need to + * keep interrupts disabled at this point otherwise we might risk + * taking an interrupt before we tell lockdep they are enabled. + */ lis r12,reenable_mmu@h ori r12,r12,reenable_mmu@l + LOAD_MSR_KERNEL(r0, MSR_KERNEL) mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r10 + mtspr SPRN_SRR1,r0 SYNC RFI -reenable_mmu: /* re-enable mmu so we can */ - mfmsr r10 - lwz r12,_MSR(r1) - xor r10,r10,r12 - andi. r10,r10,MSR_EE /* Did EE change? */ - beq 1f +reenable_mmu: /* * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1. * If from user mode there is only one stack frame on the stack, and @@ -233,14 +257,24 @@ reenable_mmu: /* re-enable mmu so we can */ * they aren't useful past this point (aren't syscall arguments), * the rest is restored from the exception frame. */ + + /* Are we enabling or disabling interrupts ? */ + andi.
r0,r10,MSR_EE + stwu r1,-32(r1) stw r9,8(r1) stw r11,12(r1) stw r3,16(r1) stw r4,20(r1) stw r5,24(r1) - bl trace_hardirqs_off - lwz r5,24(r1) + + bne- 0f + + /* If we are disabling interrupts (normal case), simply log it with + * lockdep + */ +1: bl trace_hardirqs_off +2: lwz r5,24(r1) lwz r4,20(r1) lwz r3,16(r1) lwz r11,12(r1) @@ -250,15 +284,22 @@ reenable_mmu: /* re-enable mmu so we can */ lwz r6,GPR6(r1) lwz r7,GPR7(r1) lwz r8,GPR8(r1) -1: mtctr r11 + mtctr r11 mtlr r9 bctr /* jump to handler */ -#else /* CONFIG_TRACE_IRQFLAGS */ - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r10 - mtlr r9 - SYNC - RFI /* jump to handler, enable MMU */ + + /* If we are enabling interrupt, this is a syscall. They shouldn't + * happen while interrupts are disabled, so let's do a warning here. + */ +0: trap + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + bl trace_hardirqs_on + + /* Now enable for real */ + mfmsr r10 + ori r10,r10,MSR_EE + mtmsr r10 + b 2b #endif /* CONFIG_TRACE_IRQFLAGS */ #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) @@ -316,29 +357,13 @@ _GLOBAL(DoSyscall) rlwinm r11,r11,0,4,2 stw r11,_CCR(r1) #ifdef CONFIG_TRACE_IRQFLAGS - /* Return from syscalls can (and generally will) hard enable - * interrupts. You aren't supposed to call a syscall with - * interrupts disabled in the first place. However, to ensure - * that we get it right vs. lockdep if it happens, we force - * that hard enable here with appropriate tracing if we see - * that we have been called with interrupts off - */ + /* Make sure interrupts are enabled */ mfmsr r11 andi. r12,r11,MSR_EE - bne+ 1f - /* We came in with interrupts disabled, we enable them now */ - bl trace_hardirqs_on - mfmsr r11 - lwz r0,GPR0(r1) - lwz r3,GPR3(r1) - lwz r4,GPR4(r1) - ori r11,r11,MSR_EE - lwz r5,GPR5(r1) - lwz r6,GPR6(r1) - lwz r7,GPR7(r1) - lwz r8,GPR8(r1) - mtmsr r11 -1: + /* We came in with interrupts disabled, we WARN and mark them enabled + * for lockdep now */ +0: tweqi r12, 0 + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING #endif /* CONFIG_TRACE_IRQFLAGS */ lwz r11,TI_FLAGS(r2) andi. r11,r11,_TIF_SYSCALL_DOTRACE @@ -392,8 +417,7 @@ syscall_exit_cont: lwz r8,_MSR(r1) #ifdef CONFIG_TRACE_IRQFLAGS /* If we are going to return from the syscall with interrupts - * off, we trace that here. It shouldn't happen though but we - * want to catch the bugger if it does right ? + * off, we trace that here. It shouldn't normally happen. */ andi. r10,r8,MSR_EE bne+ 1f @@ -918,13 +942,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) * off in this assembly code while peeking at TI_FLAGS() and such. However * we need to inform it if the exception turned interrupts off, and we * are about to trun them back on. - * - * The problem here sadly is that we don't know whether the exceptions was - * one that turned interrupts off or not. So we always tell lockdep about - * turning them on here when we go back to wherever we came from with EE - * on, even if that may meen some redudant calls being tracked. Maybe later - * we could encode what the exception did somewhere or test the exception - * type in the pt_regs but that sounds overkill */ andi. r10,r9,MSR_EE beq 1f @@ -1212,9 +1229,10 @@ do_work: /* r10 contains MSR_KERNEL here */ beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ - /* Note: We don't need to inform lockdep that we are enabling - * interrupts here. 
As far as it knows, they are already enabled - */ +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_on + mfmsr r10 +#endif ori r10,r10,MSR_EE SYNC MTMSRD(r10) /* hard-enable interrupts */ -- cgit v1.2.3-59-g8ed1b From b86fb88855ea7881314b935df1df6b1ef1bd0c32 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:02 +0000 Subject: powerpc/32: implement fast entry for syscalls on non BOOKE Syscalls don't have to preserve non-volatile registers except LR. This patch therefore implements a fast entry for syscalls, where volatile registers get clobbered. As this entry is dedicated to syscalls, it always sets MSR_EE and warns in case MSR_EE was previously off. It also assumes that the call is always from user; system calls from the kernel are unexpected. The overall series improves the null_syscall selftest by 12.5% on an 83xx and by 17% on an 8xx. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 32 ++++++++++++++++ arch/powerpc/kernel/head_32.S | 3 +- arch/powerpc/kernel/head_32.h | 85 ++++++++++++++++++++++++++++++++++++++++-- arch/powerpc/kernel/head_40x.S | 3 +- arch/powerpc/kernel/head_8xx.S | 3 +- 5 files changed, 116 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0c555f9f1543..184cc1de2f37 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -342,6 +342,35 @@ stack_ovf: SYNC RFI +#ifndef CONFIG_BOOKE /* to be removed once BOOKE uses fast syscall entry */ +#ifdef CONFIG_TRACE_IRQFLAGS +trace_syscall_entry_irq_off: + /* + * Syscall shouldn't happen while interrupts are disabled, + * so let's do a warning here. + */ +0: trap + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + bl trace_hardirqs_on + + /* Now enable for real */ + LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) + mtmsr r10 + + REST_GPR(0, r1) + REST_4GPRS(3, r1) + REST_2GPRS(7, r1) + b DoSyscall +#endif /* CONFIG_TRACE_IRQFLAGS */ + + .globl transfer_to_syscall +transfer_to_syscall: +#ifdef CONFIG_TRACE_IRQFLAGS + andi. r12,r9,MSR_EE + beq- trace_syscall_entry_irq_off +#endif /* CONFIG_TRACE_IRQFLAGS */ +#endif /* !CONFIG_BOOKE */ + /* * Handle a system call. */ @@ -353,9 +382,11 @@ _GLOBAL(DoSyscall) stw r3,ORIG_GPR3(r1) li r12,0 stw r12,RESULT(r1) +#ifdef CONFIG_BOOKE /* to be removed once BOOKE uses fast syscall entry */ lwz r11,_CCR(r1) /* Clear SO bit in CR */ rlwinm r11,r11,0,4,2 stw r11,_CCR(r1) +#endif #ifdef CONFIG_TRACE_IRQFLAGS /* Make sure interrupts are enabled */ mfmsr r11 @@ -1219,6 +1250,7 @@ load_dbcr0: .section .bss .align 4 + .global global_dbcr0 global_dbcr0: .space 8*NR_CPUS .previous diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 7f5555e362a1..755fab9641d6 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -374,8 +374,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) .
= 0xc00 DO_KVM 0xc00 SystemCall: - EXCEPTION_PROLOG - EXC_XFER_SYS(0xc00, DoSyscall) + SYSCALL_ENTRY 0xc00 /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 14cb0af2f494..4a692553651f 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -73,6 +73,87 @@ SAVE_2GPRS(7, r11) .endm +.macro SYSCALL_ENTRY trapno + mfspr r12,SPRN_SPRG_THREAD + mfcr r10 + lwz r11,TASK_STACK-THREAD(r12) + mflr r9 + addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE + rlwinm r10,r10,0,4,2 /* Clear SO bit in CR */ + tophys(r11,r11) + stw r10,_CCR(r11) /* save registers */ + mfspr r10,SPRN_SRR0 + stw r9,_LINK(r11) + mfspr r9,SPRN_SRR1 + stw r1,GPR1(r11) + stw r1,0(r11) + tovirt(r1,r11) /* set new kernel sp */ + stw r10,_NIP(r11) +#ifdef CONFIG_40x + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ +#else + LOAD_MSR_KERNEL(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ + MTMSRD(r10) /* (except for mach check in rtas) */ +#endif + lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + stw r2,GPR2(r11) + addi r10,r10,STACK_FRAME_REGS_MARKER@l + stw r9,_MSR(r11) + li r2, \trapno + 1 + stw r10,8(r11) + stw r2,_TRAP(r11) + SAVE_GPR(0, r11) + SAVE_4GPRS(3, r11) + SAVE_2GPRS(7, r11) + addi r11,r1,STACK_FRAME_OVERHEAD + addi r2,r12,-THREAD + stw r11,PT_REGS(r12) +#if defined(CONFIG_40x) + /* Check to see if the dbcr0 register is set up to debug. Use the + internal debug mode bit to do this. */ + lwz r12,THREAD_DBCR0(r12) + andis. r12,r12,DBCR0_IDM@h +#endif + ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) +#if defined(CONFIG_40x) + beq+ 3f + /* From user and task is ptraced - load up global dbcr0 */ + li r12,-1 /* clear all pending debug events */ + mtspr SPRN_DBSR,r12 + lis r11,global_dbcr0@ha + tophys(r11,r11) + addi r11,r11,global_dbcr0@l + lwz r12,0(r11) + mtspr SPRN_DBCR0,r12 + lwz r12,4(r11) + addi r12,r12,-1 + stw r12,4(r11) +#endif + +3: + tovirt(r2, r2) /* set r2 to current */ + lis r11, transfer_to_syscall@h + ori r11, r11, transfer_to_syscall@l +#ifdef CONFIG_TRACE_IRQFLAGS + /* + * If MSR is changing we need to keep interrupts disabled at this point + * otherwise we might risk taking an interrupt before we tell lockdep + * they are enabled. + */ + LOAD_MSR_KERNEL(r10, MSR_KERNEL) + rlwimi r10, r9, 0, MSR_EE +#else + LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) +#endif +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) + mtspr SPRN_NRI, r0 +#endif + mtspr SPRN_SRR1,r10 + mtspr SPRN_SRR0,r11 + SYNC + RFI /* jump to handler, enable MMU */ +.endm + /* * Note: code which follows this uses cr0.eq (set if from kernel), * r11, r12 (SRR0), and r9 (SRR1). 
@@ -119,8 +200,4 @@ label: EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) -#define EXC_XFER_SYS(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, transfer_to_handler, \ - ret_from_except) - #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index fc332d33112e..cf54b784100d 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -348,8 +348,7 @@ _ENTRY(saved_ksp_limit) /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) - EXCEPTION_PROLOG - EXC_XFER_SYS(0xc00, DoSyscall) + SYSCALL_ENTRY 0xc00 EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_STD) EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_STD) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 16b4791a2a9f..885be7f3d29a 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -185,8 +185,7 @@ Alignment: /* System call */ . = 0xc00 SystemCall: - EXCEPTION_PROLOG - EXC_XFER_SYS(0xc00, DoSyscall) + SYSCALL_ENTRY 0xc00 /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) -- cgit v1.2.3-59-g8ed1b From 1a4b739bbb4f8857d1b4feb46d6b3ec72269c111 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:03 +0000 Subject: powerpc/32: implement fast entry for syscalls on BOOKE Syscalls don't have to preserve non-volatile registers except LR. This patch therefore implements a fast entry for syscalls, where volatile registers get clobbered. As this entry is dedicated to syscalls, it always sets MSR_EE and warns in case MSR_EE was previously off. It also assumes that the call is always from user; system calls from the kernel are unexpected. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 7 --- arch/powerpc/kernel/head_44x.S | 3 +- arch/powerpc/kernel/head_booke.h | 103 +++++++++++++++++++++++++++++++++-- arch/powerpc/kernel/head_fsl_booke.S | 3 +- 4 files changed, 100 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 184cc1de2f37..dc58fec51ed6 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -342,7 +342,6 @@ stack_ovf: SYNC RFI -#ifndef CONFIG_BOOKE /* to be removed once BOOKE uses fast syscall entry */ #ifdef CONFIG_TRACE_IRQFLAGS trace_syscall_entry_irq_off: /* @@ -369,7 +368,6 @@ transfer_to_syscall: andi. r12,r9,MSR_EE beq- trace_syscall_entry_irq_off #endif /* CONFIG_TRACE_IRQFLAGS */ -#endif /* !CONFIG_BOOKE */ /* * Handle a system call.
@@ -382,11 +380,6 @@ _GLOBAL(DoSyscall) stw r3,ORIG_GPR3(r1) li r12,0 stw r12,RESULT(r1) -#ifdef CONFIG_BOOKE /* to be removed once BOOKE uses fast syscall entry */ - lwz r11,_CCR(r1) /* Clear SO bit in CR */ - rlwinm r11,r11,0,4,2 - stw r11,_CCR(r1) -#endif #ifdef CONFIG_TRACE_IRQFLAGS /* Make sure interrupts are enabled */ mfmsr r11 diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 0381fdb294a6..f15fba58c744 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -285,8 +285,7 @@ interrupt_base: #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) - NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL) - EXC_XFER_SYS(0x0c00, DoSyscall) + SYSCALL_ENTRY 0xc00 BOOKE_INTERRUPT_SYSCALL /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 56dd1341eb3d..bfeb469e8106 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -6,6 +6,8 @@ #include #include +#ifdef __ASSEMBLY__ + /* * Macros used for common Book-e exception handling */ @@ -81,6 +83,101 @@ END_BTB_FLUSH_SECTION SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) +.macro SYSCALL_ENTRY trapno intno + mfspr r10, SPRN_SPRG_THREAD +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mtspr SPRN_SPRG_WSCRATCH0, r10 + stw r11, THREAD_NORMSAVE(0)(r10) + stw r13, THREAD_NORMSAVE(2)(r10) + mfcr r13 /* save CR in r13 for now */ + mfspr r11, SPRN_SRR1 + mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ + bf 3, 1975f + b kvmppc_handler_BOOKE_INTERRUPT_\intno\()_SPRN_SRR1 +1975: + mr r12, r13 + lwz r13, THREAD_NORMSAVE(2)(r10) +FTR_SECTION_ELSE +#endif + mfcr r12 +#ifdef CONFIG_KVM_BOOKE_HV +ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) +#endif + BOOKE_CLEAR_BTB(r11) + lwz r11, TASK_STACK - THREAD(r10) + rlwinm r12,r12,0,4,2 /* Clear SO bit in CR */ + ALLOC_STACK_FRAME(r11, THREAD_SIZE - INT_FRAME_SIZE) + stw r12, _CCR(r11) /* save various registers */ + mflr r12 + stw r12,_LINK(r11) + mfspr r12,SPRN_SRR0 + stw r1, GPR1(r11) + mfspr r9,SPRN_SRR1 + stw r1, 0(r11) + mr r1, r11 + stw r12,_NIP(r11) + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ + lis r12, STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + stw r2,GPR2(r11) + addi r12, r12, STACK_FRAME_REGS_MARKER@l + stw r9,_MSR(r11) + li r2, \trapno + 1 + stw r12, 8(r11) + stw r2,_TRAP(r11) + SAVE_GPR(0, r11) + SAVE_4GPRS(3, r11) + SAVE_2GPRS(7, r11) + + addi r11,r1,STACK_FRAME_OVERHEAD + addi r2,r10,-THREAD + stw r11,PT_REGS(r10) + /* Check to see if the dbcr0 register is set up to debug. Use the + internal debug mode bit to do this. */ + lwz r12,THREAD_DBCR0(r10) + andis. r12,r12,DBCR0_IDM@h + ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) + beq+ 3f + /* From user and task is ptraced - load up global dbcr0 */ + li r12,-1 /* clear all pending debug events */ + mtspr SPRN_DBSR,r12 + lis r11,global_dbcr0@ha + tophys(r11,r11) + addi r11,r11,global_dbcr0@l +#ifdef CONFIG_SMP + lwz r9,TASK_CPU(r2) + slwi r9,r9,3 + add r11,r11,r9 +#endif + lwz r12,0(r11) + mtspr SPRN_DBCR0,r12 + lwz r12,4(r11) + addi r12,r12,-1 + stw r12,4(r11) + +3: + tovirt(r2, r2) /* set r2 to current */ + lis r11, transfer_to_syscall@h + ori r11, r11, transfer_to_syscall@l +#ifdef CONFIG_TRACE_IRQFLAGS + /* + * If MSR is changing we need to keep interrupts disabled at this point + * otherwise we might risk taking an interrupt before we tell lockdep + * they are enabled. 
+ */ + lis r10, MSR_KERNEL@h + ori r10, r10, MSR_KERNEL@l + rlwimi r10, r9, 0, MSR_EE +#else + lis r10, (MSR_KERNEL | MSR_EE)@h + ori r10, r10, (MSR_KERNEL | MSR_EE)@l +#endif + mtspr SPRN_SRR1,r10 + mtspr SPRN_SRR0,r11 + SYNC + RFI /* jump to handler, enable MMU */ +.endm + /* To handle the additional exception priority levels on 40x and Book-E * processors we allocate a stack per additional priority level. * @@ -245,10 +342,6 @@ label: EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) -#define EXC_XFER_SYS(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, transfer_to_handler, \ - ret_from_except) - /* Check for a single step debug exception while in an exception * handler before state has been saved. This is to catch the case * where an instruction that we are trying to single step causes @@ -418,7 +511,7 @@ label: 1: addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_STD(0x800, kernel_fp_unavailable_exception) -#ifndef __ASSEMBLY__ +#else /* __ASSEMBLY__ */ struct exception_regs { unsigned long mas0; unsigned long mas1; diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index df980ad0f95f..6621f230cc37 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -413,8 +413,7 @@ interrupt_base: /* System Call Interrupt */ START_EXCEPTION(SystemCall) - NORMAL_EXCEPTION_PROLOG(SYSCALL) - EXC_XFER_SYS(0x0c00, DoSyscall) + SYSCALL_ENTRY 0xc00 SYSCALL /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ -- cgit v1.2.3-59-g8ed1b From 38b4564cf042ad3f5333692687023803c1ab1112 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:04 +0000 Subject: powerpc/32: don't do syscall stuff in transfer_to_handler As syscalls are now handled via a fast entry path, syscall related actions can be removed from the generic transfer_to_handler path. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index dc58fec51ed6..e65c3e70c648 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -217,7 +217,6 @@ transfer_to_handler_cont: */ tophys(r12, r1) lwz r12,_MSR(r12) - xor r12,r10,r12 andi. r12,r12,MSR_EE bne 1f @@ -258,9 +257,6 @@ reenable_mmu: * the rest is restored from the exception frame. */ - /* Are we enabling or disabling interrupts ? */ - andi. r0,r10,MSR_EE - stwu r1,-32(r1) stw r9,8(r1) stw r11,12(r1) @@ -268,8 +264,6 @@ reenable_mmu: stw r4,20(r1) stw r5,24(r1) - bne- 0f - /* If we are disabling interrupts (normal case), simply log it with * lockdep */ @@ -287,19 +281,6 @@ reenable_mmu: mtctr r11 mtlr r9 bctr /* jump to handler */ - - /* If we are enabling interrupt, this is a syscall. They shouldn't - * happen while interrupts are disabled, so let's do a warning here. 
- */ -0: trap - EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING - bl trace_hardirqs_on - - /* Now enable for real */ - mfmsr r10 - ori r10,r10,MSR_EE - mtmsr r10 - b 2b #endif /* CONFIG_TRACE_IRQFLAGS */ -- cgit v1.2.3-59-g8ed1b From d1865e71cdc9b75b6a6716a2983eb5d6004cfca9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:05 +0000 Subject: powerpc/32: Don't add dummy frames when calling trace_hardirqs_on/off No need to add dummy frames when calling trace_hardirqs_on or trace_hardirqs_off. GCC properly handles empty stacks. In addition, powerpc doesn't set CONFIG_FRAME_POINTER, therefore __builtin_return_address(1..) returns NULL at all times. So the dummy frames are definitely unneeded here. While at it, avoid reading memory to load r1 with a value we already know. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index e65c3e70c648..235a01d34b6d 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -243,12 +243,7 @@ transfer_to_handler_cont: reenable_mmu: /* - * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1. - * If from user mode there is only one stack frame on the stack, and - * accessing CALLER_ADDR1 will cause oops. So we need create a dummy - * stack frame to make trace_hardirqs_off happy. - * - * This is handy because we also need to save a bunch of GPRs, + * We save a bunch of GPRs, * r3 can be different from GPR3(r1) at this point, r9 and r11 * contains the old MSR and handler address respectively, * r4 & r5 can contain page fault arguments that need to be passed @@ -950,18 +945,11 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) */ andi. r10,r9,MSR_EE beq 1f - /* - * Since the ftrace irqsoff latency trace checks CALLER_ADDR1, - * which is the stack frame here, we need to force a stack frame - * in case we came from user space.
- */ stwu r1,-32(r1) mflr r0 stw r0,4(r1) - stwu r1,-32(r1) bl trace_hardirqs_on - lwz r1,0(r1) - lwz r1,0(r1) + addi r1, r1, 32 lwz r9,_MSR(r1) 1: #endif /* CONFIG_TRACE_IRQFLAGS */ -- cgit v1.2.3-59-g8ed1b From 9c1d38b34e944cace44e0d2bea0beb5601a4d36d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:39 +0000 Subject: powerpc/fadump: define an empty fadump_cleanup() To avoid #ifdefs, define a static inline fadump_cleanup() function when CONFIG_FA_DUMP is not selected. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/fadump.h | 1 + arch/powerpc/kernel/setup-common.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 188776befaf9..e2099c0a15c3 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -219,5 +219,6 @@ extern void fadump_cleanup(void); static inline int is_fadump_active(void) { return 0; } static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } +static inline void fadump_cleanup(void) { } #endif #endif diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 3f8805d3c0c9..13054980e11a 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -134,13 +134,11 @@ int crashing_cpu = -1; /* also used by kexec */ void machine_shutdown(void) { -#ifdef CONFIG_FA_DUMP /* * if fadump is active, cleanup the fadump registration before we * shutdown. */ fadump_cleanup(); -#endif if (ppc_md.machine_shutdown) ppc_md.machine_shutdown(); -- cgit v1.2.3-59-g8ed1b From 93f2cd813797baf5590459fb0439c62e873b7748 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:40 +0000 Subject: powerpc/mm: define an empty mm_iommu_init() To avoid ifdefs, define an empty static inline mm_iommu_init() function when CONFIG_SPAPR_TCE_IOMMU is not selected.
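Both this and the previous fadump_cleanup() stub follow the usual kernel idiom for compiling out an optional subsystem: the header declares the real function when the option is selected and supplies an empty static inline otherwise, so callers need no #ifdef and the compiler discards the call entirely when the option is off. A generic sketch of the pattern (CONFIG_FOO and foo_init() are hypothetical, not from these patches):

	struct mm_struct;

	#ifdef CONFIG_FOO
	extern void foo_init(struct mm_struct *mm);	/* real implementation elsewhere */
	#else
	static inline void foo_init(struct mm_struct *mm) { }	/* no-op when off */
	#endif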
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 1 + arch/powerpc/kernel/setup-common.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 66a3805dc935..611204e588b9 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -52,6 +52,7 @@ static inline bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, { return false; } +static inline void mm_iommu_init(struct mm_struct *mm) { } #endif extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 13054980e11a..d06d50fe1e7e 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -931,9 +931,7 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; -#ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); -#endif irqstack_early_init(); exc_lvl_early_init(); emergency_stack_init(); -- cgit v1.2.3-59-g8ed1b From e9e9b25a4c99eec0e678c78124ae79764d8f777a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:42 +0000 Subject: powerpc/setup: Remove unnecessary #ifdef CONFIG_ALTIVEC CPU_FTR_ALTIVEC is only set when CONFIG_ALTIVEC is selected, so the ifdef is unnecessary. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index d06d50fe1e7e..3cb3774f380a 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -251,10 +251,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) else seq_printf(m, "unknown (%08x)", pvr); -#ifdef CONFIG_ALTIVEC if (cpu_has_feature(CPU_FTR_ALTIVEC)) seq_printf(m, ", altivec supported"); -#endif /* CONFIG_ALTIVEC */ seq_printf(m, "\n"); -- cgit v1.2.3-59-g8ed1b From b5064efee2211f83b98a6a69e7319257c8411221 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:43 +0000 Subject: powerpc/setup: cleanup ifdef mess in check_cache_coherency() Use IS_ENABLED() instead of #ifdefs Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 3cb3774f380a..6d3ebc40d21c 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -737,23 +737,19 @@ void __init setup_panic(void) * BUG() in that case. */ -#ifdef CONFIG_NOT_COHERENT_CACHE -#define KERNEL_COHERENCY 0 -#else -#define KERNEL_COHERENCY 1 -#endif +#define KERNEL_COHERENCY (!IS_ENABLED(CONFIG_NOT_COHERENT_CACHE)) static int __init check_cache_coherency(void) { struct device_node *np; const void *prop; - int devtree_coherency; + bool devtree_coherency; np = of_find_node_by_path("/"); prop = of_get_property(np, "coherency-off", NULL); of_node_put(np); - devtree_coherency = prop ? 0 : 1; + devtree_coherency = prop ? 
false : true; if (devtree_coherency != KERNEL_COHERENCY) { printk(KERN_ERR -- cgit v1.2.3-59-g8ed1b From 48018e42e5c70c8ac4b222cc76af1a15ea2e09e7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:44 +0000 Subject: powerpc/setup: cleanup the #ifdef CONFIG_TAU block Use cpu_has_feature() instead of open-coding the feature check. Use IS_ENABLED() instead of #ifdef for CONFIG_TAU_AVERAGE. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 6d3ebc40d21c..c755fe6ec8ef 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -257,18 +257,18 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "\n"); #ifdef CONFIG_TAU - if (cur_cpu_spec->cpu_features & CPU_FTR_TAU) { -#ifdef CONFIG_TAU_AVERAGE - /* more straightforward, but potentially misleading */ - seq_printf(m, "temperature \t: %u C (uncalibrated)\n", - cpu_temp(cpu_id)); -#else - /* show the actual temp sensor range */ - u32 temp; - temp = cpu_temp_both(cpu_id); - seq_printf(m, "temperature \t: %u-%u C (uncalibrated)\n", - temp & 0xff, temp >> 16); -#endif + if (cpu_has_feature(CPU_FTR_TAU)) { + if (IS_ENABLED(CONFIG_TAU_AVERAGE)) { + /* more straightforward, but potentially misleading */ + seq_printf(m, "temperature \t: %u C (uncalibrated)\n", + cpu_temp(cpu_id)); + } else { + /* show the actual temp sensor range */ + u32 temp; + temp = cpu_temp_both(cpu_id); + seq_printf(m, "temperature \t: %u-%u C (uncalibrated)\n", + temp & 0xff, temp >> 16); + } } #endif /* CONFIG_TAU */ -- cgit v1.2.3-59-g8ed1b From 65184f2f045abc0eb35f934f6cbf7e23b9875e7c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:45 +0000 Subject: powerpc/setup: replace ifdefs by IS_ENABLED() wherever possible. Compared to ifdefs, IS_ENABLED() provides cleaner code and allows compile failures to be detected regardless of the selected options.
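The compile-coverage point is the real win: with IS_ENABLED() both branches are always parsed and type-checked, and the dead one is discarded by the optimizer, whereas code inside #ifdef is invisible to the compiler in configurations that disable it. A sketch, using a hypothetical CONFIG_FOO and foo_value:

static void show_foo(struct seq_file *m)
{
	unsigned long foo_value = 42;	/* hypothetical placeholder */

	/* body is compiled (and so kept buildable) even when CONFIG_FOO=n */
	if (IS_ENABLED(CONFIG_FOO))
		seq_printf(m, "foo\t: %lu\n", foo_value);
}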
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 39 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index c755fe6ec8ef..aad9f5df6ab6 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -199,14 +199,15 @@ static void show_cpuinfo_summary(struct seq_file *m) { struct device_node *root; const char *model = NULL; -#if defined(CONFIG_SMP) && defined(CONFIG_PPC32) unsigned long bogosum = 0; int i; - for_each_online_cpu(i) - bogosum += loops_per_jiffy; - seq_printf(m, "total bogomips\t: %lu.%02lu\n", - bogosum/(500000/HZ), bogosum/(5000/HZ) % 100); -#endif /* CONFIG_SMP && CONFIG_PPC32 */ + + if (IS_ENABLED(CONFIG_SMP) && IS_ENABLED(CONFIG_PPC32)) { + for_each_online_cpu(i) + bogosum += loops_per_jiffy; + seq_printf(m, "total bogomips\t: %lu.%02lu\n", + bogosum / (500000 / HZ), bogosum / (5000 / HZ) % 100); + } seq_printf(m, "timebase\t: %lu\n", ppc_tb_freq); if (ppc_md.name) seq_printf(m, "platform\t: %s\n", ppc_md.name); @@ -220,11 +221,10 @@ static void show_cpuinfo_summary(struct seq_file *m) if (ppc_md.show_cpuinfo != NULL) ppc_md.show_cpuinfo(m); -#ifdef CONFIG_PPC32 /* Display the amount of memory */ - seq_printf(m, "Memory\t\t: %d MB\n", - (unsigned int)(total_memory / (1024 * 1024))); -#endif + if (IS_ENABLED(CONFIG_PPC32)) + seq_printf(m, "Memory\t\t: %d MB\n", + (unsigned int)(total_memory / (1024 * 1024))); } static int show_cpuinfo(struct seq_file *m, void *v) @@ -332,11 +332,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "revision\t: %hd.%hd (pvr %04x %04x)\n", maj, min, PVR_VER(pvr), PVR_REV(pvr)); -#ifdef CONFIG_PPC32 - seq_printf(m, "bogomips\t: %lu.%02lu\n", - loops_per_jiffy / (500000/HZ), - (loops_per_jiffy / (5000/HZ)) % 100); -#endif + if (IS_ENABLED(CONFIG_PPC32)) + seq_printf(m, "bogomips\t: %lu.%02lu\n", loops_per_jiffy / (500000 / HZ), + (loops_per_jiffy / (5000 / HZ)) % 100); + seq_printf(m, "\n"); /* If this is the last cpu, print the summary */ @@ -934,9 +933,9 @@ void __init setup_arch(char **cmdline_p) early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); -#ifdef CONFIG_DUMMY_CONSOLE - conswitchp = &dummy_con; -#endif + if (IS_ENABLED(CONFIG_DUMMY_CONSOLE)) + conswitchp = &dummy_con; + if (ppc_md.setup_arch) ppc_md.setup_arch(); @@ -948,10 +947,8 @@ void __init setup_arch(char **cmdline_p) /* Initialize the MMU context management stuff. */ mmu_context_init(); -#ifdef CONFIG_PPC64 /* Interrupt code needs to be 64K-aligned. */ - if ((unsigned long)_stext & 0xffff) + if (IS_ENABLED(CONFIG_PPC64) && (unsigned long)_stext & 0xffff) panic("Kernelbase not 64K-aligned (0x%lx)!\n", (unsigned long)_stext); -#endif } -- cgit v1.2.3-59-g8ed1b From 502523fd1d2ac559b41d8302dc9f826f578ec54d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Mar 2019 18:47:27 +0100 Subject: powerpc/irq: drop __irq_offset_value This patch drops __irq_offset_value, which has not been used since commit 9c4cb8251513 ("powerpc: Remove use of CONFIG_PPC_MERGE"). This removes a sparse warning.
Fixes: 9c4cb8251513 ("powerpc: Remove use of CONFIG_PPC_MERGE") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 8a936723c791..6672fec75e2a 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -81,10 +81,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); -int __irq_offset_value; - #ifdef CONFIG_PPC32 -EXPORT_SYMBOL(__irq_offset_value); atomic_t ppc_n_lost_interrupts; #ifdef CONFIG_TAU_INT -- cgit v1.2.3-59-g8ed1b From e2b36d591720d81741f37e047a6f0047e8c89369 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 May 2019 15:21:07 +1000 Subject: powerpc/64: Don't trace code that runs with the soft irq mask unreconciled "Reconciling", in terms of interrupt handling, means bringing the soft irq mask state into sync with the hardware, after an interrupt causes MSR[EE] to be cleared (while the soft mask may be enabled, and hard irqs not marked disabled). General kernel code should not be called while unreconciled, because local_irq_disable, etc. manipulations can cause surprising irq traces, and it's fragile because the soft irq code does not really expect to be called in this situation. When exiting from an interrupt, MSR[EE] is cleared to prevent races, but soft irq state is enabled for the returned-to context, so this is now an unreconciled state. restore_math is called in this state, and that can be ftraced, and the ftrace subsystem disables local irqs. Mark restore_math and its callees as notrace. Restore a sanity check in the soft irq code that had to be disabled for this case, by commit 4da1f79227ad4 ("powerpc/64: Disable irq restore warning for now"). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fpu.S | 1 + arch/powerpc/kernel/irq.c | 13 +++---------- arch/powerpc/kernel/process.c | 18 +++++++++++++++--- arch/powerpc/kernel/vector.S | 1 + 4 files changed, 20 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 529dcc21c3f9..cecd57e1d046 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -63,6 +63,7 @@ _GLOBAL(load_fp_state) REST_32FPVSRS(0, R4, R3) blr EXPORT_SYMBOL(load_fp_state) +_ASM_NOKPROBE_SYMBOL(load_fp_state); /* used by restore_math */ /* * Store FP state into memory, including FPSCR diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 6672fec75e2a..ada901af4950 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -258,16 +258,9 @@ notrace void arch_local_irq_restore(unsigned long mask) */ irq_happened = get_irq_happened(); if (!irq_happened) { - /* - * FIXME. Here we'd like to be able to do: - * - * #ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - * WARN_ON(!(mfmsr() & MSR_EE)); - * #endif - * - * But currently it hits in a few paths, we should fix those and - * enable the warning.
- */ +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON(!(mfmsr() & MSR_EE)); +#endif return; } diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0c2017357073..87da40129927 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -134,7 +134,8 @@ static int __init enable_strict_msr_control(char *str) } early_param("ppc_strict_facility_enable", enable_strict_msr_control); -unsigned long msr_check_and_set(unsigned long bits) +/* notrace because it's called by restore_math */ +unsigned long notrace msr_check_and_set(unsigned long bits) { unsigned long oldmsr = mfmsr(); unsigned long newmsr; @@ -153,7 +154,8 @@ unsigned long msr_check_and_set(unsigned long bits) } EXPORT_SYMBOL_GPL(msr_check_and_set); -void __msr_check_and_clear(unsigned long bits) +/* notrace because it's called by restore_math */ +void notrace __msr_check_and_clear(unsigned long bits) { unsigned long oldmsr = mfmsr(); unsigned long newmsr; @@ -526,7 +528,17 @@ void giveup_all(struct task_struct *tsk) } EXPORT_SYMBOL(giveup_all); -void restore_math(struct pt_regs *regs) +/* + * The exception exit path calls restore_math() with interrupts hard disabled + * but the soft irq state not "reconciled". ftrace code that calls + * local_irq_save/restore causes warnings. + * + * Rather than complicate the exit path, just don't trace restore_math. This + * could be done by having ftrace entry code check for this un-reconciled + * condition where MSR[EE]=0 and PACA_IRQ_HARD_DIS is not set, and + * temporarily fix it up for the duration of the ftrace call. + */ +void notrace restore_math(struct pt_regs *regs) { unsigned long msr; diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 21165da0052d..8eb867dbad5f 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -21,6 +21,7 @@ _GLOBAL(load_vr_state) REST_32VRS(0,r4,r3) blr EXPORT_SYMBOL(load_vr_state) +_ASM_NOKPROBE_SYMBOL(load_vr_state); /* used by restore_math */ /* * Store VMX state into memory, including VSCR. -- cgit v1.2.3-59-g8ed1b From c9e0fc33b8be52a7134ed0ee79b6a1e332e1b9d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 30 Apr 2019 14:27:39 -0400 Subject: powerpc: remove the __kernel_io_end export This export was added in this merge window, but without any actual user, or justification for a modular user. Fixes: a35a3c6f6065 ("powerpc/mm/hash64: Add a variable to track the end of IO mapping") Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 95ed76519411..4c6a73782b19 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -97,7 +97,6 @@ EXPORT_SYMBOL(__vmalloc_end); unsigned long __kernel_io_start; EXPORT_SYMBOL(__kernel_io_start); unsigned long __kernel_io_end; -EXPORT_SYMBOL(__kernel_io_end); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; -- cgit v1.2.3-59-g8ed1b From 5f18cbdbdd42b050c51eb9859f8ce43db3f51846 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 2 May 2019 17:39:46 +1000 Subject: powerpc/mm/ptdump: Wrap seq_printf() to handle NULL pointers Lovingly borrowed from the arch/arm64 ptdump code. This doesn't seem to be an issue in practice, but is necessary for my upcoming commit. 
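Reduced to its essentials, the wrapper added below just makes the seq_file pointer optional:

#define pt_dump_seq_printf(m, fmt, args...)	\
({						\
	if (m)					\
		seq_printf(m, fmt, ##args);	\
})

A NULL st->seq then means "walk the page tables but print nothing", which is exactly what a boot-time checker, as opposed to the debugfs dump, needs.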
Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/mm/ptdump/ptdump.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 48135ba6fa74..e249a56c07bf 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -108,6 +108,18 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_putc(m, c) \ +({ \ + if (m) \ + seq_putc(m, c); \ +}) + static void dump_flag_info(struct pg_state *st, const struct flag_info *flag, u64 pte, int num) { @@ -125,19 +137,19 @@ static void dump_flag_info(struct pg_state *st, const struct flag_info val = pte & flag->val; if (flag->shift) val = val >> flag->shift; - seq_printf(st->seq, " %s:%llx", flag->set, val); + pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val); } else { if ((pte & flag->mask) == flag->val) s = flag->set; else s = flag->clear; if (s) - seq_printf(st->seq, " %s", s); + pt_dump_seq_printf(st->seq, " %s", s); } st->current_flags &= ~flag->mask; } if (st->current_flags != 0) - seq_printf(st->seq, " unknown flags:%llx", st->current_flags); + pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags); } static void dump_addr(struct pg_state *st, unsigned long addr) @@ -152,12 +164,12 @@ static void dump_addr(struct pg_state *st, unsigned long addr) #define REG "0x%08lx" #endif - seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); + pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) { - seq_printf(st->seq, "[" REG "]", st->start_pa); + pt_dump_seq_printf(st->seq, "[" REG "]", st->start_pa); delta = PAGE_SIZE >> 10; } else { - seq_printf(st->seq, " " REG " ", st->start_pa); + pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); delta = (addr - st->start_address) >> 10; } /* Work out what appropriate unit to use */ @@ -165,7 +177,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr) delta >>= 10; unit++; } - seq_printf(st->seq, "%9lu%c", delta, *unit); + pt_dump_seq_printf(st->seq, "%9lu%c", delta, *unit); } @@ -182,7 +194,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->start_address = addr; st->start_pa = pa; st->last_pa = pa; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); /* * Dump the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -206,7 +218,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->current_flags, pg_level[st->level].num); - seq_putc(st->seq, '\n'); + pt_dump_seq_putc(st->seq, '\n'); } /* @@ -215,7 +227,7 @@ static void note_page(struct pg_state *st, unsigned long addr, */ while (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; st->start_pa = pa; -- cgit v1.2.3-59-g8ed1b From 453d87f6a8aed827f5ebb1708a4cea458fd68d23 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 2 May 2019 17:39:47 +1000 Subject: powerpc/mm: Warn if W+X pages found on boot Implement code to walk all pages and warn if any are found to be both writable and executable. 
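Condensed from the ptdump changes in this and the preceding patch, the entry point amounts to the following sketch (eliding the radix/hash start-address selection):

void ptdump_check_wx(void)	/* called at the end of mark_rodata_ro() */
{
	struct pg_state st = {
		.seq = NULL,		/* silent walk, via the wrappers above */
		.marker = address_markers,
		.check_wx = true,
	};

	walk_pagetables(&st);	/* note_page() tallies W+X ranges into st.wx_pages */

	if (st.wx_pages)
		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
			st.wx_pages);
	else
		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
}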
Depends on STRICT_KERNEL_RWX enabled, and is behind the DEBUG_WX config option. This only runs on boot and has no runtime performance implications. Very heavily influenced (and in some cases copied verbatim) from the ARM64 code written by Laura Abbott (thanks!), since our ptdump infrastructure is similar. Signed-off-by: Russell Currey [mpe: Fixup build error when disabled] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 19 +++++++++++++++++ arch/powerpc/include/asm/pgtable.h | 6 ++++++ arch/powerpc/mm/pgtable_32.c | 3 +++ arch/powerpc/mm/pgtable_64.c | 3 +++ arch/powerpc/mm/ptdump/ptdump.c | 43 +++++++++++++++++++++++++++++++++++++- 5 files changed, 73 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 61febbbdd02b..e9ae650c8e93 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -361,6 +361,25 @@ config PPC_PTDUMP If you are unsure, say N. +config PPC_DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on PPC_PTDUMP + help + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index c51846da41a7..3f53be60fb01 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -105,6 +105,12 @@ void mark_initmem_nx(void); static inline void mark_initmem_nx(void) { } #endif +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void); +#else +static inline void ptdump_check_wx(void) { } +#endif + /* * When used, PTE_FRAG_NR is defined in subarch pgtable.h * so we are sure it is included when arriving here. 
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 2e67f9a1430b..16ada373b32b 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -396,6 +396,9 @@ void mark_rodata_ro(void) PFN_DOWN((unsigned long)__start_rodata); change_page_attr(page, numpages, PAGE_KERNEL_RO); + + // mark_initmem_nx() should have already run by now + ptdump_check_wx(); } #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 4c6a73782b19..d2d976ff8a0e 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -332,6 +332,9 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); + + // mark_initmem_nx() should have already run by now + ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index e249a56c07bf..646876d9da64 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -31,7 +31,7 @@ #include "ptdump.h" #ifdef CONFIG_PPC32 -#define KERN_VIRT_START 0 +#define KERN_VIRT_START PAGE_OFFSET #endif /* @@ -68,6 +68,8 @@ struct pg_state { unsigned long last_pa; unsigned int level; u64 current_flags; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -181,6 +183,20 @@ static void dump_addr(struct pg_state *st, unsigned long addr) } +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + + if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X))) + return; + + WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + static void note_page(struct pg_state *st, unsigned long addr, unsigned int level, u64 val) { @@ -210,6 +226,7 @@ static void note_page(struct pg_state *st, unsigned long addr, /* Check the PTE flags */ if (st->current_flags) { + note_prot_wx(st, addr); dump_addr(st, addr); /* Dump all the flags */ @@ -387,6 +404,30 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = address_markers, + .check_wx = true, + }; + + if (radix_enabled()) + st.start_address = PAGE_OFFSET; + else + st.start_address = KERN_VIRT_START; + + walk_pagetables(&st); + + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", + st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} +#endif + static int ptdump_init(void) { struct dentry *debugfs_file; -- cgit v1.2.3-59-g8ed1b From 398af571128fe75f07343f929975b26d57eafd18 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 9 Apr 2019 23:14:20 +1000 Subject: powerpc/security: Show powerpc_security_features in debugfs This can be helpful for debugging problems with the security feature flags, especially on guests where the flags come from the hypervisor via an hcall and so can't be observed in the device tree. 
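Once booted, the mask can be read straight out of debugfs, e.g. with cat /sys/kernel/debug/powerpc/security_features (root only, given the 0400 mode); the value printed is the raw powerpc_security_features bitmask in hex.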
Signed-off-by: Michael Ellerman Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/security.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index b33bafb8fcea..d6ba696d0ed0 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -104,6 +104,14 @@ static __init int barrier_nospec_debugfs_init(void) return 0; } device_initcall(barrier_nospec_debugfs_init); + +static __init int security_feature_debugfs_init(void) +{ + debugfs_create_x64("security_features", 0400, powerpc_debugfs_root, + (u64 *)&powerpc_security_features); + return 0; +} +device_initcall(security_feature_debugfs_init); #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_PPC_FSL_BOOK3E -- cgit v1.2.3-59-g8ed1b From d7fbe2a0439ce6f20917a65990a78c9e747aad34 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Apr 2019 09:08:38 +0000 Subject: powerpc/prom_init: get rid of PROM_SCRATCH_SIZE PROM_SCRATCH_SIZE is same as sizeof(prom_scratch) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index d3b0d543d924..523bb99d7676 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -154,10 +154,8 @@ static struct prom_t __prombss prom; static unsigned long __prombss prom_entry; -#define PROM_SCRATCH_SIZE 256 - static char __prombss of_stdout_device[256]; -static char __prombss prom_scratch[PROM_SCRATCH_SIZE]; +static char __prombss prom_scratch[256]; static unsigned long __prombss dt_header_start; static unsigned long __prombss dt_struct_start, dt_struct_end; @@ -1619,8 +1617,8 @@ static void __init prom_init_mem(void) endp = p + (plen / sizeof(cell_t)); #ifdef DEBUG_PROM - memset(path, 0, PROM_SCRATCH_SIZE); - call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1); + memset(path, 0, sizeof(prom_scratch)); + call_prom("package-to-path", 3, 1, node, path, sizeof(prom_scratch) - 1); prom_debug(" node %s :\n", path); #endif /* DEBUG_PROM */ @@ -1928,10 +1926,10 @@ static void __init prom_initialize_tce_table(void) local_alloc_bottom = base; /* It seems OF doesn't null-terminate the path :-( */ - memset(path, 0, PROM_SCRATCH_SIZE); + memset(path, 0, sizeof(prom_scratch)); /* Call OF to setup the TCE hardware */ if (call_prom("package-to-path", 3, 1, node, - path, PROM_SCRATCH_SIZE-1) == PROM_ERROR) { + path, sizeof(prom_scratch) - 1) == PROM_ERROR) { prom_printf("package-to-path failed\n"); } @@ -2292,14 +2290,14 @@ static void __init prom_check_displays(void) /* It seems OF doesn't null-terminate the path :-( */ path = prom_scratch; - memset(path, 0, PROM_SCRATCH_SIZE); + memset(path, 0, sizeof(prom_scratch)); /* * leave some room at the end of the path for appending extra * arguments */ if (call_prom("package-to-path", 3, 1, node, path, - PROM_SCRATCH_SIZE-10) == PROM_ERROR) + sizeof(prom_scratch) - 10) == PROM_ERROR) continue; prom_printf("found display : %s, opening... 
", path); @@ -2495,8 +2493,8 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, /* get it again for debugging */ path = prom_scratch; - memset(path, 0, PROM_SCRATCH_SIZE); - call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1); + memset(path, 0, sizeof(prom_scratch)); + call_prom("package-to-path", 3, 1, node, path, sizeof(prom_scratch) - 1); /* get and store all properties */ prev_name = ""; -- cgit v1.2.3-59-g8ed1b From 32eebf96669568014b79b83a03f7895f3ec8c95f Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Wed, 20 Mar 2019 14:55:16 +0200 Subject: powerpc/dts/fsl: add crypto node alias for B4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit crypto node alias is needed by U-boot to identify the node and perform fix-ups, like adding "fsl,sec-era" property. Signed-off-by: Horia Geantă Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/fsl/b4qds.dtsi | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/boot/dts/fsl/b4qds.dtsi b/arch/powerpc/boot/dts/fsl/b4qds.dtsi index 999efd3bc167..05be919f3545 100644 --- a/arch/powerpc/boot/dts/fsl/b4qds.dtsi +++ b/arch/powerpc/boot/dts/fsl/b4qds.dtsi @@ -40,6 +40,7 @@ interrupt-parent = <&mpic>; aliases { + crypto = &crypto; phy_sgmii_10 = &phy_sgmii_10; phy_sgmii_11 = &phy_sgmii_11; phy_sgmii_1c = &phy_sgmii_1c; -- cgit v1.2.3-59-g8ed1b From 90437bffa5f9b1440ba03e023f4875d1814b9360 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 11 Mar 2019 22:47:46 +0000 Subject: powerpc/entry: Remove unneeded need_resched() loop Since the enabling and disabling of IRQs within preempt_schedule_irq() is contained in a need_resched() loop, we don't need the outer arch code loop. Signed-off-by: Valentin Schneider [mpe: Rebase since CURRENT_THREAD_INFO() removal] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 5 +---- arch/powerpc/kernel/entry_64.S | 8 +------- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 235a01d34b6d..c18f3490a77e 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -906,10 +906,7 @@ resume_kernel: */ bl trace_hardirqs_off #endif -1: bl preempt_schedule_irq - lwz r3,TI_FLAGS(r2) - andi. r0,r3,_TIF_NEED_RESCHED - bne- 1b + bl preempt_schedule_irq #ifdef CONFIG_TRACE_IRQFLAGS /* And now, to properly rebalance the above, we tell lockdep they * are being turned back on, which will happen when we return diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 7cc25389c6bd..d978af78bf2a 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -865,13 +865,7 @@ resume_kernel: * sure we are soft-disabled first and reconcile irq state. */ RECONCILE_IRQ_STATE(r3,r4) -1: bl preempt_schedule_irq - - /* Re-test flags and eventually loop */ - ld r9, PACA_THREAD_INFO(r13) - ld r4,TI_FLAGS(r9) - andi. 
r0,r4,_TIF_NEED_RESCHED - bne 1b + bl preempt_schedule_irq /* * arch_local_irq_restore() from preempt_schedule_irq above may -- cgit v1.2.3-59-g8ed1b From 5d085ec04a000fefb5182d3b03ee46ca96d8389b Mon Sep 17 00:00:00 2001 From: Bo YU Date: Tue, 30 Oct 2018 09:21:55 -0400 Subject: powerpc/boot: Fix missing check of lseek() return value This is detected by Coverity scan: CID: 1440481 Signed-off-by: Bo YU Signed-off-by: Michael Ellerman --- arch/powerpc/boot/addnote.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/boot/addnote.c b/arch/powerpc/boot/addnote.c index 9d9f6f334d3c..3da3e2b1b51b 100644 --- a/arch/powerpc/boot/addnote.c +++ b/arch/powerpc/boot/addnote.c @@ -223,7 +223,11 @@ main(int ac, char **av) PUT_16(E_PHNUM, np + 2); /* write back */ - lseek(fd, (long) 0, SEEK_SET); + i = lseek(fd, (long) 0, SEEK_SET); + if (i < 0) { + perror("lseek"); + exit(1); + } i = write(fd, buf, n); if (i < 0) { perror("write"); -- cgit v1.2.3-59-g8ed1b From 0acb5f64560a052fd66ab37b212a72964847160f Mon Sep 17 00:00:00 2001 From: "Christopher M. Riedl" Date: Mon, 15 Apr 2019 22:26:38 -0500 Subject: powerpc/xmon: add read-only mode Operations which write to memory and special purpose registers should be restricted on systems with integrity guarantees (such as Secure Boot) and, optionally, to avoid self-destructive behaviors. Add a config option, XMON_DEFAULT_RO_MODE, to set default xmon behavior. The kernel cmdline options xmon=ro and xmon=rw override this default. The following xmon operations are affected:
  memops:
    disable memmove
    disable memset
    disable memzcan
  memex:
    no-op'd mwrite
  super_regs:
    no-op'd write_spr
  bpt_cmds:
    disable
  proc_call:
    disable
Signed-off-by: Christopher M. Riedl Reviewed-by: Oliver O'Halloran Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 8 ++++++++ arch/powerpc/xmon/xmon.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index e9ae650c8e93..c59920920ddc 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -117,6 +117,14 @@ config XMON_DISASSEMBLY to say Y here, unless you're building for a memory-constrained system. +config XMON_DEFAULT_RO_MODE + bool "Restrict xmon to read-only operations by default" + depends on XMON + default y + help + Operate xmon in read-only mode. The cmdline options 'xmon=rw' and + 'xmon=ro' override this default.
+ config DEBUGGER bool depends on KGDB || XMON diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e583ed3f6b93..3e7be19aa208 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -80,6 +80,7 @@ static int set_indicator_token = RTAS_UNKNOWN_SERVICE; #endif static unsigned long in_xmon __read_mostly = 0; static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT); +static bool xmon_is_ro = IS_ENABLED(CONFIG_XMON_DEFAULT_RO_MODE); static unsigned long adrs; static int size = 1; @@ -202,6 +203,8 @@ static void dump_tlb_book3e(void); #define GETWORD(v) (((v)[0] << 24) + ((v)[1] << 16) + ((v)[2] << 8) + (v)[3]) #endif +static const char *xmon_ro_msg = "Operation disabled: xmon in read-only mode\n"; + static char *help_string = "\ Commands:\n\ b show breakpoints\n\ @@ -989,6 +992,10 @@ cmds(struct pt_regs *excp) memlocate(); break; case 'z': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } memzcan(); break; case 'i': @@ -1042,6 +1049,10 @@ cmds(struct pt_regs *excp) set_lpp_cmd(); break; case 'b': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } bpt_cmds(); break; case 'C': @@ -1055,6 +1066,10 @@ cmds(struct pt_regs *excp) bootcmds(); break; case 'p': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } proccall(); break; case 'P': @@ -1777,6 +1792,11 @@ read_spr(int n, unsigned long *vp) static void write_spr(int n, unsigned long val) { + if (xmon_is_ro) { + printf(xmon_ro_msg); + return; + } + if (setjmp(bus_error_jmp) == 0) { catch_spr_faults = 1; sync(); @@ -2016,6 +2036,12 @@ mwrite(unsigned long adrs, void *buf, int size) char *p, *q; n = 0; + + if (xmon_is_ro) { + printf(xmon_ro_msg); + return n; + } + if (setjmp(bus_error_jmp) == 0) { catch_memory_errors = 1; sync(); @@ -2880,9 +2906,17 @@ memops(int cmd) scanhex((void *)&mcount); switch( cmd ){ case 'm': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } memmove((void *)mdest, (void *)msrc, mcount); break; case 's': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } memset((void *)mdest, mval, mcount); break; case 'd': @@ -3792,6 +3826,14 @@ static int __init early_parse_xmon(char *p) } else if (strncmp(p, "on", 2) == 0) { xmon_init(1); xmon_on = 1; + } else if (strncmp(p, "rw", 2) == 0) { + xmon_init(1); + xmon_on = 1; + xmon_is_ro = false; + } else if (strncmp(p, "ro", 2) == 0) { + xmon_init(1); + xmon_on = 1; + xmon_is_ro = true; } else if (strncmp(p, "off", 3) == 0) xmon_on = 0; else -- cgit v1.2.3-59-g8ed1b From de269129a48a2d590ba1d20c719e19d86e3ddb3f Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 5 Mar 2019 01:12:19 +0530 Subject: powerpc/hmi: Fix kernel hang when TB is in error state. On TOD/TB errors the timebase register stops/freezes until HMI error recovery gets TOD/TB back into running state. On successful recovery, TB starts running again and udelay(), which relies on the TB value, continues to function properly. But when HMI recovery fails for TOD/TB errors, the TB register stays frozen. With TB not running, the __delay() function keeps looping and never returns. If __delay() is called while in the panic path, the system hangs and never reboots after panic.
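The shape of the fix, condensed from the diffs that follow: OPAL_HANDLE_HMI2 reports recovery status through an out_flags mask, a failed TOD/TB recovery is latched into a new tb_invalid flag, and __delay() bails out instead of spinning on a frozen timebase:

	/* in __delay(), new branch ahead of the timebase polling loop */
	} else if (tb_invalid) {
		/*
		 * TB is in error state and isn't ticking anymore.
		 * HMI handler was unable to recover from TB error.
		 * Return immediately, so that kernel won't get stuck here.
		 */
		spin_cpu_relax();
	}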
Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 10 ++++++++++ arch/powerpc/include/asm/opal.h | 2 ++ arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c | 9 +++++++++ arch/powerpc/platforms/powernv/opal-call.c | 1 + arch/powerpc/platforms/powernv/opal.c | 21 +++++++++++++++++++++ arch/powerpc/platforms/powernv/setup.c | 5 ++++- 7 files changed, 49 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index e1d118ac61dc..234fde15b37c 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -209,6 +209,7 @@ #define OPAL_SENSOR_GROUP_ENABLE 163 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 +#define OPAL_HANDLE_HMI2 166 #define OPAL_NX_COPROC_INIT 167 #define OPAL_XIVE_GET_VP_STATE 170 #define OPAL_LAST 170 @@ -635,6 +636,15 @@ struct OpalHMIEvent { } u; }; +/* OPAL_HANDLE_HMI2 out_flags */ +enum { + OPAL_HMI_FLAGS_TB_RESYNC = (1ull << 0), /* Timebase has been resynced */ + OPAL_HMI_FLAGS_DEC_LOST = (1ull << 1), /* DEC lost, needs to be reprogrammed */ + OPAL_HMI_FLAGS_HDEC_LOST = (1ull << 2), /* HDEC lost, needs to be reprogrammed */ + OPAL_HMI_FLAGS_TOD_TB_FAIL = (1ull << 3), /* TOD/TB recovery failed. */ + OPAL_HMI_FLAGS_NEW_EVENT = (1ull << 63), /* An event has been created */ +}; + enum { OPAL_P7IOC_DIAG_TYPE_NONE = 0, OPAL_P7IOC_DIAG_TYPE_RGC = 1, diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 4e978d4dea5c..4cc37e708bc7 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -203,6 +203,7 @@ int64_t opal_set_param(uint64_t token, uint32_t param_id, uint64_t buffer, int64_t opal_sensor_read(uint32_t sensor_hndl, int token, __be32 *sensor_data); int64_t opal_sensor_read_u64(u32 sensor_hndl, int token, __be64 *sensor_data); int64_t opal_handle_hmi(void); +int64_t opal_handle_hmi2(__be64 *out_flags); int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end); int64_t opal_unregister_dump_region(uint32_t id); int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val); @@ -359,6 +360,7 @@ int opal_power_control_init(void); extern int opal_machine_check(struct pt_regs *regs); extern bool opal_mce_check_early_recovery(struct pt_regs *regs); extern int opal_hmi_exception_early(struct pt_regs *regs); +extern int opal_hmi_exception_early2(struct pt_regs *regs); extern int opal_handle_hmi_exception(struct pt_regs *regs); extern void opal_shutdown(void); diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 54bf7e68a7e1..57e968413d1e 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -36,6 +36,8 @@ extern unsigned long ppc_proc_freq; extern unsigned long ppc_tb_freq; #define DEFAULT_TB_FREQ 125000000UL +extern bool tb_invalid; + struct div_result { u64 result_high; u64 result_low; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 6ef32472ee1d..325d60633dfa 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -150,6 +150,8 @@ EXPORT_SYMBOL_GPL(ppc_proc_freq); unsigned long ppc_tb_freq; EXPORT_SYMBOL_GPL(ppc_tb_freq); +bool tb_invalid; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Factor for converting from cputime_t (timebase ticks) to @@ -459,6 +461,13 @@ void __delay(unsigned long loops) diff += 1000000000; spin_cpu_relax(); } while (diff < loops); + } else if 
(tb_invalid) { + /* + * TB is in error state and isn't ticking anymore. + * HMI handler was unable to recover from TB error. + * Return immediately, so that kernel won't get stuck here. + */ + spin_cpu_relax(); } else { start = get_tbl(); while (get_tbl() - start < loops) diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 7cba0d5da3ff..36c8fa3647a2 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -220,6 +220,7 @@ OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); OPAL_CALL(opal_get_param, OPAL_GET_PARAM); OPAL_CALL(opal_set_param, OPAL_SET_PARAM); OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_handle_hmi2, OPAL_HANDLE_HMI2); OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 737c51d63480..f2b063b027f0 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -614,6 +614,27 @@ int opal_hmi_exception_early(struct pt_regs *regs) return 0; } +int opal_hmi_exception_early2(struct pt_regs *regs) +{ + s64 rc; + __be64 out_flags; + + /* + * call opal hmi handler. + * Check 64-bit flag mask to find out if an event was generated, + * and whether TB is still valid or not etc. + */ + rc = opal_handle_hmi2(&out_flags); + if (rc != OPAL_SUCCESS) + return 0; + + if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_NEW_EVENT) + local_paca->hmi_event_available = 1; + if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_TOD_TB_FAIL) + tb_invalid = true; + return 1; +} + /* HMI exception handler called in virtual mode during check_irq_replay. */ int opal_handle_hmi_exception(struct pt_regs *regs) { diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 14befee4b3f1..3cf40f689aac 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -401,7 +401,10 @@ static void __init pnv_setup_machdep_opal(void) /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */ ppc_md.machine_check_exception = opal_machine_check; ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; - ppc_md.hmi_exception_early = opal_hmi_exception_early; + if (opal_check_token(OPAL_HANDLE_HMI2)) + ppc_md.hmi_exception_early = opal_hmi_exception_early2; + else + ppc_md.hmi_exception_early = opal_hmi_exception_early; ppc_md.handle_hmi_exception = opal_handle_hmi_exception; } -- cgit v1.2.3-59-g8ed1b From e1619e89c96c596d72e66f15a0794f5001c8576e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 3 Apr 2019 11:19:26 +1030 Subject: powerpc/configs: Add (back) MLX5 ethernet support to skiroot_defconfig It turns out that some defconfig changes and kernel config option changes meant we accidentally dropped Ethernet support for Mellanox CLX5 cards. 
Fixes: cbc39809a398 ("powerpc/configs: Update skiroot defconfig") Reported-by: Carol L Soto Suggested-by: Carol L Soto Signed-off-by: Stewart Smith Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/configs/skiroot_defconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 5ba131c30f6b..6038b9347d9e 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -163,6 +163,8 @@ CONFIG_S2IO=m CONFIG_MLX4_EN=m # CONFIG_MLX4_CORE_GEN2 is not set CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +# CONFIG_MLX5_EN_RXNFC is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROSEMI is not set CONFIG_MYRI10GE=m -- cgit v1.2.3-59-g8ed1b From 1e496391a8452101308a23b7395cdd4983b6e5bd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 30 Mar 2017 03:19:25 -0700 Subject: powerpc/powernv/ioda2: Add __printf format/argument verification Fix fallout too. Signed-off-by: Joe Perches Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 31 ++++++++++++++++--------------- arch/powerpc/platforms/powernv/pci.h | 2 ++ 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9a9076a5686c..126602b4e399 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -847,11 +847,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); if (rc) - pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); + pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc); rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, bcomp, dcomp, fcomp, OPAL_UNMAP_PE); if (rc) - pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc); pe->pbus = NULL; pe->pdev = NULL; @@ -1174,11 +1174,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pe->rid = bus->busn_res.start << 8; if (all) - pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n", - bus->busn_res.start, bus->busn_res.end, pe->pe_number); + pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n", + &bus->busn_res.start, &bus->busn_res.end, + pe->pe_number); else - pe_info(pe, "Secondary bus %d associated with PE#%x\n", - bus->busn_res.start, pe->pe_number); + pe_info(pe, "Secondary bus %pad associated with PE#%x\n", + &bus->busn_res.start, pe->pe_number); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? 
*/ @@ -1448,7 +1449,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe tbl = pe->table_group.tables[0]; rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) - pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + pe_warn(pe, "OPAL error %lld release DMA window\n", rc); pnv_pci_ioda2_set_bypass(pe, false); if (pe->table_group.group) { @@ -2286,8 +2287,8 @@ found: __pa(addr) + tce32_segsz * i, tce32_segsz, IOMMU_PAGE_SIZE_4K); if (rc) { - pe_err(pe, " Failed to configure 32-bit TCE table," - " err %ld\n", rc); + pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n", + rc); goto fail; } } @@ -2332,9 +2333,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; const __u64 win_size = tbl->it_size << tbl->it_page_shift; - pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num, - start_addr, start_addr + win_size - 1, - IOMMU_PAGE_SIZE(tbl)); + pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n", + num, start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); /* * Map TCE table through TVT. The TVE index is the PE number @@ -2348,7 +2349,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, size << 3, IOMMU_PAGE_SIZE(tbl)); if (rc) { - pe_err(pe, "Failed to configure TCE table, err %ld\n", rc); + pe_err(pe, "Failed to configure TCE table, err %lld\n", rc); return rc; } @@ -3450,7 +3451,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) #ifdef CONFIG_IOMMU_API rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) - pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + pe_warn(pe, "OPAL error %lld release DMA window\n", rc); #endif pnv_pci_ioda2_set_bypass(pe, false); @@ -3484,7 +3485,7 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, phb->ioda.reserved_pe_idx, win, 0, idx); if (rc != OPAL_SUCCESS) - pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n", + pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n", rc, win, idx); map[idx] = IODA_INVALID_PE; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 8e36da379252..be26ab3d99e0 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -2,6 +2,7 @@ #ifndef __POWERNV_PCI_H #define __POWERNV_PCI_H +#include /* for __printf */ #include #include #include @@ -204,6 +205,7 @@ extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels); extern int pnv_eeh_post_init(void); +__printf(3, 4) extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); #define pe_err(pe, fmt, ...) \ -- cgit v1.2.3-59-g8ed1b From 708597daf23486ea6f889ca29cc88389ca9a409a Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 4 Apr 2019 17:24:49 +0530 Subject: powerpc/perf: init pmu from core-book3s Currently, the pmu driver file for each ppc64 generation processor contains its own __init call. Refactor the code by moving the __init call to core-book3s.c. This also cleans up compat mode pmu driver registration.
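Condensed from the diff below, the consolidated entry point simply tries each generation's init routine until one succeeds:

static int __init init_ppc64_pmu(void)
{
	/* run through all the pmu drivers one at a time */
	if (!init_power5_pmu())
		return 0;
	else if (!init_power5p_pmu())
		return 0;
	/* ... likewise power6/7/8/9 and ppc970 ... */
	else
		return -ENODEV;
}
early_initcall(init_ppc64_pmu);

Each init_*_pmu() keeps its oprofile_cpu_type check, so at most one driver registers.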
Suggested-by: Michael Ellerman Signed-off-by: Madhavan Srinivasan [mpe: Use SPDX tag for license] Signed-off-by: Michael Ellerman --- arch/powerpc/perf/core-book3s.c | 28 ++++++++++++++++++++++++++++ arch/powerpc/perf/internal.h | 11 +++++++++++ arch/powerpc/perf/power5+-pmu.c | 4 +--- arch/powerpc/perf/power5-pmu.c | 4 +--- arch/powerpc/perf/power6-pmu.c | 4 +--- arch/powerpc/perf/power7-pmu.c | 4 +--- arch/powerpc/perf/power8-pmu.c | 3 +-- arch/powerpc/perf/power9-pmu.c | 3 +-- arch/powerpc/perf/ppc970-pmu.c | 4 +--- 9 files changed, 46 insertions(+), 19 deletions(-) create mode 100644 arch/powerpc/perf/internal.h (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index b0723002a396..a96f9420139c 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -22,6 +22,10 @@ #include #include +#ifdef CONFIG_PPC64 +#include "internal.h" +#endif + #define BHRB_MAX_ENTRIES 32 #define BHRB_TARGET 0x0000000000000002 #define BHRB_PREDICTION 0x0000000000000001 @@ -2294,3 +2298,27 @@ int register_power_pmu(struct power_pmu *pmu) power_pmu_prepare_cpu, NULL); return 0; } + +#ifdef CONFIG_PPC64 +static int __init init_ppc64_pmu(void) +{ + /* run through all the pmu drivers one at a time */ + if (!init_power5_pmu()) + return 0; + else if (!init_power5p_pmu()) + return 0; + else if (!init_power6_pmu()) + return 0; + else if (!init_power7_pmu()) + return 0; + else if (!init_power8_pmu()) + return 0; + else if (!init_power9_pmu()) + return 0; + else if (!init_ppc970_pmu()) + return 0; + else + return -ENODEV; +} +early_initcall(init_ppc64_pmu); +#endif diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h new file mode 100644 index 000000000000..683f48117132 --- /dev/null +++ b/arch/powerpc/perf/internal.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright 2019 Madhavan Srinivasan, IBM Corporation. 
+ +extern int init_ppc970_pmu(void); +extern int init_power5_pmu(void); +extern int init_power5p_pmu(void); +extern int init_power6_pmu(void); +extern int init_power7_pmu(void); +extern int init_power8_pmu(void); +extern int init_power9_pmu(void); diff --git a/arch/powerpc/perf/power5+-pmu.c b/arch/powerpc/perf/power5+-pmu.c index 0526dac66007..9aa803504cb2 100644 --- a/arch/powerpc/perf/power5+-pmu.c +++ b/arch/powerpc/perf/power5+-pmu.c @@ -677,7 +677,7 @@ static struct power_pmu power5p_pmu = { .cache_events = &power5p_cache_events, }; -static int __init init_power5p_pmu(void) +int init_power5p_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+") @@ -686,5 +686,3 @@ static int __init init_power5p_pmu(void) return register_power_pmu(&power5p_pmu); } - -early_initcall(init_power5p_pmu); diff --git a/arch/powerpc/perf/power5-pmu.c b/arch/powerpc/perf/power5-pmu.c index 4dc99f9f7962..30cb13d081a9 100644 --- a/arch/powerpc/perf/power5-pmu.c +++ b/arch/powerpc/perf/power5-pmu.c @@ -618,7 +618,7 @@ static struct power_pmu power5_pmu = { .flags = PPMU_HAS_SSLOT, }; -static int __init init_power5_pmu(void) +int init_power5_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5")) @@ -626,5 +626,3 @@ static int __init init_power5_pmu(void) return register_power_pmu(&power5_pmu); } - -early_initcall(init_power5_pmu); diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c index 9c9d646b68a1..80ec48632cfe 100644 --- a/arch/powerpc/perf/power6-pmu.c +++ b/arch/powerpc/perf/power6-pmu.c @@ -540,7 +540,7 @@ static struct power_pmu power6_pmu = { .cache_events = &power6_cache_events, }; -static int __init init_power6_pmu(void) +int init_power6_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6")) @@ -548,5 +548,3 @@ static int __init init_power6_pmu(void) return register_power_pmu(&power6_pmu); } - -early_initcall(init_power6_pmu); diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c index 6dbae9884ec4..bb6efd5d2530 100644 --- a/arch/powerpc/perf/power7-pmu.c +++ b/arch/powerpc/perf/power7-pmu.c @@ -445,7 +445,7 @@ static struct power_pmu power7_pmu = { .cache_events = &power7_cache_events, }; -static int __init init_power7_pmu(void) +int init_power7_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7")) @@ -456,5 +456,3 @@ static int __init init_power7_pmu(void) return register_power_pmu(&power7_pmu); } - -early_initcall(init_power7_pmu); diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index d12a2db26353..bcc3409a06de 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -379,7 +379,7 @@ static struct power_pmu power8_pmu = { .bhrb_nr = 32, }; -static int __init init_power8_pmu(void) +int init_power8_pmu(void) { int rc; @@ -399,4 +399,3 @@ static int __init init_power8_pmu(void) return 0; } -early_initcall(init_power8_pmu); diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 030544e35959..3a31ac6f4805 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -437,7 +437,7 @@ static struct power_pmu power9_pmu = { .bhrb_nr = 32, }; -static int __init init_power9_pmu(void) +int init_power9_pmu(void) { int rc = 0; unsigned int pvr = mfspr(SPRN_PVR); @@ -467,4 +467,3 @@ static int __init init_power9_pmu(void) return 0; } 
-early_initcall(init_power9_pmu); diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c index 8b6a8a36fa38..1d3370914022 100644 --- a/arch/powerpc/perf/ppc970-pmu.c +++ b/arch/powerpc/perf/ppc970-pmu.c @@ -490,7 +490,7 @@ static struct power_pmu ppc970_pmu = { .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; -static int __init init_ppc970_pmu(void) +int init_ppc970_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970") @@ -499,5 +499,3 @@ static int __init init_ppc970_pmu(void) return register_power_pmu(&ppc970_pmu); } - -early_initcall(init_ppc970_pmu); -- cgit v1.2.3-59-g8ed1b From be80e758d0c2ec87eceac7676f08c761b4235869 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 4 Apr 2019 17:24:50 +0530 Subject: powerpc/perf: Add generic compat mode pmu driver Most of the power processor generation performance monitoring unit (PMU) driver code is bundled in the kernel, and one of those drivers is enabled/registered based on the oprofile_cpu_type check at boot. But things get a little tricky in the case of a "compat" mode boot. IBM POWER System Server based processors have a compatibility mode feature which, simply put, means that an Nth generation processor (let's say POWER8) will act and appear in a mode consistent with an earlier generation (N-1) processor (that is, POWER7). And in this "compat" mode boot, the kernel modifies the "oprofile_cpu_type" to be the Nth generation (POWER8). If the Nth generation pmu driver is bundled (POWER8), it gets registered. The key dependency here is having distro support for the latest processor's performance monitoring. This patch adds a generic "compat-mode" performance monitoring driver to be registered in the absence of a powernv platform specific pmu driver. The driver supports only "cycles" and "instruction" events: "0x0001e" is used as the event code for "cycles" and "0x00002" as the event code for "instruction" events. A new file called "generic-compat-pmu.c" is created to contain the driver specific code, and the base raw event code format is modeled on PPMU_ARCH_207S.
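Once this driver is registered, the PMU behaves like any other from userspace; for example (an illustrative invocation, not output from this patch), perf stat -e cycles -e instructions -- sleep 1 should work in compat mode, and the two supported raw codes can be requested directly as r1e and r2.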
Signed-off-by: Madhavan Srinivasan [mpe: Use SPDX tag for license] Signed-off-by: Michael Ellerman --- arch/powerpc/perf/Makefile | 3 +- arch/powerpc/perf/core-book3s.c | 2 +- arch/powerpc/perf/generic-compat-pmu.c | 234 +++++++++++++++++++++++++++++++++ arch/powerpc/perf/internal.h | 1 + 4 files changed, 238 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/perf/generic-compat-pmu.c (limited to 'arch') diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index ab26df5bacb9..c155dcbb8691 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -5,7 +5,8 @@ obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o obj64-$(CONFIG_PPC_PERF_CTRS) += ppc970-pmu.o power5-pmu.o \ power5+-pmu.o power6-pmu.o power7-pmu.o \ - isa207-common.o power8-pmu.o power9-pmu.o + isa207-common.o power8-pmu.o power9-pmu.o \ + generic-compat-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a96f9420139c..a66fb9c01c9e 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2318,7 +2318,7 @@ static int __init init_ppc64_pmu(void) else if (!init_ppc970_pmu()) return 0; else - return -ENODEV; + return init_generic_compat_pmu(); } early_initcall(init_ppc64_pmu); #endif diff --git a/arch/powerpc/perf/generic-compat-pmu.c b/arch/powerpc/perf/generic-compat-pmu.c new file mode 100644 index 000000000000..5e5a54d5588e --- /dev/null +++ b/arch/powerpc/perf/generic-compat-pmu.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright 2019 Madhavan Srinivasan, IBM Corporation. + +#define pr_fmt(fmt) "generic-compat-pmu: " fmt + +#include "isa207-common.h" + +/* + * Raw event encoding: + * + * 60 56 52 48 44 40 36 32 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * + * 28 24 20 16 12 8 4 0 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ pmc ] [unit ] [ ] m [ pmcxsel ] + * | | + * | *- mark + * | + * | + * *- combine + * + * Below uses IBM bit numbering. + * + * MMCR1[x:y] = unit (PMCxUNIT) + * MMCR1[24] = pmc1combine[0] + * MMCR1[25] = pmc1combine[1] + * MMCR1[26] = pmc2combine[0] + * MMCR1[27] = pmc2combine[1] + * MMCR1[28] = pmc3combine[0] + * MMCR1[29] = pmc3combine[1] + * MMCR1[30] = pmc4combine[0] + * MMCR1[31] = pmc4combine[1] + * + */ + +/* + * Some power9 event codes. 
+ */ +#define EVENT(_name, _code) _name = _code, + +enum { +EVENT(PM_CYC, 0x0001e) +EVENT(PM_INST_CMPL, 0x00002) +}; + +#undef EVENT + +GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC); +GENERIC_EVENT_ATTR(instructions, PM_INST_CMPL); + +static struct attribute *generic_compat_events_attr[] = { + GENERIC_EVENT_PTR(PM_CYC), + GENERIC_EVENT_PTR(PM_INST_CMPL), + NULL +}; + +static struct attribute_group generic_compat_pmu_events_group = { + .name = "events", + .attrs = generic_compat_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-19"); +PMU_FORMAT_ATTR(pmcxsel, "config:0-7"); +PMU_FORMAT_ATTR(mark, "config:8"); +PMU_FORMAT_ATTR(combine, "config:10-11"); +PMU_FORMAT_ATTR(unit, "config:12-15"); +PMU_FORMAT_ATTR(pmc, "config:16-19"); + +static struct attribute *generic_compat_pmu_format_attr[] = { + &format_attr_event.attr, + &format_attr_pmcxsel.attr, + &format_attr_mark.attr, + &format_attr_combine.attr, + &format_attr_unit.attr, + &format_attr_pmc.attr, + NULL, +}; + +static struct attribute_group generic_compat_pmu_format_group = { + .name = "format", + .attrs = generic_compat_pmu_format_attr, +}; + +static const struct attribute_group *generic_compat_pmu_attr_groups[] = { + &generic_compat_pmu_format_group, + &generic_compat_pmu_events_group, + NULL, +}; + +static int compat_generic_events[] = { + [PERF_COUNT_HW_CPU_CYCLES] = PM_CYC, + [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_CMPL, +}; + +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. + */ +static int generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; 
+ +#undef C + +static struct power_pmu generic_compat_pmu = { + .name = "GENERIC_COMPAT", + .n_counter = MAX_PMU_COUNTERS, + .add_fields = ISA207_ADD_FIELDS, + .test_adder = ISA207_TEST_ADDER, + .compute_mmcr = isa207_compute_mmcr, + .get_constraint = isa207_get_constraint, + .disable_pmc = isa207_disable_pmc, + .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, + .n_generic = ARRAY_SIZE(compat_generic_events), + .generic_events = compat_generic_events, + .cache_events = &generic_compat_cache_events, + .attr_groups = generic_compat_pmu_attr_groups, +}; + +int init_generic_compat_pmu(void) +{ + int rc = 0; + + rc = register_power_pmu(&generic_compat_pmu); + if (rc) + return rc; + + /* Tell userspace that EBB is supported */ + cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_EBB; + + return 0; +} diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h index 683f48117132..f755c64da137 100644 --- a/arch/powerpc/perf/internal.h +++ b/arch/powerpc/perf/internal.h @@ -9,3 +9,4 @@ extern int init_power6_pmu(void); extern int init_power7_pmu(void); extern int init_power8_pmu(void); extern int init_power9_pmu(void); +extern int init_generic_compat_pmu(void); -- cgit v1.2.3-59-g8ed1b From 659a6e38db0b422c63fd68ca7e78a8daadca061e Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 1 Apr 2019 11:50:39 +0530 Subject: powerpc/perf: Remove PM_BR_CMPL_ALT from power9 event list The PM_BR_CMPL_ALT event is not supported, so remove it from the power9 event list. Fixes: 24bedcb7c811 ("powerpc/perf: Fix branch event code for power9") Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/power9-events-list.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index 063c9d9f2516..6b1dc9a83ede 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -63,8 +63,6 @@ EVENT(PM_RUN_CYC_ALT, 0x200f4) /* Instruction Dispatched */ EVENT(PM_INST_DISP, 0x200f2) EVENT(PM_INST_DISP_ALT, 0x300f2) -/* Alternate Branch event code */ -EVENT(PM_BR_CMPL_ALT, 0x10012) /* Branch event that are not strongly biased */ EVENT(PM_BR_2PATH, 0x20036) /* ALternate branch event that are not strongly biased */ -- cgit v1.2.3-59-g8ed1b From a913e5e8b43be1d3897a141ce61c1ec071cad89c Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 27 Nov 2018 13:54:52 +0530 Subject: powerpc/perf: Return accordingly on invalid chip-id in Nest hardware counter memory resides in a per-chip reserved memory region. During nest_imc_event_init(), the chip-id of the event's cpu is used to calculate the base memory address for that cpu. Return a proper error code if the calculated chip_id is invalid. Reported-by: Dan Carpenter Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support") Reviewed-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index b1c37cc3fa98..6159e9edddfd 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -487,6 +487,11 @@ static int nest_imc_event_init(struct perf_event *event) * Get the base memory addresss for this cpu.
*/ chip_id = cpu_to_chip_id(event->cpu); + + /* Return, if chip_id is not valid */ + if (chip_id < 0) + return -ENODEV; + pcni = pmu->mem_info; do { if (pcni->id == chip_id) { -- cgit v1.2.3-59-g8ed1b From 860b7d2286236170a36f94946d03ca9888d32571 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 18 Dec 2018 11:50:41 +0530 Subject: powerpc/perf: Fix loop exit condition in nest_imc_event_init The data structure (i.e. struct imc_mem_info) to hold the memory address information for nest imc units is allocated based on the number of nodes in the system. nest_imc_event_init() traverses this struct array to calculate the memory base address for the event-cpu. If we fail to find a match for the event cpu's chip-id in the imc_mem_info struct array, then the do-while loop will iterate until we crash. Fix this by changing the loop exit condition based on the number of non-zero vbase elements in the array, since the allocation is done for nr_chips + 1. Reported-by: Dan Carpenter Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support") Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 2 +- arch/powerpc/platforms/powernv/opal-imc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 6159e9edddfd..2d12f0037e3a 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -499,7 +499,7 @@ static int nest_imc_event_init(struct perf_event *event) break; } pcni++; - } while (pcni); + } while (pcni->vbase != 0); if (!flag) return -ENODEV; diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 58a07948c76e..3d27f02695e4 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -127,7 +127,7 @@ static int imc_get_mem_addr_nest(struct device_node *node, nr_chips)) goto error; - pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(*pmu_ptr->mem_info), + pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info), GFP_KERNEL); if (!pmu_ptr->mem_info) goto error; -- cgit v1.2.3-59-g8ed1b From d1720adff3783a2ba7c128e304a385d18962835b Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 16 Apr 2019 15:18:27 +0530 Subject: powerpc/include: Add data structures and macros for IMC trace mode Add the macros needed for IMC (In-Memory Collection Counters) trace-mode and a data structure to hold the trace-imc record data. Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since there is a new switch case added in the opal-calls for IMC. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/imc-pmu.h | 39 +++++++++++++++++++++++++++++++++++++ arch/powerpc/include/asm/opal-api.h | 1 + 2 files changed, 40 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 69f516ecb2fd..7c2ef0e42661 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -33,6 +33,7 @@ */ #define THREAD_IMC_LDBAR_MASK 0x0003ffffffffe000ULL #define THREAD_IMC_ENABLE 0x8000000000000000ULL +#define TRACE_IMC_ENABLE 0x4000000000000000ULL /* * For debugfs interface for imc-mode and imc-command @@ -59,6 +60,34 @@ struct imc_events { char *scale; }; +/* + * Trace IMC hardware updates a 64-byte record on + * Core Performance Monitoring Counter (CPMC) + * overflow.
Here is the layout for the trace imc record + * + * DW 0 : Timebase + * DW 1 : Program Counter + * DW 2 : PIDR information + * DW 3 : CPMC1 + * DW 4 : CPMC2 + * DW 5 : CPMC3 + * DW 6 : CPMC4 + * DW 7 : Timebase + * ..... + * + * The following is the data structure to hold trace imc data. + */ +struct trace_imc_data { + u64 tb1; + u64 ip; + u64 val; + u64 cpmc1; + u64 cpmc2; + u64 cpmc3; + u64 cpmc4; + u64 tb2; +}; + /* Event attribute array index */ #define IMC_FORMAT_ATTR 0 #define IMC_EVENT_ATTR 1 @@ -68,6 +97,13 @@ struct imc_events { /* PMU Format attribute macros */ #define IMC_EVENT_OFFSET_MASK 0xffffffffULL +/* + * Macro to mask bits 0:21 of first double word (which is the timebase) to + * compare with 8th double word (timebase) of trace imc record data. + */ +#define IMC_TRACE_RECORD_TB1_MASK 0x3ffffffffffULL + + /* * Device tree parser code detects IMC pmu support and * registers new IMC pmus. This structure will hold the @@ -113,6 +149,7 @@ struct imc_pmu_ref { enum { IMC_TYPE_THREAD = 0x1, + IMC_TYPE_TRACE = 0x2, IMC_TYPE_CORE = 0x4, IMC_TYPE_CHIP = 0x10, }; @@ -123,6 +160,8 @@ enum { #define IMC_DOMAIN_NEST 1 #define IMC_DOMAIN_CORE 2 #define IMC_DOMAIN_THREAD 3 +/* For trace-imc the domain is still thread but it operates in trace-mode */ +#define IMC_DOMAIN_TRACE 4 extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 234fde15b37c..e1577cfa7186 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1129,6 +1129,7 @@ enum { enum { OPAL_IMC_COUNTERS_NEST = 1, OPAL_IMC_COUNTERS_CORE = 2, + OPAL_IMC_COUNTERS_TRACE = 3, }; -- cgit v1.2.3-59-g8ed1b From dd50cf7cbc7bdd86483b797ac3d27b37d5aeeaa4 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 16 Apr 2019 15:18:28 +0530 Subject: powerpc/perf: Rearrange setting of ldbar for thread-imc LDBAR holds the memory address allocated for each cpu. For thread-imc the mode bit (i.e. bit 1) of LDBAR is set to accumulation. Currently, ldbar is loaded with the per-cpu memory address and the mode set to accumulation at boot time. To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del(). Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 2d12f0037e3a..23092a359ce0 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -793,8 +793,11 @@ static int core_imc_event_init(struct perf_event *event) } /* - * Allocates a page of memory for each of the online cpus, and write the - * physical base address of that page to the LDBAR for that cpu. + * Allocates a page of memory for each of the online cpus, and loads + * LDBAR with 0. + * The physical base address of the page allocated for a cpu will be + * written to the LDBAR for that cpu, when the thread-imc event + * is added.
* * LDBAR Register Layout: * @@ -812,7 +815,7 @@ static int core_imc_event_init(struct perf_event *event) */ static int thread_imc_mem_alloc(int cpu_id, int size) { - u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + u64 *local_mem = per_cpu(thread_imc_mem, cpu_id); int nid = cpu_to_node(cpu_id); if (!local_mem) { @@ -829,9 +832,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size) per_cpu(thread_imc_mem, cpu_id) = local_mem; } - ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; - - mtspr(SPRN_LDBAR, ldbar_value); + mtspr(SPRN_LDBAR, 0); return 0; } @@ -982,6 +983,7 @@ static int thread_imc_event_add(struct perf_event *event, int flags) { int core_id; struct imc_pmu_ref *ref; + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id()); if (flags & PERF_EF_START) imc_event_start(event, flags); @@ -990,6 +992,9 @@ static int thread_imc_event_add(struct perf_event *event, int flags) return -EINVAL; core_id = smp_processor_id() / threads_per_core; + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + mtspr(SPRN_LDBAR, ldbar_value); + /* * imc pmus are enabled only when it is used. * See if this is triggered for the first time. @@ -1021,11 +1026,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id; struct imc_pmu_ref *ref; - /* - * Take a snapshot and calculate the delta and update - * the event counter values. - */ - imc_event_update(event); + mtspr(SPRN_LDBAR, 0); core_id = smp_processor_id() / threads_per_core; ref = &core_imc_refc[core_id]; @@ -1044,6 +1045,11 @@ static void thread_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(&ref->lock); + /* + * Take a snapshot and calculate the delta and update + * the event counter values. + */ + imc_event_update(event); } /* update_pmu_ops : Populate the appropriate operations for "pmu" */ -- cgit v1.2.3-59-g8ed1b From 216c3087a346db8d7c8a064d2b8f0f49e4694934 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Tue, 16 Apr 2019 15:18:29 +0530 Subject: powerpc/perf: Add privileged access check for thread_imc Add code to restrict user access to the thread_imc pmu, since some events report privilege level information. Fixes: f74c89bd80fb3 ("powerpc/perf: Add thread IMC PMU support") Signed-off-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 23092a359ce0..975837d85a80 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -864,6 +864,9 @@ static int thread_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* Sampling not supported */ if (event->hw.sample_period) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From 72c69dcddce103338de558c5c6e9ef9e4f607ce1 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 16 Apr 2019 15:18:30 +0530 Subject: powerpc/perf: Trace imc events detection and cpuhotplug This patch detects trace-imc events, does memory initialization for each online cpu, and registers cpuhotplug call-backs.
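For readers new to the cpuhotplug state machine, the registration pattern the diff below relies on looks roughly like this. This is an illustrative sketch only: it uses the dynamic-state variant, while the patch itself adds a fixed CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE slot, and the my_trace_*() names are made up.

#include <linux/cpuhotplug.h>

/* Runs on every CPU that comes online (and, at registration time,
 * once on each CPU that is already up): allocate per-cpu buffers here. */
static int my_trace_online(unsigned int cpu)
{
	return 0;
}

/* Runs as a CPU goes offline: quiesce the hardware for that CPU,
 * e.g. by clearing LDBAR as the patch does. */
static int my_trace_offline(unsigned int cpu)
{
	return 0;
}

static int my_trace_cpu_init(void)
{
	/* CPUHP_AP_ONLINE_DYN asks the core to pick a free slot; the
	 * allocated state number (>= 0) is returned on success. */
	int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
				    "perf/powerpc/example:online",
				    my_trace_online, my_trace_offline);

	return ret < 0 ? ret : 0;
}

Because cpuhp_setup_state() invokes the online callback on all currently online CPUs, the patch can rely on it for first-time memory initialization as well.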
Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 104 ++++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h | 1 + 3 files changed, 108 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 975837d85a80..3cc5e1934f0c 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -43,6 +43,11 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem); static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; +/* Trace IMC data structures */ +static DEFINE_PER_CPU(u64 *, trace_imc_mem); +static struct imc_pmu_ref *trace_imc_refc; +static int trace_imc_mem_size; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -1055,6 +1060,59 @@ static void thread_imc_event_del(struct perf_event *event, int flags) imc_event_update(event); } +/* + * Allocate a page of memory for each cpu, and load LDBAR with 0. + */ +static int trace_imc_mem_alloc(int cpu_id, int size) +{ + u64 *local_mem = per_cpu(trace_imc_mem, cpu_id); + int phys_id = cpu_to_node(cpu_id), rc = 0; + int core_id = (cpu_id / threads_per_core); + + if (!local_mem) { + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!local_mem) + return -ENOMEM; + per_cpu(trace_imc_mem, cpu_id) = local_mem; + + /* Initialise the counters for trace mode */ + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem), + get_hard_smp_processor_id(cpu_id)); + if (rc) { + pr_info("IMC:opal init failed for trace imc\n"); + return rc; + } + } + + /* Init the mutex, if not already */ + trace_imc_refc[core_id].id = core_id; + mutex_init(&trace_imc_refc[core_id].lock); + + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int ppc_trace_imc_cpu_online(unsigned int cpu) +{ + return trace_imc_mem_alloc(cpu, trace_imc_mem_size); +} + +static int ppc_trace_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int trace_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, + "perf/powerpc/imc_trace:online", + ppc_trace_imc_cpu_online, + ppc_trace_imc_cpu_offline); +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1177,6 +1235,18 @@ static void cleanup_all_thread_imc_memory(void) } } +static void cleanup_all_trace_imc_memory(void) +{ + int i, order = get_order(trace_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(trace_imc_mem, i)) + free_pages((u64)per_cpu(trace_imc_mem, i), order); + + } + kfree(trace_imc_refc); +} + /* Function to free the attr_groups which are dynamically allocated */ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) { @@ -1218,6 +1288,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); cleanup_all_thread_imc_memory(); } + + if (pmu_ptr->domain == IMC_DOMAIN_TRACE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE); + cleanup_all_trace_imc_memory(); + } } /* @@ -1300,6 +1375,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, thread_imc_pmu = pmu_ptr; break; + case IMC_DOMAIN_TRACE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + 
return -ENOMEM; + + nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); + trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), + GFP_KERNEL); + if (!trace_imc_refc) + return -ENOMEM; + + trace_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = trace_imc_mem_alloc(cpu, trace_imc_mem_size); + if (res) { + cleanup_all_trace_imc_memory(); + goto err; + } + } + break; default: return -EINVAL; } @@ -1372,6 +1468,14 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id goto err_free_mem; } + break; + case IMC_DOMAIN_TRACE: + ret = trace_imc_cpu_init(); + if (ret) { + cleanup_all_trace_imc_memory(); + goto err_free_mem; + } + break; default: return -EINVAL; /* Unknown domain */ diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 3d27f02695e4..3e497b91d210 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -284,6 +284,9 @@ static int opal_imc_counters_probe(struct platform_device *pdev) case IMC_TYPE_THREAD: domain = IMC_DOMAIN_THREAD; break; + case IMC_TYPE_TRACE: + domain = IMC_DOMAIN_TRACE; + break; default: pr_warn("IMC Unknown Device type \n"); domain = -1; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e78281d07b70..c3413d9b8348 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -170,6 +170,7 @@ enum cpuhp_state { CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, + CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, -- cgit v1.2.3-59-g8ed1b From 012ae244845f19d5f6ca2a90426851bc5044a0dc Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 16 Apr 2019 15:18:31 +0530 Subject: powerpc/perf: Trace imc PMU functions Add PMU functions to support trace-imc. 
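Before the diff, the record-validation idea is worth seeing in isolation. Each 64-byte entry carries the timebase twice (DW 0 and DW 7 in the layout added to imc-pmu.h earlier in this series), so a consumer can tell fresh records from stale or torn ones. A rough standalone sketch of that check, with the be64_to_cpu() conversions and kernel types omitted:

#include <stdbool.h>
#include <stdint.h>

/* Mirrors struct trace_imc_data from asm/imc-pmu.h. */
struct trace_imc_data {
	uint64_t tb1, ip, val, cpmc1, cpmc2, cpmc3, cpmc4, tb2;
};

#define IMC_TRACE_RECORD_TB1_MASK 0x3ffffffffffULL

/* A record is usable only if its timebase moved past the last one we
 * accepted, and the masked copy in DW 0 matches the one in DW 7. */
static bool record_is_valid(const struct trace_imc_data *rec, uint64_t *prev_tb)
{
	if (rec->tb1 <= *prev_tb)
		return false;	/* stale: hardware has not written here yet */
	if ((rec->tb1 & IMC_TRACE_RECORD_TB1_MASK) != rec->tb2)
		return false;	/* torn: the two timebase copies disagree */
	*prev_tb = rec->tb1;
	return true;
}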
Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 205 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 204 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 3cc5e1934f0c..31fa753e2eb2 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -53,7 +53,7 @@ static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) return container_of(event->pmu, struct imc_pmu, pmu); } -PMU_FORMAT_ATTR(event, "config:0-40"); +PMU_FORMAT_ATTR(event, "config:0-61"); PMU_FORMAT_ATTR(offset, "config:0-31"); PMU_FORMAT_ATTR(rvalue, "config:32"); PMU_FORMAT_ATTR(mode, "config:33-40"); @@ -70,6 +70,25 @@ static struct attribute_group imc_format_group = { .attrs = imc_format_attrs, }; +/* Format attribute for imc trace-mode */ +PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19"); +PMU_FORMAT_ATTR(cpmc_event, "config:20-27"); +PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29"); +PMU_FORMAT_ATTR(cpmc_load, "config:30-61"); +static struct attribute *trace_imc_format_attrs[] = { + &format_attr_event.attr, + &format_attr_cpmc_reserved.attr, + &format_attr_cpmc_event.attr, + &format_attr_cpmc_samplesel.attr, + &format_attr_cpmc_load.attr, + NULL, +}; + +static struct attribute_group trace_imc_format_group = { +.name = "format", +.attrs = trace_imc_format_attrs, +}; + /* Get the cpumask printed to a buffer "buf" */ static ssize_t imc_pmu_cpumask_get_attr(struct device *dev, struct device_attribute *attr, @@ -1113,6 +1132,182 @@ static int trace_imc_cpu_init(void) ppc_trace_imc_cpu_offline); } +static u64 get_trace_imc_event_base_addr(void) +{ + return (u64)per_cpu(trace_imc_mem, smp_processor_id()); +} + +/* + * Function to parse trace-imc data obtained + * and to prepare the perf sample. 
+ */ +static int trace_imc_prepare_sample(struct trace_imc_data *mem, + struct perf_sample_data *data, + u64 *prev_tb, + struct perf_event_header *header, + struct perf_event *event) +{ + /* Sanity checks for a valid record */ + if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb) + *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1)); + else + return -EINVAL; + + if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) != + be64_to_cpu(READ_ONCE(mem->tb2))) + return -EINVAL; + + /* Prepare perf sample */ + data->ip = be64_to_cpu(READ_ONCE(mem->ip)); + data->period = event->hw.last_period; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + header->misc = 0; + + if (is_kernel_addr(data->ip)) + header->misc |= PERF_RECORD_MISC_KERNEL; + else + header->misc |= PERF_RECORD_MISC_USER; + + perf_event_header__init_id(header, data, event); + + return 0; +} + +static void dump_trace_imc_data(struct perf_event *event) +{ + struct trace_imc_data *mem; + int i, ret; + u64 prev_tb = 0; + + mem = (struct trace_imc_data *)get_trace_imc_event_base_addr(); + for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data)); + i++, mem++) { + struct perf_sample_data data; + struct perf_event_header header; + + ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event); + if (ret) /* Exit, if not a valid record */ + break; + else { + /* If this is a valid record, create the sample */ + struct perf_output_handle handle; + + if (perf_output_begin(&handle, event, header.size)) + return; + + perf_output_sample(&handle, &header, &data, event); + perf_output_end(&handle); + } + } +} + +static int trace_imc_event_add(struct perf_event *event, int flags) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref = NULL; + u64 local_mem, ldbar_value; + + /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */ + local_mem = get_trace_imc_event_base_addr(); + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE; + + if (core_imc_refc) + ref = &core_imc_refc[core_id]; + if (!ref) { + /* If core-imc is not enabled, use trace-imc reference count */ + if (trace_imc_refc) + ref = &trace_imc_refc[core_id]; + if (!ref) + return -EINVAL; + } + mtspr(SPRN_LDBAR, ldbar_value); + mutex_lock(&ref->lock); + if (ref->refc == 0) { + if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("trace-imc: Unable to start the counters for core %d\n", core_id); + mtspr(SPRN_LDBAR, 0); + return -EINVAL; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + + return 0; +} + +static void trace_imc_event_read(struct perf_event *event) +{ + return; +} + +static void trace_imc_event_stop(struct perf_event *event, int flags) +{ + u64 local_mem = get_trace_imc_event_base_addr(); + dump_trace_imc_data(event); + memset((void *)local_mem, 0, sizeof(u64)); +} + +static void trace_imc_event_start(struct perf_event *event, int flags) +{ + return; +} + +static void trace_imc_event_del(struct perf_event *event, int flags) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref = NULL; + + if (core_imc_refc) + ref = &core_imc_refc[core_id]; + if (!ref) { + /* If core-imc is not enabled, use trace-imc reference count */ + if (trace_imc_refc) + ref = &trace_imc_refc[core_id]; + if (!ref) + return; + } + mtspr(SPRN_LDBAR, 0); + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + if 
(opal_imc_counters_stop(OPAL_IMC_COUNTERS_TRACE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("trace-imc: Unable to stop the counters for core %d\n", core_id); + return; + } + } else if (ref->refc < 0) { + ref->refc = 0; + } + mutex_unlock(&ref->lock); + trace_imc_event_stop(event, flags); +} + +static int trace_imc_event_init(struct perf_event *event) +{ + struct task_struct *target; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* Return if this is a counting event */ + if (event->attr.sample_period == 0) + return -ENOENT; + + event->hw.idx = -1; + target = event->hw.target; + + event->pmu->task_ctx_nr = perf_hw_context; + return 0; +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1143,6 +1338,14 @@ static int update_pmu_ops(struct imc_pmu *pmu) pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn; pmu->pmu.commit_txn = thread_imc_pmu_commit_txn; break; + case IMC_DOMAIN_TRACE: + pmu->pmu.event_init = trace_imc_event_init; + pmu->pmu.add = trace_imc_event_add; + pmu->pmu.del = trace_imc_event_del; + pmu->pmu.start = trace_imc_event_start; + pmu->pmu.stop = trace_imc_event_stop; + pmu->pmu.read = trace_imc_event_read; + pmu->attr_groups[IMC_FORMAT_ATTR] = &trace_imc_format_group; default: break; } -- cgit v1.2.3-59-g8ed1b From 5266e58d6cd90ac85c187d673093ad9cb649e16d Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Mon, 15 Apr 2019 14:52:11 +0300 Subject: powerpc/booke64: set RI in default MSR Set RI in the default kernel's MSR so that the architected way of detecting unrecoverable machine check interrupts has a chance to work. This is in line with the MSR setup of the rest of the booke powerpc architectures configured here. Signed-off-by: Laurentiu Tudor Cc: stable@vger.kernel.org Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/reg_booke.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index eb2a33d5df26..e382bd6ede84 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -41,7 +41,7 @@ #if defined(CONFIG_PPC_BOOK3E_64) #define MSR_64BIT MSR_CM -#define MSR_ (MSR_ME | MSR_CE) +#define MSR_ (MSR_ME | MSR_RI | MSR_CE) #define MSR_KERNEL (MSR_ | MSR_64BIT) #define MSR_USER32 (MSR_ | MSR_PR | MSR_EE) #define MSR_USER64 (MSR_USER32 | MSR_64BIT) -- cgit v1.2.3-59-g8ed1b From f995adb0ac5bcfa0d396ba7706aab420c7a6fec2 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 30 Apr 2019 22:53:30 +0000 Subject: MIPS: Use memblock_phys_alloc() for exception vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allocate the exception vector using memblock_phys_alloc(), which gives us a physical address, rather than the previous convoluted setup which obtained a virtual address using memblock_alloc(), converted it to a physical address & then back to a virtual address.
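In sketch form, the change swaps a virtual-first allocation for a physical-first one (illustrative helper functions, not code from the patch):

/* Old flow: memblock_alloc() hands back a virtual address, which the
 * code then converted to physical with __pa() - and later back again. */
static unsigned long alloc_vector_old(phys_addr_t *pa, phys_addr_t size,
				      phys_addr_t align)
{
	unsigned long va = (unsigned long)memblock_alloc(size, align);

	*pa = __pa(va);
	return va;
}

/* New flow: allocate the physical range directly and derive the
 * virtual address once, only where it is actually needed. */
static unsigned long alloc_vector_new(phys_addr_t *pa, phys_addr_t size,
				      phys_addr_t align)
{
	*pa = memblock_phys_alloc(size, align);
	return (unsigned long)phys_to_virt(*pa);
}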
Signed-off-by: Paul Burton Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Serge Semin Tested-by: Serge Semin Cc: linux-mips@vger.kernel.org --- arch/mips/kernel/traps.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 98ca55d62201..00f44b16385e 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2292,9 +2292,8 @@ void __init trap_init(void) unsigned long size = 0x200 + VECTORSPACING*64; phys_addr_t ebase_pa; - ebase = (unsigned long) - memblock_alloc(size, 1 << fls(size)); - if (!ebase) + ebase_pa = memblock_phys_alloc(size, 1 << fls(size)); + if (!ebase_pa) panic("%s: Failed to allocate %lu bytes align=0x%x\n", __func__, size, 1 << fls(size)); @@ -2309,9 +2308,10 @@ void __init trap_init(void) * EVA is special though as it allows segments to be rearranged * and to become uncached during cache error handling. */ - ebase_pa = __pa(ebase); if (!IS_ENABLED(CONFIG_EVA) && !WARN_ON(ebase_pa >= 0x20000000)) ebase = CKSEG0ADDR(ebase_pa); + else + ebase = (unsigned long)phys_to_virt(ebase_pa); } else { ebase = CAC_BASE; -- cgit v1.2.3-59-g8ed1b From 172dcd935c34b022729f45a7bbaae5cc05231533 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 30 Apr 2019 22:53:30 +0000 Subject: MIPS: Always allocate exception vector for MIPSr2+ Currently we allocate the exception vector on systems which use a vectored interrupt mode, but otherwise attempt to reuse whatever exception vector the bootloader uses. This can be problematic for a number of reasons: 1) The memory isn't properly marked reserved in the memblock allocator. We've relied on the fact that EBase is generally in the memory below the kernel image which we don't free, but this is about to change. 2) Recent versions of U-Boot place their exception vector high in kseg0, in memory which isn't protected by being lower than the kernel anyway & can end up being clobbered. 3) We are unnecessarily reliant upon there being memory at the address EBase points to upon entry to the kernel. This is often the case, but if the bootloader doesn't configure EBase & leaves it with its default value then we rely upon there being memory at physical address 0 for no good reason. Improve this situation by allocating the exception vector in all cases when running on MIPSr2 or higher, and reserving the memory for MIPSr1 or lower. This ensures we don't clobber the exception vector in any configuration, and for MIPSr2 & higher removes the need for memory at physical address 0. 
Signed-off-by: Paul Burton Reviewed-by: Serge Semin Tested-by: Serge Semin Cc: linux-mips@vger.kernel.org --- arch/mips/kernel/traps.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 00f44b16385e..9b565ed51662 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2284,18 +2284,27 @@ void __init trap_init(void) extern char except_vec3_generic; extern char except_vec4; extern char except_vec3_r4000; - unsigned long i; + unsigned long i, vec_size; + phys_addr_t ebase_pa; check_wait(); - if (cpu_has_veic || cpu_has_vint) { - unsigned long size = 0x200 + VECTORSPACING*64; - phys_addr_t ebase_pa; + if (!cpu_has_mips_r2_r6) { + ebase = CAC_BASE; + ebase_pa = virt_to_phys((void *)ebase); + vec_size = 0x400; + + memblock_reserve(ebase_pa, vec_size); + } else { + if (cpu_has_veic || cpu_has_vint) + vec_size = 0x200 + VECTORSPACING*64; + else + vec_size = PAGE_SIZE; - ebase_pa = memblock_phys_alloc(size, 1 << fls(size)); + ebase_pa = memblock_phys_alloc(vec_size, 1 << fls(vec_size)); if (!ebase_pa) panic("%s: Failed to allocate %lu bytes align=0x%x\n", - __func__, size, 1 << fls(size)); + __func__, vec_size, 1 << fls(vec_size)); /* * Try to ensure ebase resides in KSeg0 if possible. @@ -2312,20 +2321,6 @@ void __init trap_init(void) ebase = CKSEG0ADDR(ebase_pa); else ebase = (unsigned long)phys_to_virt(ebase_pa); - } else { - ebase = CAC_BASE; - - if (cpu_has_mips_r2_r6) { - if (cpu_has_ebase_wg) { -#ifdef CONFIG_64BIT - ebase = (read_c0_ebase_64() & ~0xfff); -#else - ebase = (read_c0_ebase() & ~0xfff); -#endif - } else { - ebase += (read_c0_ebase() & 0x3ffff000); - } - } } if (cpu_has_mmips) { -- cgit v1.2.3-59-g8ed1b From 783454e2bc7ce491b5cd50154433cde993bfd849 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 30 Apr 2019 22:53:31 +0000 Subject: MIPS: Sync icache for whole exception vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than performing cache flushing for a fixed 0x400 bytes, use the actual size of the vector in order to ensure we cover all emitted code on systems that make use of vectored interrupts. Signed-off-by: Paul Burton Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Serge Semin Tested-by: Serge Semin Cc: linux-mips@vger.kernel.org --- arch/mips/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 9b565ed51662..2775190adbe7 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2454,7 +2454,7 @@ void __init trap_init(void) else set_handler(0x080, &except_vec3_generic, 0x80); - local_flush_icache_range(ebase, ebase + 0x400); + local_flush_icache_range(ebase, ebase + vec_size); sort_extable(__start___dbe_table, __stop___dbe_table); -- cgit v1.2.3-59-g8ed1b From de56d4c1da3e68f0ca468a55f6677bef3cee6e10 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 30 Apr 2019 22:53:31 +0000 Subject: MIPS: Remove duplicate EBase configuration Clean up our configuration of the EBase register by making configure_exception_vector() write to it unconditionally on systems implementing MIPSr2 or higher, and removing the duplicate code in per_cpu_trap_init(). The latter would have duplicated work on systems with vectored interrupts, and didn't set BEV for safety like the configure_exception_vector() version of the code does. 
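The write sequence that now lives only in configure_exception_vector() deserves a note, since each step matters. An annotated sketch of that same sequence (the CONFIG_64BIT write_c0_ebase_64() variant is omitted for brevity):

static void update_ebase(unsigned long ebase)
{
	/* Send exceptions to the boot (ROM) vectors while EBase is in
	 * flux, so a stray exception cannot use a half-written base. */
	unsigned long sr = set_c0_status(ST0_BEV);

	/* With the write gate (WG) set, the top bits of EBase become
	 * writable; without it only the middle bits can change. */
	if (cpu_has_ebase_wg)
		write_c0_ebase(ebase | MIPS_EBASE_WG);

	write_c0_ebase(ebase);	/* final value, WG cleared */
	write_c0_status(sr);	/* back to the RAM vectors */
}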
Signed-off-by: Paul Burton Reviewed-by: Serge Semin Tested-by: Serge Semin Cc: linux-mips@vger.kernel.org --- arch/mips/kernel/traps.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 2775190adbe7..c52766a5b85f 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2151,7 +2151,7 @@ static void configure_hwrena(void) static void configure_exception_vector(void) { - if (cpu_has_veic || cpu_has_vint) { + if (cpu_has_mips_r2_r6) { unsigned long sr = set_c0_status(ST0_BEV); /* If available, use WG to set top bits of EBASE */ if (cpu_has_ebase_wg) { @@ -2163,6 +2163,8 @@ static void configure_exception_vector(void) } write_c0_ebase(ebase); write_c0_status(sr); + } + if (cpu_has_veic || cpu_has_vint) { /* Setting vector spacing enables EI/VI mode */ change_c0_intctl(0x3e0, VECTORSPACING); } @@ -2193,22 +2195,6 @@ void per_cpu_trap_init(bool is_boot_cpu) * o read IntCtl.IPFDC to determine the fast debug channel interrupt */ if (cpu_has_mips_r2_r6) { - /* - * We shouldn't trust a secondary core has a sane EBASE register - * so use the one calculated by the boot CPU. - */ - if (!is_boot_cpu) { - /* If available, use WG to set top bits of EBASE */ - if (cpu_has_ebase_wg) { -#ifdef CONFIG_64BIT - write_c0_ebase_64(ebase | MIPS_EBASE_WG); -#else - write_c0_ebase(ebase | MIPS_EBASE_WG); -#endif - } - write_c0_ebase(ebase); - } - cp0_compare_irq_shift = CAUSEB_TI - CAUSEB_IP; cp0_compare_irq = (read_c0_intctl() >> INTCTLB_IPTI) & 7; cp0_perfcount_irq = (read_c0_intctl() >> INTCTLB_IPPCI) & 7; -- cgit v1.2.3-59-g8ed1b From b93ddc4f9156205eb3c8df6ad35a37be3fa4e31e Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 24 Apr 2019 01:47:40 +0300 Subject: mips: Reserve memory for the kernel image resources The reserved_end variable had been used by the bootmem_init() code to find the lowest limit of memory available for the memmap blob. The original code just tried to find a free memory space higher than where the kernel was placed. This limitation seems justified for the memmap region search process, but I can't see any obvious reason to reserve the unused space below the kernel, since some platforms place it much higher than the standard 1MB. Moreover, the CONFIG_RELOCATABLE option enables the kernel to be loaded at any memory address. So let's reserve only the memory occupied by the kernel, leaving the region below free for allocations. After doing this we can now discard the code freeing the space between the kernel's _text and VMLINUX_LOAD_ADDRESS symbols, since it's going to be free anyway (unless marked as reserved by platforms).
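Side by side, the reservation before and after this change (distilled from the diff below):

/* Before: everything from the start of RAM up to _end was reserved,
 * including loader or firmware areas below the kernel image. */
memblock_reserve(PHYS_OFFSET,
		 (PFN_UP(__pa_symbol(&_end)) << PAGE_SHIFT) - PHYS_OFFSET);

/* After: only the image itself, [_text, _end); memory below _text
 * stays free for allocations. */
memblock_reserve(__pa_symbol(&_text),
		 __pa_symbol(&_end) - __pa_symbol(&_text));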
Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Matt Redfearn Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 185e0e42e009..f71a7d32a687 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -371,7 +371,6 @@ static void __init bootmem_init(void) static void __init bootmem_init(void) { - unsigned long reserved_end; phys_addr_t ramstart = PHYS_ADDR_MAX; int i; @@ -382,10 +381,10 @@ static void __init bootmem_init(void) * will reserve the area used for the initrd. */ init_initrd(); - reserved_end = (unsigned long) PFN_UP(__pa_symbol(&_end)); - memblock_reserve(PHYS_OFFSET, - (reserved_end << PAGE_SHIFT) - PHYS_OFFSET); + /* Reserve memory occupied by kernel. */ + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&_end) - __pa_symbol(&_text)); /* * max_low_pfn is not a number of pages. The number of pages @@ -501,29 +500,6 @@ static void __init bootmem_init(void) memory_present(0, start, end); } -#ifdef CONFIG_RELOCATABLE - /* - * The kernel reserves all memory below its _end symbol as bootmem, - * but the kernel may now be at a much higher address. The memory - * between the original and new locations may be returned to the system. - */ - if (__pa_symbol(_text) > __pa_symbol(VMLINUX_LOAD_ADDRESS)) { - unsigned long offset; - extern void show_kernel_relocation(const char *level); - - offset = __pa_symbol(_text) - __pa_symbol(VMLINUX_LOAD_ADDRESS); - memblock_free(__pa_symbol(VMLINUX_LOAD_ADDRESS), offset); - -#if defined(CONFIG_DEBUG_KERNEL) && defined(CONFIG_DEBUG_INFO) - /* - * This information is necessary when debugging the kernel - * But is a security vulnerability otherwise! - */ - show_kernel_relocation(KERN_INFO); -#endif - } -#endif - /* * Reserve initrd memory if needed. */ -- cgit v1.2.3-59-g8ed1b From eadb6925efeb0c254d17e1da9bb730d2add5613d Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 24 Apr 2019 01:47:41 +0300 Subject: mips: Discard post-CMA-init foreach loop Really the loop is pointless, since it walks over memblock-reserved memory regions and marks them as reserved in memblock again. Before bootmem was removed from the kernel, this loop had been used to map the memory reserved by CMA into the legacy bootmem allocator. But now the early memory allocator is memblock, which is used by CMA for reservation, so we don't need any mapping anymore.
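For reference, here is the removed loop, annotated: every region it visits is already tracked in memblock.reserved, so reserving it again changes nothing.

struct memblock_region *reg;

/* Walk memblock's own list of reserved regions... */
for_each_memblock(reserved, reg)
	if (reg->size != 0)
		/* ...and insert each range back into that same list:
		 * a no-op now that bootmem is gone. */
		memblock_reserve(reg->base, reg->size);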
Reviewed-by: Matt Redfearn Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index f71a7d32a687..2ae6b02b948f 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -708,7 +708,6 @@ static void __init request_crashkernel(struct resource *res) */ static void __init arch_mem_init(char **cmdline_p) { - struct memblock_region *reg; extern void plat_mem_setup(void); /* @@ -814,10 +813,6 @@ static void __init arch_mem_init(char **cmdline_p) plat_swiotlb_setup(); dma_contiguous_reserve(PFN_PHYS(max_low_pfn)); - /* Tell bootmem about cma reserved memblock section */ - for_each_memblock(reserved, reg) - if (reg->size != 0) - memblock_reserve(reg->base, reg->size); reserve_bootmem_region(__pa_symbol(&__nosave_begin), __pa_symbol(&__nosave_end)); /* Reserve for hibernation */ -- cgit v1.2.3-59-g8ed1b From 4e50a35de4ccc834dbc32c664fb068f4c24cfebf Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 24 Apr 2019 01:47:42 +0300 Subject: mips: Use memblock to reserve the __nosave memory range Originally before legacy bootmem was removed, the memory for the range was correctly reserved by reserve_bootmem_region(). But since memblock has been selected for early memory allocation the function can be utilized only after paging is fully initialized (as it is done by memblock_free_all() function). So calling it from arch_mem_init() method is prone to errors, and at this stage we need to reserve the memory in the memblock allocator. Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Matt Redfearn Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 2ae6b02b948f..3a5140943f54 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -814,8 +814,9 @@ static void __init arch_mem_init(char **cmdline_p) dma_contiguous_reserve(PFN_PHYS(max_low_pfn)); - reserve_bootmem_region(__pa_symbol(&__nosave_begin), - __pa_symbol(&__nosave_end)); /* Reserve for hibernation */ + /* Reserve for hibernation. */ + memblock_reserve(__pa_symbol(&__nosave_begin), + __pa_symbol(&__nosave_end) - __pa_symbol(&__nosave_begin)); } static void __init resource_init(void) -- cgit v1.2.3-59-g8ed1b From 9b9a59db84812d326af41a3802c63f1f95d81016 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 24 Apr 2019 01:47:43 +0300 Subject: mips: Add reserve-nomap memory type support It might be necessary to prevent the virtual mapping creation for a requested memory region. For instance there is a "no-map" property indicating exactly this feature. In this case we need to not only reserve the specified region by pretending it doesn't exist in the memory space, but completely remove the range from system just by removing it from memblock. 
The same way it's done in default early_init_dt_reserve_memory_arch() method. Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Matt Redfearn Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/include/asm/bootinfo.h | 1 + arch/mips/kernel/prom.c | 4 +++- arch/mips/kernel/setup.c | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/include/asm/bootinfo.h b/arch/mips/include/asm/bootinfo.h index a301a8f4bc66..235bc2f52113 100644 --- a/arch/mips/include/asm/bootinfo.h +++ b/arch/mips/include/asm/bootinfo.h @@ -92,6 +92,7 @@ extern unsigned long mips_machtype; #define BOOT_MEM_ROM_DATA 2 #define BOOT_MEM_RESERVED 3 #define BOOT_MEM_INIT_RAM 4 +#define BOOT_MEM_NOMAP 5 /* * A memory map that's built upon what was determined diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index 93b8e0b4332f..437a174e3ef9 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -47,7 +47,9 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) int __init early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { - add_memory_region(base, size, BOOT_MEM_RESERVED); + add_memory_region(base, size, + nomap ? BOOT_MEM_NOMAP : BOOT_MEM_RESERVED); + return 0; } diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 3a5140943f54..2a1b2e7a1bc9 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -178,6 +178,7 @@ static bool __init __maybe_unused memory_region_available(phys_addr_t start, in_ram = true; break; case BOOT_MEM_RESERVED: + case BOOT_MEM_NOMAP: if ((start >= start_ && start < end_) || (start < start_ && start + size >= start_)) free = false; @@ -213,6 +214,9 @@ static void __init print_memory_map(void) case BOOT_MEM_RESERVED: printk(KERN_CONT "(reserved)\n"); break; + case BOOT_MEM_NOMAP: + printk(KERN_CONT "(nomap)\n"); + break; default: printk(KERN_CONT "type %lu\n", boot_mem_map.map[i].type); break; @@ -487,6 +491,9 @@ static void __init bootmem_init(void) switch (boot_mem_map.map[i].type) { case BOOT_MEM_RAM: break; + case BOOT_MEM_NOMAP: /* Discard the range from the system. */ + memblock_remove(PFN_PHYS(start), PFN_PHYS(end - start)); + continue; default: /* Reserve the rest of the memory types at boot time */ memblock_reserve(PFN_PHYS(start), PFN_PHYS(end - start)); break; @@ -861,6 +868,7 @@ static void __init resource_init(void) res->flags |= IORESOURCE_SYSRAM; break; case BOOT_MEM_RESERVED: + case BOOT_MEM_NOMAP: default: res->name = "reserved"; } -- cgit v1.2.3-59-g8ed1b From d17aa2d262e8574a8c6befb5b6470d1c32875cf8 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 16 Apr 2019 14:53:50 -0700 Subject: ARM: dts: rockchip: Hook resets up to USB PHYs on rk3288. Let's hook up the resets to the three USB PHYs on rk3288 as per the bindings. This is in preparation for a future patch that will set the "snps,reset-phy-on-wake" on the host port. 
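These properties only describe the wiring; a driver still has to look the reset up by name and pulse it. A hedged sketch of that consumer side (illustrative function and hold time; the real handling lands in the PHY and dwc2 drivers):

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/reset.h>

static int phy_pulse_reset(struct device *dev)
{
	/* "phy-reset" matches the reset-names value added below. */
	struct reset_control *rst = devm_reset_control_get(dev, "phy-reset");

	if (IS_ERR(rst))
		return PTR_ERR(rst);

	reset_control_assert(rst);
	usleep_range(500, 1000);	/* hold time is hardware-specific */
	reset_control_deassert(rst);
	return 0;
}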
Signed-off-by: Douglas Anderson Signed-off-by: Felipe Balbi --- arch/arm/boot/dts/rk3288.dtsi | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index a024d1e7e74c..3f361fad4684 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -904,6 +904,8 @@ clocks = <&cru SCLK_OTGPHY0>; clock-names = "phyclk"; #clock-cells = <0>; + resets = <&cru SRST_USBOTG_PHY>; + reset-names = "phy-reset"; }; usbphy1: usb-phy@334 { @@ -912,6 +914,8 @@ clocks = <&cru SCLK_OTGPHY1>; clock-names = "phyclk"; #clock-cells = <0>; + resets = <&cru SRST_USBHOST0_PHY>; + reset-names = "phy-reset"; }; usbphy2: usb-phy@348 { @@ -920,6 +924,8 @@ clocks = <&cru SCLK_OTGPHY2>; clock-names = "phyclk"; #clock-cells = <0>; + resets = <&cru SRST_USBHOST1_PHY>; + reset-names = "phy-reset"; }; }; }; -- cgit v1.2.3-59-g8ed1b From 5bdd614d65e314ac1530f9462c3ab955f3d3302b Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 16 Apr 2019 14:53:51 -0700 Subject: ARM: dts: rockchip: Add quirk for resetting rk3288's dwc2 host on wakeup The "host" USB port on rk3288 has a hardware errata where we've got to assert a PHY reset whenever we see a remote wakeup. Add that quirk property to the device tree. Signed-off-by: Douglas Anderson Reviewed-by: Matthias Kaehlcke Signed-off-by: Felipe Balbi --- arch/arm/boot/dts/rk3288.dtsi | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index 3f361fad4684..8ce3dd2264b1 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -616,6 +616,7 @@ dr_mode = "host"; phys = <&usbphy2>; phy-names = "usb2-phy"; + snps,reset-phy-on-wake; status = "disabled"; }; -- cgit v1.2.3-59-g8ed1b From 17d9822d4b4c1bfeb14971d9a2639698e7f5b1eb Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 4 Apr 2019 02:26:22 -0400 Subject: parisc: Consider stack randomization for mmap base only when necessary Do not offset mmap base address because of stack randomization if current task does not want randomization. Signed-off-by: Alexandre Ghiti Signed-off-by: Helge Deller --- arch/parisc/kernel/sys_parisc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 376ea0d1b275..4407ac4c1d84 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -86,7 +86,8 @@ static unsigned long mmap_upper_limit(struct rlimit *rlim_stack) stack_base = STACK_SIZE_MAX; /* Add space for stack randomization. */ - stack_base += (STACK_RND_MASK << PAGE_SHIFT); + if (current->flags & PF_RANDOMIZE) + stack_base += (STACK_RND_MASK << PAGE_SHIFT); return PAGE_ALIGN(STACK_TOP - stack_base); } -- cgit v1.2.3-59-g8ed1b From ccfbc68d41c2db8f5e88128427fb5bfe3855ff9b Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 4 Apr 2019 21:14:08 +0200 Subject: parisc: add set_fixmap()/clear_fixmap() These functions will be used for adding code patching functions later. 
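The intended usage pattern, roughly. This sketch is what the code-patching patch later in this series does in essence; cache and TLB maintenance is omitted, and the address is assumed to sit in the core kernel mapping:

static void poke_ro_text(u32 *addr, u32 insn)
{
	struct page *page = virt_to_page(addr);
	unsigned long va;

	/* Map the page holding the read-only text at the fixed,
	 * writable FIX_TEXT_POKE0 slot... */
	set_fixmap(FIX_TEXT_POKE0, page_to_phys(page));

	/* ...write through the alias at the same offset in the page... */
	va = __fix_to_virt(FIX_TEXT_POKE0) + ((unsigned long)addr & ~PAGE_MASK);
	*(u32 *)va = insn;

	/* ...then tear the alias down again. */
	clear_fixmap(FIX_TEXT_POKE0);
}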
Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/include/asm/fixmap.h | 19 ++++++++++++++++++- arch/parisc/mm/Makefile | 2 +- arch/parisc/mm/fixmap.c | 41 ++++++++++++++++++++++++++++++++++++++++ arch/parisc/mm/init.c | 14 +++++++++----- 4 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 arch/parisc/mm/fixmap.c (limited to 'arch') diff --git a/arch/parisc/include/asm/fixmap.h b/arch/parisc/include/asm/fixmap.h index f7c3a0905de4..288da73d4cc0 100644 --- a/arch/parisc/include/asm/fixmap.h +++ b/arch/parisc/include/asm/fixmap.h @@ -15,17 +15,34 @@ * from areas congruently mapped with user space. It is 8MB large * and must be 16MB aligned */ #define TMPALIAS_MAP_START ((__PAGE_OFFSET) - 16*1024*1024) + +#define FIXMAP_SIZE (FIX_BITMAP_COUNT << PAGE_SHIFT) +#define FIXMAP_START (TMPALIAS_MAP_START - FIXMAP_SIZE) /* This is the kernel area for all maps (vmalloc, dma etc.) most * usually, it extends up to TMPALIAS_MAP_START. Virtual addresses * 0..GATEWAY_PAGE_SIZE are reserved for the gateway page */ #define KERNEL_MAP_START (GATEWAY_PAGE_SIZE) -#define KERNEL_MAP_END (TMPALIAS_MAP_START) +#define KERNEL_MAP_END (FIXMAP_START) #ifndef __ASSEMBLY__ + + +enum fixed_addresses { + /* Support writing RO kernel text via kprobes, jump labels, etc. */ + FIX_TEXT_POKE0, + FIX_BITMAP_COUNT +}; + extern void *parisc_vmalloc_start; #define PCXL_DMA_MAP_SIZE (8*1024*1024) #define VMALLOC_START ((unsigned long)parisc_vmalloc_start) #define VMALLOC_END (KERNEL_MAP_END) + +#define __fix_to_virt(_x) (FIXMAP_START + ((_x) << PAGE_SHIFT)) + +void set_fixmap(enum fixed_addresses idx, phys_addr_t phys); +void clear_fixmap(enum fixed_addresses idx); + #endif /*__ASSEMBLY__*/ #endif /*_ASM_FIXMAP_H*/ diff --git a/arch/parisc/mm/Makefile b/arch/parisc/mm/Makefile index 134393de69d2..20e39b043a60 100644 --- a/arch/parisc/mm/Makefile +++ b/arch/parisc/mm/Makefile @@ -2,5 +2,5 @@ # Makefile for arch/parisc/mm # -obj-y := init.o fault.o ioremap.o +obj-y := init.o fault.o ioremap.o fixmap.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/parisc/mm/fixmap.c b/arch/parisc/mm/fixmap.c new file mode 100644 index 000000000000..c8d41b54fb19 --- /dev/null +++ b/arch/parisc/mm/fixmap.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fixmaps for parisc + * + * Copyright (c) 2019 Sven Schnelle + */ + +#include +#include +#include +#include + +void set_fixmap(enum fixed_addresses idx, phys_addr_t phys) +{ + unsigned long vaddr = __fix_to_virt(idx); + pgd_t *pgd = pgd_offset_k(vaddr); + pmd_t *pmd = pmd_offset(pgd, vaddr); + pte_t *pte; + + if (pmd_none(*pmd)) + pmd = pmd_alloc(NULL, pgd, vaddr); + + pte = pte_offset_kernel(pmd, vaddr); + if (pte_none(*pte)) + pte = pte_alloc_kernel(pmd, vaddr); + + set_pte_at(&init_mm, vaddr, pte, __mk_pte(phys, PAGE_KERNEL_RWX)); + flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); +} + +void clear_fixmap(enum fixed_addresses idx) +{ + unsigned long vaddr = __fix_to_virt(idx); + pgd_t *pgd = pgd_offset_k(vaddr); + pmd_t *pmd = pmd_offset(pgd, vaddr); + pte_t *pte = pte_offset_kernel(pmd, vaddr); + + pte_clear(&init_mm, vaddr, pte); + + flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); +} diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index d0b166256f1a..b2b52de2b82b 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -622,15 +622,19 @@ void __init mem_init(void) * But keep code for debugging purposes. 
*/ printk("virtual kernel memory layout:\n" - " vmalloc : 0x%px - 0x%px (%4ld MB)\n" - " memory : 0x%px - 0x%px (%4ld MB)\n" - " .init : 0x%px - 0x%px (%4ld kB)\n" - " .data : 0x%px - 0x%px (%4ld kB)\n" - " .text : 0x%px - 0x%px (%4ld kB)\n", + " vmalloc : 0x%px - 0x%px (%4ld MB)\n" + " fixmap : 0x%px - 0x%px (%4ld kB)\n" + " memory : 0x%px - 0x%px (%4ld MB)\n" + " .init : 0x%px - 0x%px (%4ld kB)\n" + " .data : 0x%px - 0x%px (%4ld kB)\n" + " .text : 0x%px - 0x%px (%4ld kB)\n", (void*)VMALLOC_START, (void*)VMALLOC_END, (VMALLOC_END - VMALLOC_START) >> 20, + (void *)FIXMAP_START, (void *)(FIXMAP_START + FIXMAP_SIZE), + (unsigned long)(FIXMAP_SIZE / 1024), + __va(0), high_memory, ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, -- cgit v1.2.3-59-g8ed1b From 620a53d522ba007a79fffc444bd75e8d3775f5b8 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 4 Apr 2019 21:14:09 +0200 Subject: parisc: add parisc code patching Instead of re-mapping the whole kernel text with RWX rights, add a patch_text() helper that can be used to replace instructions in the kernel .text section. Based on the ARM implementation. Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/include/asm/patch.h | 11 ++++++ arch/parisc/kernel/Makefile | 3 +- arch/parisc/kernel/patch.c | 78 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 arch/parisc/include/asm/patch.h create mode 100644 arch/parisc/kernel/patch.c (limited to 'arch') diff --git a/arch/parisc/include/asm/patch.h b/arch/parisc/include/asm/patch.h new file mode 100644 index 000000000000..685b58a13968 --- /dev/null +++ b/arch/parisc/include/asm/patch.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PARISC_KERNEL_PATCH_H +#define _PARISC_KERNEL_PATCH_H + +/* stop machine and patch kernel text */ +void patch_text(void *addr, unsigned int insn); + +/* patch kernel text with machine already stopped (e.g.
in kgdb) */ void __patch_text(void *addr, unsigned int insn); + +#endif diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index 8e5f1ab65c68..d29bbd1a73b2 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -9,7 +9,8 @@ obj-y := cache.o pacache.o setup.o pdt.o traps.o time.o irq.o \ pa7300lc.o syscall.o entry.o sys_parisc.o firmware.o \ ptrace.o hardware.o inventory.o drivers.o alternative.o \ signal.o hpmc.o real2.o parisc_ksyms.o unaligned.o \ - process.o processor.o pdc_cons.o pdc_chassis.o unwind.o + process.o processor.o pdc_cons.o pdc_chassis.o unwind.o \ + patch.o ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c new file mode 100644 index 000000000000..8c5ee684cb16 --- /dev/null +++ b/arch/parisc/kernel/patch.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + /* + * functions to patch RO kernel text during runtime + * + * Copyright (c) 2019 Sven Schnelle + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +struct patch { + void *addr; + unsigned int insn; +}; + +static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) +{ + unsigned long uintaddr = (uintptr_t) addr; + bool module = !core_kernel_text(uintaddr); + struct page *page; + + if (module && IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + page = vmalloc_to_page(addr); + else if (!module && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + page = virt_to_page(addr); + else + return addr; + + set_fixmap(fixmap, page_to_phys(page)); + + return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK)); +} + +static void __kprobes patch_unmap(int fixmap, unsigned long *flags) +{ + clear_fixmap(fixmap); +} + +void __kprobes __patch_text(void *addr, unsigned int insn) +{ + unsigned long flags; + void *waddr = addr; + int size; + + waddr = patch_map(addr, FIX_TEXT_POKE0, &flags); + *(u32 *)waddr = insn; + size = sizeof(u32); + flush_kernel_vmap_range(waddr, size); + patch_unmap(FIX_TEXT_POKE0, &flags); + flush_icache_range((uintptr_t)(addr), + (uintptr_t)(addr) + size); +} + +static int __kprobes patch_text_stop_machine(void *data) +{ + struct patch *patch = data; + + __patch_text(patch->addr, patch->insn); + + return 0; +} + +void __kprobes patch_text(void *addr, unsigned int insn) +{ + struct patch patch = { + .addr = addr, + .insn = insn, + }; + + stop_machine_cpuslocked(patch_text_stop_machine, &patch, NULL); +} -- cgit v1.2.3-59-g8ed1b From eacbfce19d8b6dbd7958cbe01d65a21324cc2fad Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 4 Apr 2019 21:14:10 +0200 Subject: parisc: add KGDB support This patch adds KGDB support to PA-RISC. It also implements single-stepping utilizing the recovery counter.
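With this in place, dropping into the debugger from kernel code is a one-liner. A sketch (example_debug_hook() is a made-up caller):

#include <linux/kgdb.h>

static void example_debug_hook(void)
{
	/* kgdb_breakpoint() executes the compiled-in break instruction
	 * (PARISC_KGDB_COMPILED_BREAK_INSN here), which the die notifier
	 * below routes into kgdb_handle_exception(). */
	if (kgdb_connected)	/* only trap if a debugger is attached */
		kgdb_breakpoint();
}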
Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/kgdb.h | 68 ++++++++++++++ arch/parisc/kernel/Makefile | 1 + arch/parisc/kernel/kgdb.c | 209 +++++++++++++++++++++++++++++++++++++++++ arch/parisc/kernel/traps.c | 17 ++++ 5 files changed, 296 insertions(+) create mode 100644 arch/parisc/include/asm/kgdb.h create mode 100644 arch/parisc/kernel/kgdb.c (limited to 'arch') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index c8e621296092..a80c19c7fc0e 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -54,6 +54,7 @@ config PARISC select CPU_NO_EFFICIENT_FFS select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH + select HAVE_ARCH_KGDB help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/kgdb.h b/arch/parisc/include/asm/kgdb.h new file mode 100644 index 000000000000..f23e7f8f13a5 --- /dev/null +++ b/arch/parisc/include/asm/kgdb.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PA-RISC KGDB support + * + * Copyright (c) 2019 Sven Schnelle + * + */ + +#ifndef __PARISC_KGDB_H__ +#define __PARISC_KGDB_H__ + +#define BREAK_INSTR_SIZE 4 +#define PARISC_KGDB_COMPILED_BREAK_INSN 0x3ffc01f +#define PARISC_KGDB_BREAK_INSN 0x3ffa01f + + +#define NUMREGBYTES sizeof(struct parisc_gdb_regs) +#define BUFMAX 4096 + +#define CACHE_FLUSH_IS_SAFE 1 + +#ifndef __ASSEMBLY__ + +static inline void arch_kgdb_breakpoint(void) +{ + asm(".word %0" : : "i"(PARISC_KGDB_COMPILED_BREAK_INSN) : "memory"); +} + +struct parisc_gdb_regs { + unsigned long gpr[32]; + unsigned long sar; + unsigned long iaoq_f; + unsigned long iasq_f; + unsigned long iaoq_b; + unsigned long iasq_b; + unsigned long eiem; + unsigned long iir; + unsigned long isr; + unsigned long ior; + unsigned long ipsw; + unsigned long __unused0; + unsigned long sr4; + unsigned long sr0; + unsigned long sr1; + unsigned long sr2; + unsigned long sr3; + unsigned long sr5; + unsigned long sr6; + unsigned long sr7; + unsigned long cr0; + unsigned long pid1; + unsigned long pid2; + unsigned long scrccr; + unsigned long pid3; + unsigned long pid4; + unsigned long cr24; + unsigned long cr25; + unsigned long cr26; + unsigned long cr27; + unsigned long cr28; + unsigned long cr29; + unsigned long cr30; + + u64 fr[32]; +}; + +#endif +#endif diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index d29bbd1a73b2..5012da96c196 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -33,3 +33,4 @@ obj-$(CONFIG_64BIT) += perf.o perf_asm.o $(obj64-y) obj-$(CONFIG_PARISC_CPU_TOPOLOGY) += topology.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/parisc/kernel/kgdb.c b/arch/parisc/kernel/kgdb.c new file mode 100644 index 000000000000..664278db9b97 --- /dev/null +++ b/arch/parisc/kernel/kgdb.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PA-RISC KGDB support + * + * Copyright (c) 2019 Sven Schnelle + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct kgdb_arch arch_kgdb_ops = { + .gdb_bpt_instr = { 0x03, 0xff, 0xa0, 0x1f } +}; + +static int __kgdb_notify(struct die_args *args, unsigned long cmd) +{ + struct pt_regs *regs = args->regs; + + if (kgdb_handle_exception(1, args->signr, cmd, regs)) + return NOTIFY_DONE; + return NOTIFY_STOP; +} + +static int kgdb_notify(struct notifier_block *self, + unsigned long 
cmd, void *ptr) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __kgdb_notify(ptr, cmd); + local_irq_restore(flags); + + return ret; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, + .priority = -INT_MAX, +}; + +int kgdb_arch_init(void) +{ + return register_die_notifier(&kgdb_notifier); +} + +void kgdb_arch_exit(void) +{ + unregister_die_notifier(&kgdb_notifier); +} + +void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + struct parisc_gdb_regs *gr = (struct parisc_gdb_regs *)gdb_regs; + + memset(gr, 0, sizeof(struct parisc_gdb_regs)); + + memcpy(gr->gpr, regs->gr, sizeof(gr->gpr)); + memcpy(gr->fr, regs->fr, sizeof(gr->fr)); + + gr->sr0 = regs->sr[0]; + gr->sr1 = regs->sr[1]; + gr->sr2 = regs->sr[2]; + gr->sr3 = regs->sr[3]; + gr->sr4 = regs->sr[4]; + gr->sr5 = regs->sr[5]; + gr->sr6 = regs->sr[6]; + gr->sr7 = regs->sr[7]; + + gr->sar = regs->sar; + gr->iir = regs->iir; + gr->isr = regs->isr; + gr->ior = regs->ior; + gr->ipsw = regs->ipsw; + gr->cr27 = regs->cr27; + + gr->iaoq_f = regs->iaoq[0]; + gr->iasq_f = regs->iasq[0]; + + gr->iaoq_b = regs->iaoq[1]; + gr->iasq_b = regs->iasq[1]; +} + +void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + struct parisc_gdb_regs *gr = (struct parisc_gdb_regs *)gdb_regs; + + + memcpy(regs->gr, gr->gpr, sizeof(regs->gr)); + memcpy(regs->fr, gr->fr, sizeof(regs->fr)); + + regs->sr[0] = gr->sr0; + regs->sr[1] = gr->sr1; + regs->sr[2] = gr->sr2; + regs->sr[3] = gr->sr3; + regs->sr[4] = gr->sr4; + regs->sr[5] = gr->sr5; + regs->sr[6] = gr->sr6; + regs->sr[7] = gr->sr7; + + regs->sar = gr->sar; + regs->iir = gr->iir; + regs->isr = gr->isr; + regs->ior = gr->ior; + regs->ipsw = gr->ipsw; + regs->cr27 = gr->cr27; + + regs->iaoq[0] = gr->iaoq_f; + regs->iasq[0] = gr->iasq_f; + + regs->iaoq[1] = gr->iaoq_b; + regs->iasq[1] = gr->iasq_b; +} + +void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, + struct task_struct *task) +{ + struct pt_regs *regs = task_pt_regs(task); + unsigned long gr30, iaoq; + + gr30 = regs->gr[30]; + iaoq = regs->iaoq[0]; + + regs->gr[30] = regs->ksp; + regs->iaoq[0] = regs->kpc; + pt_regs_to_gdb_regs(gdb_regs, regs); + + regs->gr[30] = gr30; + regs->iaoq[0] = iaoq; + +} + +static void step_instruction_queue(struct pt_regs *regs) +{ + regs->iaoq[0] = regs->iaoq[1]; + regs->iaoq[1] += 4; +} + +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->iaoq[0] = ip; + regs->iaoq[1] = ip + 4; +} + +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int ret = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); + if (ret) + return ret; + + __patch_text((void *)bpt->bpt_addr, + *(unsigned int *)&arch_kgdb_ops.gdb_bpt_instr); + return ret; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + __patch_text((void *)bpt->bpt_addr, *(unsigned int *)&bpt->saved_instr); + return 0; +} + +int kgdb_arch_handle_exception(int trap, int signo, + int err_code, char *inbuf, char *outbuf, + struct pt_regs *regs) +{ + unsigned long addr; + char *p = inbuf + 1; + + switch (inbuf[0]) { + case 'D': + case 'c': + case 'k': + kgdb_contthread = NULL; + kgdb_single_step = 0; + + if (kgdb_hex2long(&p, &addr)) + kgdb_arch_set_pc(regs, addr); + else if (trap == 9 && regs->iir == + PARISC_KGDB_COMPILED_BREAK_INSN) + step_instruction_queue(regs); + return 0; + case 's': + kgdb_single_step = 1; + if (kgdb_hex2long(&p, &addr)) { + kgdb_arch_set_pc(regs, addr); + } else if (trap == 9 && regs->iir == + 
PARISC_KGDB_COMPILED_BREAK_INSN) { + step_instruction_queue(regs); + mtctl(-1, 0); + } else { + mtctl(0, 0); + } + regs->gr[0] |= PSW_R; + return 0; + + } + return -1; +} diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 7e1ccafadf57..4a0516524f2a 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "../math-emu/math-emu.h" /* for handle_fpe() */ @@ -293,6 +294,14 @@ static void handle_break(struct pt_regs *regs) (tt == BUG_TRAP_TYPE_NONE) ? 9 : 0); } +#ifdef CONFIG_KGDB + if (unlikely(iir == PARISC_KGDB_COMPILED_BREAK_INSN || + iir == PARISC_KGDB_BREAK_INSN)) { + kgdb_handle_exception(9, SIGTRAP, 0, regs); + return; + } +#endif + if (unlikely(iir != GDB_BREAK_INSN)) parisc_printk_ratelimited(0, regs, KERN_DEBUG "break %d,%d: pid=%d command='%s'\n", @@ -518,6 +527,14 @@ void notrace handle_interruption(int code, struct pt_regs *regs) case 3: /* Recovery counter trap */ regs->gr[0] &= ~PSW_R; + +#ifdef CONFIG_KGDB + if (kgdb_single_step) { + kgdb_handle_exception(0, SIGTRAP, 0, regs); + return; + } +#endif + if (user_space(regs)) handle_gdb_break(regs, TRAP_TRACE); /* else this must be the start of a syscall - just let it run */ -- cgit v1.2.3-59-g8ed1b From 3e1120f4b57bc12437048494ab56648edaa5b57d Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 6 Apr 2019 16:45:14 +0200 Subject: parisc: Export running_on_qemu symbol for modules Signed-off-by: Helge Deller CC: stable@vger.kernel.org # v4.9+ --- arch/parisc/kernel/process.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 841db71958cd..97c206734e24 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -193,6 +193,7 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r) */ int running_on_qemu __read_mostly; +EXPORT_SYMBOL(running_on_qemu); void __cpuidle arch_cpu_idle_dead(void) { -- cgit v1.2.3-59-g8ed1b From 82d96bf68e6c2101dfbba2984cd9565643f5d8cb Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 7 Apr 2019 01:09:23 +0200 Subject: parisc: PA-Linux requires at least 32 MB RAM Even a 32-bit kernel requires at least 27 MB to decompress itself, so halt the system with a message if the system has less memory than 32 MB. Signed-off-by: Helge Deller --- arch/parisc/boot/compressed/misc.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c index 2556bb181813..2d395998f524 100644 --- a/arch/parisc/boot/compressed/misc.c +++ b/arch/parisc/boot/compressed/misc.c @@ -145,14 +145,13 @@ static int putchar(int c) void __noreturn error(char *x) { - puts("\n\n"); - puts(x); - puts("\n\n -- System halted"); + if (x) puts(x); + puts("\n -- System halted\n"); while (1) /* wait forever */ ; } -static int print_hex(unsigned long num) +static int print_num(unsigned long num, int base) { const char hex[] = "0123456789abcdef"; char str[40]; @@ -160,12 +159,14 @@ static int print_hex(unsigned long num) str[i--] = '\0'; do { - str[i--] = hex[num & 0x0f]; - num >>= 4; + str[i--] = hex[num % base]; + num = num / base; } while (num); - str[i--] = 'x'; - str[i] = '0'; + if (base == 16) { + str[i--] = 'x'; + str[i] = '0'; + } else i++; puts(&str[i]); return 0; @@ -187,8 +188,9 @@ put: if (fmt[++i] == '%') goto put; + print_num(va_arg(args, unsigned long), + fmt[i] == 'x' ? 
16:10); ++i; - print_hex(va_arg(args, unsigned long)); } va_end(args); @@ -327,8 +329,15 @@ unsigned long decompress_kernel(unsigned int started_wide, free_mem_end_ptr = rd_start; #endif - if (free_mem_ptr >= free_mem_end_ptr) - error("Kernel too big for machine."); + if (free_mem_ptr >= free_mem_end_ptr) { + int free_ram; + free_ram = (free_mem_ptr >> 20) + 1; + if (free_ram < 32) + free_ram = 32; + printf("\nKernel requires at least %d MB RAM.\n", + free_ram); + error(NULL); + } #ifdef DEBUG printf("\n"); -- cgit v1.2.3-59-g8ed1b From ea1afe339a2b1260aa31a4d100155d4403446704 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Sun, 7 Apr 2019 20:10:57 +0200 Subject: parisc: add functions required by KPROBE_EVENTS implement regs_get_register(), regs_get_kernel_stack_nth() and regs_within_kernel_stack() Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/include/asm/ptrace.h | 13 +++++++++++++ arch/parisc/kernel/ptrace.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'arch') diff --git a/arch/parisc/include/asm/ptrace.h b/arch/parisc/include/asm/ptrace.h index 9ff033d261ab..143fb2a89dd8 100644 --- a/arch/parisc/include/asm/ptrace.h +++ b/arch/parisc/include/asm/ptrace.h @@ -37,4 +37,17 @@ extern int regs_query_register_offset(const char *name); extern const char *regs_query_register_name(unsigned int offset); #define MAX_REG_OFFSET (offsetof(struct pt_regs, ipsw)) +#define kernel_stack_pointer(regs) ((regs)->gr[30]) + +static inline unsigned long regs_get_register(struct pt_regs *regs, + unsigned int offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + return *(unsigned long *)((unsigned long)regs + offset); +} + +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); +int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr); + #endif diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 0964c236e3e5..a3d2fb4e6dd2 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -789,3 +789,38 @@ const char *regs_query_register_name(unsigned int offset) return roff->name; return NULL; } + +/** + * regs_within_kernel_stack() - check the address in the stack + * @regs: pt_regs which contains kernel stack pointer. + * @addr: address which is checked. + * + * regs_within_kernel_stack() checks @addr is within the kernel stack page(s). + * If @addr is within the kernel stack, it returns true. If not, returns false. + */ +int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + return ((addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specified by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + + addr -= n; + + if (!regs_within_kernel_stack(regs, (unsigned long)addr)) + return 0; + + return *addr; +} -- cgit v1.2.3-59-g8ed1b From 8858ac8e9e9b1894f7bb218bc0035532371b8d7e Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Sun, 7 Apr 2019 20:10:58 +0200 Subject: parisc: Implement kprobes Implement kprobes support for PA-RISC. 
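As a usage illustration, a minimal, hypothetical probe module built on the generic kprobes API (the probed symbol is only an example, not part of this patch):

/* Hypothetical sketch: when the probe fires, handle_break() sees
 * PARISC_KPROBES_BREAK_INSN and calls parisc_kprobe_break_handler(),
 * which runs this pre-handler and then single-steps the saved copy
 * of the original instruction.
 */
#include <linux/kprobes.h>
#include <linux/module.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %px, iaoq[0]=%#lx\n", p->addr, regs->iaoq[0]);
	return 0;	/* 0: continue with single-stepping */
}

static struct kprobe example_kp = {
	.symbol_name	= "do_sys_open",	/* illustrative target */
	.pre_handler	= example_pre,
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");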
Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/kprobes.h | 55 ++++++++++++ arch/parisc/kernel/Makefile | 1 + arch/parisc/kernel/kprobes.c | 183 ++++++++++++++++++++++++++++++++++++++ arch/parisc/kernel/traps.c | 14 +++ 5 files changed, 254 insertions(+) create mode 100644 arch/parisc/include/asm/kprobes.h create mode 100644 arch/parisc/kernel/kprobes.c (limited to 'arch') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index a80c19c7fc0e..7712688608f4 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -55,6 +55,7 @@ config PARISC select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH select HAVE_ARCH_KGDB + select HAVE_KPROBES help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/kprobes.h b/arch/parisc/include/asm/kprobes.h new file mode 100644 index 000000000000..e09cf2deeafe --- /dev/null +++ b/arch/parisc/include/asm/kprobes.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * arch/parisc/include/asm/kprobes.h + * + * PA-RISC kprobes implementation + * + * Copyright (c) 2019 Sven Schnelle + */ + +#ifndef _PARISC_KPROBES_H +#define _PARISC_KPROBES_H + +#ifdef CONFIG_KPROBES + +#include +#include +#include +#include + +#define PARISC_KPROBES_BREAK_INSN 0x3ff801f +#define __ARCH_WANT_KPROBES_INSN_SLOT +#define MAX_INSN_SIZE 1 + +typedef u32 kprobe_opcode_t; +struct kprobe; + +void arch_remove_kprobe(struct kprobe *p); + +#define flush_insn_slot(p) \ + flush_icache_range((unsigned long)&(p)->ainsn.insn[0], \ + (unsigned long)&(p)->ainsn.insn[0] + \ + sizeof(kprobe_opcode_t)) + +#define kretprobe_blacklist_size 0 + +struct arch_specific_insn { + kprobe_opcode_t *insn; +}; + +struct prev_kprobe { + struct kprobe *kp; + unsigned long status; +}; + +struct kprobe_ctlblk { + unsigned int kprobe_status; + struct prev_kprobe prev_kprobe; + unsigned long iaoq[2]; +}; + +int __kprobes parisc_kprobe_break_handler(struct pt_regs *regs); +int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs); + +#endif /* CONFIG_KPROBES */ +#endif /* _PARISC_KPROBES_H */ diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index 5012da96c196..b818b28c8a99 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_PARISC_CPU_TOPOLOGY) += topology.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KGDB) += kgdb.o +obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c new file mode 100644 index 000000000000..8b1977cd3eb9 --- /dev/null +++ b/arch/parisc/kernel/kprobes.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arch/parisc/kernel/kprobes.c + * + * PA-RISC kprobes implementation + * + * Copyright (c) 2019 Sven Schnelle + */ + +#include +#include +#include +#include +#include + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + if ((unsigned long)p->addr & 3UL) + return -EINVAL; + + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + + memcpy(p->ainsn.insn, p->addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; + flush_insn_slot(p); + return 0; +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + if (!p->ainsn.insn) + return; + + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; +} + +void __kprobes 
arch_arm_kprobe(struct kprobe *p) +{ + patch_text(p->addr, PARISC_KPROBES_BREAK_INSN); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + patch_text(p->addr, p->opcode); +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; +} + +static inline void __kprobes set_current_kprobe(struct kprobe *p) +{ + __this_cpu_write(current_kprobe, p); +} + +static void __kprobes setup_singlestep(struct kprobe *p, + struct kprobe_ctlblk *kcb, struct pt_regs *regs) +{ + kcb->iaoq[0] = regs->iaoq[0]; + kcb->iaoq[1] = regs->iaoq[1]; + regs->iaoq[0] = (unsigned long)p->ainsn.insn; + mtctl(0, 0); + regs->gr[0] |= PSW_R; +} + +int __kprobes parisc_kprobe_break_handler(struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + preempt_disable(); + + kcb = get_kprobe_ctlblk(); + p = get_kprobe((unsigned long *)regs->iaoq[0]); + + if (!p) { + preempt_enable_no_resched(); + return 0; + } + + if (kprobe_running()) { + /* + * We have reentered the kprobe_handler, since another kprobe + * was hit while within the handler, we save the original + * kprobes and single step on the instruction of the new probe + * without calling any user handlers to avoid recursive + * kprobes. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p); + kprobes_inc_nmissed_count(p); + setup_singlestep(p, kcb, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } + + set_current_kprobe(p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* If we have no pre-handler or it returned 0, we continue with + * normal processing. If we have a pre-handler and it returned + * non-zero - which means user handler setup registers to exit + * to another instruction, we must skip the single stepping. + */ + + if (!p->pre_handler || !p->pre_handler(p, regs)) { + setup_singlestep(p, kcb, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + } else { + reset_current_kprobe(); + preempt_enable_no_resched(); + } + return 1; +} + +int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct kprobe *p = kprobe_running(); + + if (regs->iaoq[0] != (unsigned long)p->ainsn.insn+4) + return 0; + + /* restore back original saved kprobe variables and continue */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + return 1; + } + + /* for absolute branch instructions we can copy iaoq_b. for relative + * branch instructions we need to calculate the new address based on the + * difference between iaoq_f and iaoq_b. We cannot use iaoq_b without + * modificationt because it's based on our ainsn.insn address. 
+ */ + + if (p->post_handler) + p->post_handler(p, regs, 0); + + switch (regs->iir >> 26) { + case 0x38: /* BE */ + case 0x39: /* BE,L */ + case 0x3a: /* BV */ + case 0x3b: /* BVE */ + /* for absolute branches, regs->iaoq[1] has already the right + * address + */ + regs->iaoq[0] = kcb->iaoq[1]; + break; + default: + regs->iaoq[1] = kcb->iaoq[0]; + regs->iaoq[1] += (regs->iaoq[1] - regs->iaoq[0]) + 4; + regs->iaoq[0] = kcb->iaoq[1]; + break; + } + kcb->kprobe_status = KPROBE_HIT_SSDONE; + reset_current_kprobe(); + return 1; +} + +bool arch_kprobe_on_func_entry(unsigned long offset) +{ + return !offset; +} + +int __init arch_init_kprobes(void) +{ + return 0; +} diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 4a0516524f2a..096e319adeb3 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "../math-emu/math-emu.h" /* for handle_fpe() */ @@ -294,6 +295,14 @@ static void handle_break(struct pt_regs *regs) (tt == BUG_TRAP_TYPE_NONE) ? 9 : 0); } +#ifdef CONFIG_KPROBES + if (unlikely(iir == PARISC_KPROBES_BREAK_INSN)) { + parisc_kprobe_break_handler(regs); + return; + } + +#endif + #ifdef CONFIG_KGDB if (unlikely(iir == PARISC_KGDB_COMPILED_BREAK_INSN || iir == PARISC_KGDB_BREAK_INSN)) { @@ -528,6 +537,11 @@ void notrace handle_interruption(int code, struct pt_regs *regs) /* Recovery counter trap */ regs->gr[0] &= ~PSW_R; +#ifdef CONFIG_KPROBES + if (parisc_kprobe_ss_handler(regs)) + return; +#endif + #ifdef CONFIG_KGDB if (kgdb_single_step) { kgdb_handle_exception(0, SIGTRAP, 0, regs); -- cgit v1.2.3-59-g8ed1b From 1253d18d2d51f79c74421fc4fe12fb4213fdd6ea Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 9 Apr 2019 19:30:27 +0200 Subject: parisc: remove kprobes.h from generic-y We're providing our own version now. Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/include/asm/Kbuild | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 9bcd0c903dbb..630a179363c2 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -10,7 +10,6 @@ generic-y += hw_irq.h generic-y += irq_regs.h generic-y += irq_work.h generic-y += kdebug.h -generic-y += kprobes.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h -- cgit v1.2.3-59-g8ed1b From e0b59b7b633ab72c08bb9bac36afaeaade011ddf Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 9 Apr 2019 19:30:28 +0200 Subject: parisc: Implement kretprobes Implement kretprobes on parisc, parts stolen from powerpc. 
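A short, hypothetical usage sketch (probed symbol and maxactive value are illustrative): arch_prepare_kretprobe() swaps the return address in %r2 for the trampoline, so the handler below runs when the probed function returns.

/* Hypothetical sketch.  On PA-RISC the probed function's return
 * value is in %r28 (ret0) when the trampoline handler runs.
 */
#include <linux/kprobes.h>

static int example_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	pr_info("probed function returned %lu\n", regs->gr[28]);
	return 0;
}

static struct kretprobe example_rp = {
	.handler	= example_ret,
	.kp.symbol_name	= "do_sys_open",	/* illustrative target */
	.maxactive	= 16,	/* concurrent instances to track */
};

/* register with register_kretprobe(&example_rp),
 * tear down with unregister_kretprobe(&example_rp).
 */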
Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 1 + arch/parisc/kernel/kprobes.c | 110 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 7712688608f4..c8038165b81f 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -56,6 +56,7 @@ config PARISC select NEED_SG_DMA_LENGTH select HAVE_ARCH_KGDB select HAVE_KPROBES + select HAVE_KRETPROBES help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 8b1977cd3eb9..d58960b33bda 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -172,6 +172,112 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs) return 1; } +static inline void kretprobe_trampoline(void) +{ + asm volatile("nop"); + asm volatile("nop"); +} + +static int __kprobes trampoline_probe_handler(struct kprobe *p, + struct pt_regs *regs); + +static struct kprobe trampoline_p = { + .pre_handler = trampoline_probe_handler +}; + +static int __kprobes trampoline_probe_handler(struct kprobe *p, + struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address = (unsigned long)trampoline_p.addr; + kprobe_opcode_t *correct_ret_addr = NULL; + + INIT_HLIST_HEAD(&empty_rp); + kretprobe_hash_lock(current, &head, &flags); + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * a return probe installed on them, and/or more than one return + * probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + correct_ret_addr = ri->ret_addr; + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; + if (ri->rp && ri->rp->handler) { + __this_cpu_write(current_kprobe, &ri->rp->kp); + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->ret_addr = correct_ret_addr; + ri->rp->handler(ri, regs); + __this_cpu_write(current_kprobe, NULL); + } + + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. 
Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_hash_unlock(current, &flags); + + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + instruction_pointer_set(regs, orig_ret_address); + return 1; +} + +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + ri->ret_addr = (kprobe_opcode_t *)regs->gr[2]; + + /* Replace the return addr with trampoline addr. */ + regs->gr[2] = (unsigned long)trampoline_p.addr; +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + return p->addr == trampoline_p.addr; +} bool arch_kprobe_on_func_entry(unsigned long offset) { return !offset; @@ -179,5 +285,7 @@ bool arch_kprobe_on_func_entry(unsigned long offset) int __init arch_init_kprobes(void) { - return 0; + trampoline_p.addr = (kprobe_opcode_t *) + dereference_function_descriptor(kretprobe_trampoline); + return register_kprobe(&trampoline_p); } -- cgit v1.2.3-59-g8ed1b From ea5a8c620ffb341b2d1a02755f9e87375523c0e5 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 9 Apr 2019 19:30:30 +0200 Subject: parisc: remove unused flags parameter in __patch_text() It's not used by patch_map()/patch_unmap(), so lets remove it. Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/kernel/patch.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c index 8c5ee684cb16..cdcd981278b3 100644 --- a/arch/parisc/kernel/patch.c +++ b/arch/parisc/kernel/patch.c @@ -20,7 +20,7 @@ struct patch { unsigned int insn; }; -static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) +static void __kprobes *patch_map(void *addr, int fixmap) { unsigned long uintaddr = (uintptr_t) addr; bool module = !core_kernel_text(uintaddr); @@ -38,22 +38,21 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK)); } -static void __kprobes patch_unmap(int fixmap, unsigned long *flags) +static void __kprobes patch_unmap(int fixmap) { clear_fixmap(fixmap); } void __kprobes __patch_text(void *addr, unsigned int insn) { - unsigned long flags; void *waddr = addr; int size; - waddr = patch_map(addr, FIX_TEXT_POKE0, &flags); + waddr = patch_map(addr, FIX_TEXT_POKE0); *(u32 *)waddr = insn; size = sizeof(u32); flush_kernel_vmap_range(waddr, size); - patch_unmap(FIX_TEXT_POKE0, &flags); + patch_unmap(FIX_TEXT_POKE0); flush_icache_range((uintptr_t)(addr), (uintptr_t)(addr) + size); } -- cgit v1.2.3-59-g8ed1b From 0e4db23e12b704c5aeb547b69a62e5869dc1bcb8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 9 Apr 2019 22:22:29 +0200 Subject: parisc: Show n/a if product number not available Signed-off-by: Helge Deller --- arch/parisc/kernel/processor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 7f4d042856b5..e0a81dedc366 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -305,7 +305,8 @@ void __init collect_boot_cpu_data(void) if (pdc_model_platform_info(orig_prod_num, current_prod_num, serial_no) == PDC_OK) { printk(KERN_INFO "product %s, original product %s, S/N: %s\n", - current_prod_num, orig_prod_num, serial_no); + current_prod_num[0] ? 
current_prod_num : "n/a", + orig_prod_num, serial_no); add_device_randomness(orig_prod_num, strlen(orig_prod_num)); add_device_randomness(current_prod_num, strlen(current_prod_num)); add_device_randomness(serial_no, strlen(serial_no)); -- cgit v1.2.3-59-g8ed1b From 6b1370ae392b79ac3e2d053205f4107d4a55c102 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 18 Apr 2019 11:33:13 +0200 Subject: parisc: enable wide mode early The idle task might have been allocated above 4GB. With the current code we cannot access that memory because the CPU is still running in narrow mode. This was found on a J5000 machine and the patch is required to enable SPARSEMEM on that machine. Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/kernel/head.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index fbb4e43fda05..fefaba2097b5 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -329,6 +329,19 @@ smp_slave_stext: mtsp %r0,%sr6 mtsp %r0,%sr7 +#ifdef CONFIG_64BIT + /* + * Enable Wide mode early, in case the task_struct for the idle + * task in smp_init_current_idle_task was allocated above 4GB. + */ +1: mfia %rp /* clear upper part of pcoq */ + ldo 2f-1b(%rp),%rp + depdi 0,31,32,%rp + bv (%rp) + ssm PSW_SM_W,%r0 +2: +#endif + /* Initialize the SP - monarch sets up smp_init_current_idle_task */ load32 PA(smp_init_current_idle_task),%sp LDREG 0(%sp),%sp /* load task address */ -- cgit v1.2.3-59-g8ed1b From dbdf0760990583649bfaca75fd98f76afd5f3905 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 9 Apr 2019 21:52:35 +0200 Subject: parisc: Switch from DISCONTIGMEM to SPARSEMEM The commit 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") breaks memory management on a parisc c8000 workstation with this memory layout: 0) Start 0x0000000000000000 End 0x000000003fffffff Size 1024 MB 1) Start 0x0000000100000000 End 0x00000001bfdfffff Size 3070 MB 2) Start 0x0000004040000000 End 0x00000040ffffffff Size 3072 MB With the patch 1c30844d2dfe, the kernel will incorrectly reclaim the first zone when it fills up, ignoring the fact that there are two completely free zones. Basically, it limits cache size to 1GiB. The parisc kernel is currently using the DISCONTIGMEM implementation, but isn't NUMA. Avoid this issue or strange work-arounds by switching to the more commonly used SPARSEMEM implementation.
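For reference, an illustrative sketch (names hypothetical) of what the SPARSEMEM parameters chosen in the new asm/sparsemem.h below mean for the layout quoted above:

/* Illustrative only: SECTION_SIZE_BITS = 27 gives 128 MB sections,
 * and MAX_PHYSMEM_BITS = 39 caps the physical address space at
 * 512 GB, i.e. at most 4096 sections.  Each physical range simply
 * becomes a run of present sections; no pfnnid_map bookkeeping.
 */
#define EXAMPLE_SECTION_SIZE_BITS	27
#define EXAMPLE_MAX_PHYSMEM_BITS	39
#define EXAMPLE_NR_SECTIONS \
	(1UL << (EXAMPLE_MAX_PHYSMEM_BITS - EXAMPLE_SECTION_SIZE_BITS))

static inline unsigned long example_phys_to_section(unsigned long paddr)
{
	return paddr >> EXAMPLE_SECTION_SIZE_BITS; /* 0x4040000000 -> 2056 */
}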
Reported-by: Mikulas Patocka Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 12 ++--- arch/parisc/include/asm/mmzone.h | 58 +------------------- arch/parisc/include/asm/page.h | 4 +- arch/parisc/include/asm/sparsemem.h | 14 +++++ arch/parisc/kernel/parisc_ksyms.c | 6 --- arch/parisc/mm/init.c | 102 +++++++++++++++++------------------- 6 files changed, 68 insertions(+), 128 deletions(-) create mode 100644 arch/parisc/include/asm/sparsemem.h (limited to 'arch') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index c8038165b81f..26c215570adf 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -36,6 +36,7 @@ config PARISC select GENERIC_STRNCPY_FROM_USER select SYSCTL_ARCH_UNALIGN_ALLOW select SYSCTL_EXCEPTION_TRACE + select ARCH_DISCARD_MEMBLOCK select HAVE_MOD_ARCH_SPECIFIC select VIRT_TO_BUS select MODULES_USE_ELF_RELA @@ -314,21 +315,16 @@ config ARCH_SELECT_MEMORY_MODEL def_bool y depends on 64BIT -config ARCH_DISCONTIGMEM_ENABLE +config ARCH_SPARSEMEM_ENABLE def_bool y depends on 64BIT config ARCH_FLATMEM_ENABLE def_bool y -config ARCH_DISCONTIGMEM_DEFAULT +config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE - -config NODES_SHIFT - int - default "3" - depends on NEED_MULTIPLE_NODES + depends on ARCH_SPARSEMEM_ENABLE source "kernel/Kconfig.hz" diff --git a/arch/parisc/include/asm/mmzone.h b/arch/parisc/include/asm/mmzone.h index fafa3893fd70..8d390406d862 100644 --- a/arch/parisc/include/asm/mmzone.h +++ b/arch/parisc/include/asm/mmzone.h @@ -2,62 +2,6 @@ #ifndef _PARISC_MMZONE_H #define _PARISC_MMZONE_H -#define MAX_PHYSMEM_RANGES 8 /* Fix the size for now (current known max is 3) */ +#define MAX_PHYSMEM_RANGES 4 /* Fix the size for now (current known max is 3) */ -#ifdef CONFIG_DISCONTIGMEM - -extern int npmem_ranges; - -struct node_map_data { - pg_data_t pg_data; -}; - -extern struct node_map_data node_data[]; - -#define NODE_DATA(nid) (&node_data[nid].pg_data) - -/* We have these possible memory map layouts: - * Astro: 0-3.75, 67.75-68, 4-64 - * zx1: 0-1, 257-260, 4-256 - * Stretch (N-class): 0-2, 4-32, 34-xxx - */ - -/* Since each 1GB can only belong to one region (node), we can create - * an index table for pfn to nid lookup; each entry in pfnnid_map - * represents 1GB, and contains the node that the memory belongs to. 
*/ - -#define PFNNID_SHIFT (30 - PAGE_SHIFT) -#define PFNNID_MAP_MAX 512 /* support 512GB */ -extern signed char pfnnid_map[PFNNID_MAP_MAX]; - -#ifndef CONFIG_64BIT -#define pfn_is_io(pfn) ((pfn & (0xf0000000UL >> PAGE_SHIFT)) == (0xf0000000UL >> PAGE_SHIFT)) -#else -/* io can be 0xf0f0f0f0f0xxxxxx or 0xfffffffff0000000 */ -#define pfn_is_io(pfn) ((pfn & (0xf000000000000000UL >> PAGE_SHIFT)) == (0xf000000000000000UL >> PAGE_SHIFT)) -#endif - -static inline int pfn_to_nid(unsigned long pfn) -{ - unsigned int i; - - if (unlikely(pfn_is_io(pfn))) - return 0; - - i = pfn >> PFNNID_SHIFT; - BUG_ON(i >= ARRAY_SIZE(pfnnid_map)); - - return pfnnid_map[i]; -} - -static inline int pfn_valid(int pfn) -{ - int nid = pfn_to_nid(pfn); - - if (nid >= 0) - return (pfn < node_end_pfn(nid)); - return 0; -} - -#endif #endif /* _PARISC_MMZONE_H */ diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index b77f49ce6220..93caf17ac5e2 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -147,9 +147,9 @@ extern int npmem_ranges; #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_SPARSEMEM #define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_DISCONTIGMEM */ +#endif #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PMD_SHIFT /* fixed for transparent huge pages */ diff --git a/arch/parisc/include/asm/sparsemem.h b/arch/parisc/include/asm/sparsemem.h new file mode 100644 index 000000000000..b5c3a79045b4 --- /dev/null +++ b/arch/parisc/include/asm/sparsemem.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ASM_PARISC_SPARSEMEM_H +#define ASM_PARISC_SPARSEMEM_H + +/* We have these possible memory map layouts: + * Astro: 0-3.75, 67.75-68, 4-64 + * zx1: 0-1, 257-260, 4-256 + * Stretch (N-class): 0-2, 4-32, 34-xxx + */ + +#define MAX_PHYSMEM_BITS 39 /* 512 GB */ +#define SECTION_SIZE_BITS 27 /* 128 MB */ + +#endif diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index 7baa2265d439..174213b1716e 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -138,12 +138,6 @@ extern void $$dyncall(void); EXPORT_SYMBOL($$dyncall); #endif -#ifdef CONFIG_DISCONTIGMEM -#include -EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(pfnnid_map); -#endif - #ifdef CONFIG_FUNCTION_TRACER extern void _mcount(void); EXPORT_SYMBOL(_mcount); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index b2b52de2b82b..513f747b0d9d 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -32,6 +32,7 @@ #include #include #include +#include extern int data_start; extern void parisc_kernel_start(void); /* Kernel entry point in head.S */ @@ -48,11 +49,6 @@ pmd_t pmd0[PTRS_PER_PMD] __attribute__ ((__section__ (".data..vm0.pmd"), aligned pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__ ((__section__ (".data..vm0.pgd"), aligned(PAGE_SIZE))); pte_t pg0[PT_INITIAL * PTRS_PER_PTE] __attribute__ ((__section__ (".data..vm0.pte"), aligned(PAGE_SIZE))); -#ifdef CONFIG_DISCONTIGMEM -struct node_map_data node_data[MAX_NUMNODES] __read_mostly; -signed char pfnnid_map[PFNNID_MAP_MAX] __read_mostly; -#endif - static struct resource data_resource = { .name = "Kernel data", .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, @@ -76,11 +72,11 @@ static struct resource sysram_resources[MAX_PHYSMEM_RANGES] __read_mostly; * information retrieved in kernel/inventory.c. 
*/ -physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES] __read_mostly; -int npmem_ranges __read_mostly; +physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES] __initdata; +int npmem_ranges __initdata; #ifdef CONFIG_64BIT -#define MAX_MEM (~0UL) +#define MAX_MEM (1UL << MAX_PHYSMEM_BITS) #else /* !CONFIG_64BIT */ #define MAX_MEM (3584U*1024U*1024U) #endif /* !CONFIG_64BIT */ @@ -119,7 +115,7 @@ static void __init mem_limit_func(void) static void __init setup_bootmem(void) { unsigned long mem_max; -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_SPARSEMEM physmem_range_t pmem_holes[MAX_PHYSMEM_RANGES - 1]; int npmem_holes; #endif @@ -137,23 +133,20 @@ static void __init setup_bootmem(void) int j; for (j = i; j > 0; j--) { - unsigned long tmp; + physmem_range_t tmp; if (pmem_ranges[j-1].start_pfn < pmem_ranges[j].start_pfn) { break; } - tmp = pmem_ranges[j-1].start_pfn; - pmem_ranges[j-1].start_pfn = pmem_ranges[j].start_pfn; - pmem_ranges[j].start_pfn = tmp; - tmp = pmem_ranges[j-1].pages; - pmem_ranges[j-1].pages = pmem_ranges[j].pages; - pmem_ranges[j].pages = tmp; + tmp = pmem_ranges[j-1]; + pmem_ranges[j-1] = pmem_ranges[j]; + pmem_ranges[j] = tmp; } } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_SPARSEMEM /* * Throw out ranges that are too far apart (controlled by * MAX_GAP). @@ -165,7 +158,7 @@ static void __init setup_bootmem(void) pmem_ranges[i-1].pages) > MAX_GAP) { npmem_ranges = i; printk("Large gap in memory detected (%ld pages). " - "Consider turning on CONFIG_DISCONTIGMEM\n", + "Consider turning on CONFIG_SPARSEMEM\n", pmem_ranges[i].start_pfn - (pmem_ranges[i-1].start_pfn + pmem_ranges[i-1].pages)); @@ -230,9 +223,8 @@ static void __init setup_bootmem(void) printk(KERN_INFO "Total Memory: %ld MB\n",mem_max >> 20); -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_SPARSEMEM /* Merge the ranges, keeping track of the holes */ - { unsigned long end_pfn; unsigned long hole_pages; @@ -255,18 +247,6 @@ static void __init setup_bootmem(void) } #endif -#ifdef CONFIG_DISCONTIGMEM - for (i = 0; i < MAX_PHYSMEM_RANGES; i++) { - memset(NODE_DATA(i), 0, sizeof(pg_data_t)); - } - memset(pfnnid_map, 0xff, sizeof(pfnnid_map)); - - for (i = 0; i < npmem_ranges; i++) { - node_set_state(i, N_NORMAL_MEMORY); - node_set_online(i); - } -#endif - /* * Initialize and free the full range of memory in each range. */ @@ -314,7 +294,7 @@ static void __init setup_bootmem(void) memblock_reserve(__pa(KERNEL_BINARY_TEXT_START), (unsigned long)(_end - KERNEL_BINARY_TEXT_START)); -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_SPARSEMEM /* reserve the holes */ @@ -360,6 +340,9 @@ static void __init setup_bootmem(void) /* Initialize Page Deallocation Table (PDT) and check for bad memory. 
*/ pdc_pdt_init(); + + memblock_allow_resize(); + memblock_dump_all(); } static int __init parisc_text_address(unsigned long vaddr) @@ -713,37 +696,46 @@ static void __init gateway_init(void) PAGE_SIZE, PAGE_GATEWAY, 1); } -void __init paging_init(void) +static void __init parisc_bootmem_free(void) { + unsigned long zones_size[MAX_NR_ZONES] = { 0, }; + unsigned long holes_size[MAX_NR_ZONES] = { 0, }; + unsigned long mem_start_pfn = ~0UL, mem_end_pfn = 0, mem_size_pfn = 0; int i; + for (i = 0; i < npmem_ranges; i++) { + unsigned long start = pmem_ranges[i].start_pfn; + unsigned long size = pmem_ranges[i].pages; + unsigned long end = start + size; + + if (mem_start_pfn > start) + mem_start_pfn = start; + if (mem_end_pfn < end) + mem_end_pfn = end; + mem_size_pfn += size; + } + + zones_size[0] = mem_end_pfn - mem_start_pfn; + holes_size[0] = zones_size[0] - mem_size_pfn; + + free_area_init_node(0, zones_size, mem_start_pfn, holes_size); +} + +void __init paging_init(void) +{ setup_bootmem(); pagetable_init(); gateway_init(); flush_cache_all_local(); /* start with known state */ flush_tlb_all_local(NULL); - for (i = 0; i < npmem_ranges; i++) { - unsigned long zones_size[MAX_NR_ZONES] = { 0, }; - - zones_size[ZONE_NORMAL] = pmem_ranges[i].pages; - -#ifdef CONFIG_DISCONTIGMEM - /* Need to initialize the pfnnid_map before we can initialize - the zone */ - { - int j; - for (j = (pmem_ranges[i].start_pfn >> PFNNID_SHIFT); - j <= ((pmem_ranges[i].start_pfn + pmem_ranges[i].pages) >> PFNNID_SHIFT); - j++) { - pfnnid_map[j] = i; - } - } -#endif - - free_area_init_node(i, zones_size, - pmem_ranges[i].start_pfn, NULL); - } + /* + * Mark all memblocks as present for sparsemem using + * memory_present() and then initialize sparsemem. + */ + memblocks_present(); + sparse_init(); + parisc_bootmem_free(); } #ifdef CONFIG_PA20 -- cgit v1.2.3-59-g8ed1b From 6c63ef80014b60ab13bee7e683d0a95521bb7fdd Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Fri, 12 Apr 2019 19:12:04 -0400 Subject: parisc: Remove lock code to serialize TLB operations in pacache.S TLB operations only need to be serialized on machines with the Merced (Stretch) bus. The only machines in this category are L and N class, and they require a 64-bit PA 2.0 kernel. On these machines, we use local TLB purges in the tmpalias routines. We don't need to serialize TLB purges on all other machines. Thus, the lock/unlock code can be removed when CONFIG_PA20 is not defined. Further, when CONFIG_PA20 is not defined, alternative patching converts the TLB purges to local purges when PA 2.0 hardware has been detected. Signed-off-by: John David Anglin Tested-By: Sven Schnelle Signed-off-by: Helge Deller --- arch/parisc/kernel/pacache.S | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index 187f032c9dd8..4e4e8eb25874 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -311,39 +311,6 @@ fdsync: nop ENDPROC_CFI(flush_data_cache_local) -/* Macros to serialize TLB purge operations on SMP. 
*/ - - .macro tlb_lock la,flags,tmp -#ifdef CONFIG_SMP -98: -#if __PA_LDCW_ALIGNMENT > 4 - load32 pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la - depi 0,31,__PA_LDCW_ALIGN_ORDER, \la -#else - load32 pa_tlb_lock, \la -#endif - rsm PSW_SM_I,\flags -1: LDCW 0(\la),\tmp - cmpib,<>,n 0,\tmp,3f -2: ldw 0(\la),\tmp - cmpb,<> %r0,\tmp,1b - nop - b,n 2b -3: -99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) -#endif - .endm - - .macro tlb_unlock la,flags,tmp -#ifdef CONFIG_SMP -98: ldi 1,\tmp - sync - stw \tmp,0(\la) - mtsm \flags -99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) -#endif - .endm - /* Clear page using kernel mapping. */ ENTRY_CFI(clear_page_asm) @@ -601,10 +568,8 @@ ENTRY_CFI(copy_user_page_asm) pdtlb,l %r0(%r28) pdtlb,l %r0(%r29) #else - tlb_lock %r20,%r21,%r22 0: pdtlb %r0(%r28) 1: pdtlb %r0(%r29) - tlb_unlock %r20,%r21,%r22 ALTERNATIVE(0b, 0b+4, ALT_COND_NO_SMP, INSN_PxTLB) ALTERNATIVE(1b, 1b+4, ALT_COND_NO_SMP, INSN_PxTLB) #endif @@ -743,9 +708,7 @@ ENTRY_CFI(clear_user_page_asm) #ifdef CONFIG_PA20 pdtlb,l %r0(%r28) #else - tlb_lock %r20,%r21,%r22 0: pdtlb %r0(%r28) - tlb_unlock %r20,%r21,%r22 ALTERNATIVE(0b, 0b+4, ALT_COND_NO_SMP, INSN_PxTLB) #endif @@ -821,9 +784,7 @@ ENTRY_CFI(flush_dcache_page_asm) #ifdef CONFIG_PA20 pdtlb,l %r0(%r28) #else - tlb_lock %r20,%r21,%r22 0: pdtlb %r0(%r28) - tlb_unlock %r20,%r21,%r22 ALTERNATIVE(0b, 0b+4, ALT_COND_NO_SMP, INSN_PxTLB) #endif @@ -882,9 +843,7 @@ ENTRY_CFI(purge_dcache_page_asm) #ifdef CONFIG_PA20 pdtlb,l %r0(%r28) #else - tlb_lock %r20,%r21,%r22 0: pdtlb %r0(%r28) - tlb_unlock %r20,%r21,%r22 ALTERNATIVE(0b, 0b+4, ALT_COND_NO_SMP, INSN_PxTLB) #endif @@ -948,10 +907,8 @@ ENTRY_CFI(flush_icache_page_asm) 1: pitlb,l %r0(%sr4,%r28) ALTERNATIVE(1b, 1b+4, ALT_COND_NO_SPLIT_TLB, INSN_NOP) #else - tlb_lock %r20,%r21,%r22 0: pdtlb %r0(%r28) 1: pitlb %r0(%sr4,%r28) - tlb_unlock %r20,%r21,%r22 ALTERNATIVE(0b, 0b+4, ALT_COND_NO_SMP, INSN_PxTLB) ALTERNATIVE(1b, 1b+4, ALT_COND_NO_SMP, INSN_PxTLB) ALTERNATIVE(1b, 1b+4, ALT_COND_NO_SPLIT_TLB, INSN_NOP) -- cgit v1.2.3-59-g8ed1b From 9e5c602186a692a7e848c0da17aed40f49d30519 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sun, 14 Apr 2019 19:20:40 -0400 Subject: parisc: Use ldcw instruction for SMP spinlock release barrier There are only a couple of instructions that can function as a memory barrier on parisc. Currently, we use the sync instruction as a memory barrier when releasing a spinlock. However, the ldcw instruction is a better barrier when we have a handy memory location since it operates in the cache on coherent machines. This patch updates the spinlock release code to use ldcw. I also changed the "stw,ma" instructions to "stw" instructions as it is not an adequate barrier. 
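The resulting release sequence, shown here as a simplified sketch of the spinlock.h hunk below:

/* Simplified sketch, mirroring the patched arch_spin_unlock().  The
 * throwaway __ldcw() is the barrier: ldcw is strongly ordered and
 * works in the cache on coherent machines, so it is cheaper than a
 * full sync before the releasing store.
 */
static inline void example_spin_unlock(arch_spinlock_t *x)
{
	volatile unsigned int *a = __ldcw_align(x);

#ifdef CONFIG_SMP
	(void) __ldcw(a);	/* orders all prior accesses */
#else
	mb();
#endif
	*a = 1;			/* release the lock */
}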
Signed-off-by: John David Anglin Signed-off-by: Helge Deller --- arch/parisc/include/asm/spinlock.h | 4 ++++ arch/parisc/kernel/entry.S | 43 ++++++++++++++++++++------------------ arch/parisc/kernel/syscall.S | 16 ++++++++++---- 3 files changed, 39 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 8a63515f03bf..197d2247e4db 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -37,7 +37,11 @@ static inline void arch_spin_unlock(arch_spinlock_t *x) volatile unsigned int *a; a = __ldcw_align(x); +#ifdef CONFIG_SMP + (void) __ldcw(a); +#else mb(); +#endif *a = 1; } diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index d5eb19efa65b..5796524a3137 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -471,8 +471,9 @@ nop LDREG 0(\ptp),\pte bb,<,n \pte,_PAGE_PRESENT_BIT,3f + LDCW 0(\tmp),\tmp1 b \fault - stw,ma \spc,0(\tmp) + stw \spc,0(\tmp) 99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif 2: LDREG 0(\ptp),\pte @@ -481,20 +482,22 @@ .endm /* Release pa_tlb_lock lock without reloading lock address. */ - .macro tlb_unlock0 spc,tmp + .macro tlb_unlock0 spc,tmp,tmp1 #ifdef CONFIG_SMP 98: or,COND(=) %r0,\spc,%r0 - stw,ma \spc,0(\tmp) + LDCW 0(\tmp),\tmp1 + or,COND(=) %r0,\spc,%r0 + stw \spc,0(\tmp) 99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif .endm /* Release pa_tlb_lock lock. */ - .macro tlb_unlock1 spc,tmp + .macro tlb_unlock1 spc,tmp,tmp1 #ifdef CONFIG_SMP 98: load_pa_tlb_lock \tmp 99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) - tlb_unlock0 \spc,\tmp + tlb_unlock0 \spc,\tmp,\tmp1 #endif .endm @@ -1177,7 +1180,7 @@ dtlb_miss_20w: idtlbt pte,prot - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1203,7 +1206,7 @@ nadtlb_miss_20w: idtlbt pte,prot - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1237,7 +1240,7 @@ dtlb_miss_11: mtsp t1, %sr1 /* Restore sr1 */ - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1270,7 +1273,7 @@ nadtlb_miss_11: mtsp t1, %sr1 /* Restore sr1 */ - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1299,7 +1302,7 @@ dtlb_miss_20: idtlbt pte,prot - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1327,7 +1330,7 @@ nadtlb_miss_20: idtlbt pte,prot - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1434,7 +1437,7 @@ itlb_miss_20w: iitlbt pte,prot - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1458,7 +1461,7 @@ naitlb_miss_20w: iitlbt pte,prot - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1492,7 +1495,7 @@ itlb_miss_11: mtsp t1, %sr1 /* Restore sr1 */ - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1516,7 +1519,7 @@ naitlb_miss_11: mtsp t1, %sr1 /* Restore sr1 */ - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1546,7 +1549,7 @@ itlb_miss_20: iitlbt pte,prot - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1566,7 +1569,7 @@ naitlb_miss_20: iitlbt pte,prot - tlb_unlock1 spc,t0 + tlb_unlock1 spc,t0,t1 rfir nop @@ -1596,7 +1599,7 @@ dbit_trap_20w: idtlbt pte,prot - tlb_unlock0 spc,t0 + tlb_unlock0 spc,t0,t1 rfir nop #else @@ -1622,7 +1625,7 @@ dbit_trap_11: mtsp t1, %sr1 /* Restore sr1 */ - tlb_unlock0 spc,t0 + tlb_unlock0 spc,t0,t1 rfir nop @@ -1642,7 +1645,7 @@ dbit_trap_20: idtlbt pte,prot - tlb_unlock0 spc,t0 + tlb_unlock0 spc,t0,t1 rfir nop #endif diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 4f77bd9be66b..e2b4c8d81275 100644 --- 
a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -640,7 +640,9 @@ cas_action: sub,<> %r28, %r25, %r0 2: stw %r24, 0(%r26) /* Free lock */ - sync +#ifdef CONFIG_SMP + LDCW 0(%sr2,%r20), %r1 /* Barrier */ +#endif stw %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG /* Clear thread register indicator */ @@ -655,7 +657,9 @@ cas_action: 3: /* Error occurred on load or store */ /* Free lock */ - sync +#ifdef CONFIG_SMP + LDCW 0(%sr2,%r20), %r1 /* Barrier */ +#endif stw %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG stw %r0, 4(%sr2,%r20) @@ -857,7 +861,9 @@ cas2_action: cas2_end: /* Free lock */ - sync +#ifdef CONFIG_SMP + LDCW 0(%sr2,%r20), %r1 /* Barrier */ +#endif stw %r20, 0(%sr2,%r20) /* Enable interrupts */ ssm PSW_SM_I, %r0 @@ -868,7 +874,9 @@ cas2_end: 22: /* Error occurred on load or store */ /* Free lock */ - sync +#ifdef CONFIG_SMP + LDCW 0(%sr2,%r20), %r1 /* Barrier */ +#endif stw %r20, 0(%sr2,%r20) ssm PSW_SM_I, %r0 ldo 1(%r0),%r28 -- cgit v1.2.3-59-g8ed1b From 44224bdb99150ad17cf394973b25736cb92c246a Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sun, 21 Apr 2019 19:47:17 -0400 Subject: parisc: Add memory clobber to TLB purges The pdtlb and pitlb instructions are strongly ordered. The asms invoking these instructions should be compiler memory barriers to ensure the compiler doesn't reorder memory operations around these instructions. Signed-off-by: John David Anglin CC: stable@vger.kernel.org # v4.20+ Fixes: 3847dab77421 ("parisc: Add alternative coding infrastructure") Signed-off-by: Helge Deller --- arch/parisc/include/asm/cache.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index 006fb939cac8..c18351cf5876 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -44,14 +44,14 @@ void parisc_setup_cache_timing(void); #define pdtlb(addr) asm volatile("pdtlb 0(%%sr1,%0)" \ ALTERNATIVE(ALT_COND_NO_SMP, INSN_PxTLB) \ - : : "r" (addr)) + : : "r" (addr) : "memory") #define pitlb(addr) asm volatile("pitlb 0(%%sr1,%0)" \ ALTERNATIVE(ALT_COND_NO_SMP, INSN_PxTLB) \ ALTERNATIVE(ALT_COND_NO_SPLIT_TLB, INSN_NOP) \ - : : "r" (addr)) + : : "r" (addr) : "memory") #define pdtlb_kernel(addr) asm volatile("pdtlb 0(%0)" \ ALTERNATIVE(ALT_COND_NO_SMP, INSN_PxTLB) \ - : : "r" (addr)) + : : "r" (addr) : "memory") #define asm_io_fdc(addr) asm volatile("fdc %%r0(%0)" \ ALTERNATIVE(ALT_COND_NO_DCACHE, INSN_NOP) \ -- cgit v1.2.3-59-g8ed1b From 2d94a832e246ac00fd32eec241e6f1aa6fbc5700 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 27 Apr 2019 23:57:49 +0200 Subject: parisc: Add memory barrier to asm pdc and sync instructions Add compiler memory barriers to ensure the compiler doesn't reorder memory operations around these instructions. 
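An illustrative sketch (function name hypothetical) of why the clobber matters:

/* Illustrative only.  Without the "memory" clobber the compiler may
 * keep values cached in registers across the asm and move stores
 * past it; the clobber forces pending memory accesses to be emitted
 * before the instruction and reloaded afterwards.
 */
#include <linux/compiler.h>

static inline void example_flush_then_publish(int *ready)
{
	asm volatile("sync" : : : "memory");	/* compiler + hw ordering */
	WRITE_ONCE(*ready, 1);	/* cannot be reordered above the sync */
}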
Cc: stable@vger.kernel.org # v4.20+ Fixes: 3847dab77421 ("parisc: Add alternative coding infrastructure") Signed-off-by: Helge Deller --- arch/parisc/include/asm/cache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index c18351cf5876..4016fe1c65a9 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -56,10 +56,10 @@ void parisc_setup_cache_timing(void); #define asm_io_fdc(addr) asm volatile("fdc %%r0(%0)" \ ALTERNATIVE(ALT_COND_NO_DCACHE, INSN_NOP) \ ALTERNATIVE(ALT_COND_NO_IOC_FDC, INSN_NOP) \ - : : "r" (addr)) + : : "r" (addr) : "memory") #define asm_io_sync() asm volatile("sync" \ ALTERNATIVE(ALT_COND_NO_DCACHE, INSN_NOP) \ - ALTERNATIVE(ALT_COND_NO_IOC_FDC, INSN_NOP) :: ) + ALTERNATIVE(ALT_COND_NO_IOC_FDC, INSN_NOP) :::"memory") #endif /* ! __ASSEMBLY__ */ -- cgit v1.2.3-59-g8ed1b From d19a12906e5e558c0f6b6cfece7b7caf1012ef95 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 1 May 2019 14:59:58 +0200 Subject: parisc: Allow live-patching of __meminit functions When making the text sections writeable with set_kernel_text_rw(1), include all text sections including those in the __init section. Otherwise functions marked with __meminit will stay read-only. Signed-off-by: Helge Deller Cc: # 4.20+ --- arch/parisc/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 513f747b0d9d..3b0f9eab7f2c 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -478,7 +478,7 @@ static void __init map_pages(unsigned long start_vaddr, void __init set_kernel_text_rw(int enable_read_write) { - unsigned long start = (unsigned long) _text; + unsigned long start = (unsigned long) __init_begin; unsigned long end = (unsigned long) &data_start; map_pages(start, __pa(start), end-start, -- cgit v1.2.3-59-g8ed1b From b37d1c1898b288c69f3dc9267bc2c41af06f4a4b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 28 Apr 2019 00:09:53 +0200 Subject: parisc: Use per-pagetable spinlock PA-RISC uses a global spinlock to protect pagetable updates in the TLB fault handlers. When multiple cores are taking TLB faults simultaneously, the cache line containing the spinlock becomes a bottleneck. This patch embeds the spinlock in the top level page directory, so that every process has its own lock. It improves performance by 30% when doing parallel compilations. At least on the N class systems, only one PxTLB inter processor broadcast can be active at any one time on the Merced bus. If a Merced bus is found, this patch serializes the TLB flushes with the pa_tlb_flush_lock spinlock. 
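A sketch of the resulting locking pattern for PTE updates, mirroring the set_pte_at() hunk below (function name hypothetical, helpers as in the patch):

/* Sketch: every mm serializes its own PTE updates on the lock
 * embedded in its pgd allocation instead of the old global
 * pa_tlb_lock; only the TLB broadcast itself still takes the global
 * pa_tlb_flush_lock, and only on Merced-bus machines.
 */
static void example_update_pte(struct mm_struct *mm, unsigned long addr,
			       pte_t *ptep, pte_t pteval)
{
	unsigned long flags;

	spin_lock_irqsave(pgd_spinlock(mm->pgd), flags);
	set_pte(ptep, pteval);
	purge_tlb_entries(mm, addr);	/* pdtlb/pitlb broadcast */
	spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags);
}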
v1: Initial patch by Mikulas v2: Added Merced detection by Helge v3: Revised TLB serialization by Dave & Helge Signed-off-by: Mikulas Patocka Signed-off-by: John David Anglin Signed-off-by: Helge Deller --- arch/parisc/include/asm/hardware.h | 2 +- arch/parisc/include/asm/pgalloc.h | 1 + arch/parisc/include/asm/pgtable.h | 69 +++++++++++++++++++++++++++++--------- arch/parisc/include/asm/tlbflush.h | 24 +------------ arch/parisc/kernel/cache.c | 15 ++++++--- arch/parisc/kernel/drivers.c | 25 ++++++++++++++ arch/parisc/kernel/entry.S | 8 ++--- arch/parisc/kernel/inventory.c | 7 ++++ arch/parisc/kernel/setup.c | 6 ++++ 9 files changed, 108 insertions(+), 49 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/hardware.h b/arch/parisc/include/asm/hardware.h index d6e1ed145031..9d3d7737c58b 100644 --- a/arch/parisc/include/asm/hardware.h +++ b/arch/parisc/include/asm/hardware.h @@ -120,7 +120,7 @@ extern void get_pci_node_path(struct pci_dev *dev, struct hardware_path *path); extern void init_parisc_bus(void); extern struct device *hwpath_to_device(struct hardware_path *modpath); extern void device_to_hwpath(struct device *dev, struct hardware_path *path); - +extern int machine_has_merced_bus(void); /* inventory.c: */ extern void do_memory_inventory(void); diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index d05c678c77c4..ea75cc966dae 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -41,6 +41,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) __pgd_val_set(*pgd, PxD_FLAG_ATTACHED); #endif } + spin_lock_init(pgd_spinlock(actual_pgd)); return actual_pgd; } diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index c7bb74e22436..a39b079e73f2 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -17,7 +17,7 @@ #include #include -extern spinlock_t pa_tlb_lock; +static inline spinlock_t *pgd_spinlock(pgd_t *); /* * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel @@ -34,16 +34,46 @@ extern spinlock_t pa_tlb_lock; */ #define kern_addr_valid(addr) (1) -/* Purge data and instruction TLB entries. Must be called holding - * the pa_tlb_lock. The TLB purge instructions are slow on SMP - * machines since the purge must be broadcast to all CPUs. +/* This is for the serialization of PxTLB broadcasts. At least on the N class + * systems, only one PxTLB inter processor broadcast can be active at any one + * time on the Merced bus. + + * PTE updates are protected by locks in the PMD. + */ +extern spinlock_t pa_tlb_flush_lock; +extern spinlock_t pa_swapper_pg_lock; +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +extern int pa_serialize_tlb_flushes; +#else +#define pa_serialize_tlb_flushes (0) +#endif + +#define purge_tlb_start(flags) do { \ + if (pa_serialize_tlb_flushes) \ + spin_lock_irqsave(&pa_tlb_flush_lock, flags); \ + else \ + local_irq_save(flags); \ + } while (0) +#define purge_tlb_end(flags) do { \ + if (pa_serialize_tlb_flushes) \ + spin_unlock_irqrestore(&pa_tlb_flush_lock, flags); \ + else \ + local_irq_restore(flags); \ + } while (0) + +/* Purge data and instruction TLB entries. The TLB purge instructions + * are slow on SMP machines since the purge must be broadcast to all CPUs. 
*/ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) { + unsigned long flags; + + purge_tlb_start(flags); mtsp(mm->context, 1); pdtlb(addr); pitlb(addr); + purge_tlb_end(flags); } /* Certain architectures need to do special things when PTEs @@ -59,11 +89,11 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) do { \ pte_t old_pte; \ unsigned long flags; \ - spin_lock_irqsave(&pa_tlb_lock, flags); \ + spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags);\ old_pte = *ptep; \ set_pte(ptep, pteval); \ purge_tlb_entries(mm, addr); \ - spin_unlock_irqrestore(&pa_tlb_lock, flags); \ + spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags);\ } while (0) #endif /* !__ASSEMBLY__ */ @@ -88,10 +118,10 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) #if CONFIG_PGTABLE_LEVELS == 3 #define PGD_ORDER 1 /* Number of pages per pgd */ #define PMD_ORDER 1 /* Number of pages per pmd */ -#define PGD_ALLOC_ORDER 2 /* first pgd contains pmd */ +#define PGD_ALLOC_ORDER (2 + 1) /* first pgd contains pmd */ #else #define PGD_ORDER 1 /* Number of pages per pgd */ -#define PGD_ALLOC_ORDER PGD_ORDER +#define PGD_ALLOC_ORDER (PGD_ORDER + 1) #endif /* Definitions for 3rd level (we use PLD here for Page Lower directory @@ -459,6 +489,15 @@ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) + +static inline spinlock_t *pgd_spinlock(pgd_t *pgd) +{ + if (unlikely(pgd == swapper_pg_dir)) + return &pa_swapper_pg_lock; + return (spinlock_t *)((char *)pgd + (PAGE_SIZE << (PGD_ALLOC_ORDER - 1))); +} + + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; @@ -467,15 +506,15 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned if (!pte_young(*ptep)) return 0; - spin_lock_irqsave(&pa_tlb_lock, flags); + spin_lock_irqsave(pgd_spinlock(vma->vm_mm->pgd), flags); pte = *ptep; if (!pte_young(pte)) { - spin_unlock_irqrestore(&pa_tlb_lock, flags); + spin_unlock_irqrestore(pgd_spinlock(vma->vm_mm->pgd), flags); return 0; } set_pte(ptep, pte_mkold(pte)); purge_tlb_entries(vma->vm_mm, addr); - spin_unlock_irqrestore(&pa_tlb_lock, flags); + spin_unlock_irqrestore(pgd_spinlock(vma->vm_mm->pgd), flags); return 1; } @@ -485,11 +524,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t old_pte; unsigned long flags; - spin_lock_irqsave(&pa_tlb_lock, flags); + spin_lock_irqsave(pgd_spinlock(mm->pgd), flags); old_pte = *ptep; set_pte(ptep, __pte(0)); purge_tlb_entries(mm, addr); - spin_unlock_irqrestore(&pa_tlb_lock, flags); + spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags); return old_pte; } @@ -497,10 +536,10 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { unsigned long flags; - spin_lock_irqsave(&pa_tlb_lock, flags); + spin_lock_irqsave(pgd_spinlock(mm->pgd), flags); set_pte(ptep, pte_wrprotect(*ptep)); purge_tlb_entries(mm, addr); - spin_unlock_irqrestore(&pa_tlb_lock, flags); + spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags); } #define pte_same(A,B) (pte_val(A) == pte_val(B)) diff --git a/arch/parisc/include/asm/tlbflush.h b/arch/parisc/include/asm/tlbflush.h index 6804374efa66..c5ded01d45be 100644 --- a/arch/parisc/include/asm/tlbflush.h +++ 
b/arch/parisc/include/asm/tlbflush.h @@ -8,21 +8,6 @@ #include #include - -/* This is for the serialisation of PxTLB broadcasts. At least on the - * N class systems, only one PxTLB inter processor broadcast can be - * active at any one time on the Merced bus. This tlb purge - * synchronisation is fairly lightweight and harmless so we activate - * it on all systems not just the N class. - - * It is also used to ensure PTE updates are atomic and consistent - * with the TLB. - */ -extern spinlock_t pa_tlb_lock; - -#define purge_tlb_start(flags) spin_lock_irqsave(&pa_tlb_lock, flags) -#define purge_tlb_end(flags) spin_unlock_irqrestore(&pa_tlb_lock, flags) - extern void flush_tlb_all(void); extern void flush_tlb_all_local(void *); @@ -79,13 +64,6 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { - unsigned long flags, sid; - - sid = vma->vm_mm->context; - purge_tlb_start(flags); - mtsp(sid, 1); - pdtlb(addr); - pitlb(addr); - purge_tlb_end(flags); + purge_tlb_entries(vma->vm_mm, addr); } #endif diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 804880efa11e..0338561968a4 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -40,12 +40,19 @@ void purge_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); void flush_icache_page_asm(unsigned long phys_addr, unsigned long vaddr); -/* On some machines (e.g. ones with the Merced bus), there can be +/* On some machines (i.e., ones with the Merced bus), there can be * only a single PxTLB broadcast at a time; this must be guaranteed - * by software. We put a spinlock around all TLB flushes to - * ensure this. + * by software. We need a spinlock around all TLB flushes to ensure + * this. */ -DEFINE_SPINLOCK(pa_tlb_lock); +DEFINE_SPINLOCK(pa_tlb_flush_lock); + +/* Swapper page setup lock. */ +DEFINE_SPINLOCK(pa_swapper_pg_lock); + +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +int pa_serialize_tlb_flushes __read_mostly; +#endif struct pdc_cache_info cache_info __read_mostly; #ifndef CONFIG_PA20 diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index 5eb979d04b90..15e7b3be7b6b 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -38,6 +38,7 @@ #include #include #include +#include /* See comments in include/asm-parisc/pci.h */ const struct dma_map_ops *hppa_dma_ops __read_mostly; @@ -257,6 +258,30 @@ static struct parisc_device *find_device_by_addr(unsigned long hpa) return ret ? d.dev : NULL; } +static int __init is_IKE_device(struct device *dev, void *data) +{ + struct parisc_device *pdev = to_parisc_device(dev); + + if (!check_dev(dev)) + return 0; + if (pdev->id.hw_type != HPHW_BCPORT) + return 0; + if (IS_IKE(pdev) || + (pdev->id.hversion == REO_MERCED_PORT) || + (pdev->id.hversion == REOG_MERCED_PORT)) { + return 1; + } + return 0; +} + +int __init machine_has_merced_bus(void) +{ + int ret; + + ret = for_each_padev(is_IKE_device, NULL); + return ret ? 
1 : 0; +} + /** * find_pa_parent_type - Find a parent of a specific type * @dev: The device to start searching from diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 5796524a3137..a1fc04570ade 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -50,12 +50,8 @@ .import pa_tlb_lock,data .macro load_pa_tlb_lock reg -#if __PA_LDCW_ALIGNMENT > 4 - load32 PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg - depi 0,31,__PA_LDCW_ALIGN_ORDER, \reg -#else - load32 PA(pa_tlb_lock), \reg -#endif + mfctl %cr25,\reg + addil L%(PAGE_SIZE << (PGD_ALLOC_ORDER - 1)),\reg .endm /* space_to_prot macro creates a prot id from a space id */ diff --git a/arch/parisc/kernel/inventory.c b/arch/parisc/kernel/inventory.c index 35d05fdd7483..6f2d611347a1 100644 --- a/arch/parisc/kernel/inventory.c +++ b/arch/parisc/kernel/inventory.c @@ -31,6 +31,7 @@ #include #include #include +#include /* ** Debug options @@ -638,4 +639,10 @@ void __init do_device_inventory(void) } printk(KERN_INFO "Found devices:\n"); print_parisc_devices(); + +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) + pa_serialize_tlb_flushes = machine_has_merced_bus(); + if (pa_serialize_tlb_flushes) + pr_info("Merced bus found: Enable PxTLB serialization.\n"); +#endif } diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index d908058d05c1..e05cb2a5c16d 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -343,6 +343,12 @@ static int __init parisc_init(void) boot_cpu_data.cpu_hz / 1000000, boot_cpu_data.cpu_hz % 1000000 ); +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) + /* Don't serialize TLB flushes if we run on one CPU only. */ + if (num_online_cpus() == 1) + pa_serialize_tlb_flushes = 0; +#endif + apply_alternatives_all(); parisc_setup_cache_timing(); -- cgit v1.2.3-59-g8ed1b From 11c03dc85f02901ce9b6a1ced099fdcaeb188572 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sat, 27 Apr 2019 14:15:38 -0400 Subject: parisc: Update huge TLB page support to use per-pagetable spinlock This patch updates the parisc huge TLB page support to use per-pagetable spinlocks. This patch requires Mikulas' per-pagetable spinlock patch and the revised TLB serialization patch from Helge and myself. With Mikulas' patch, we need to use the per-pagetable spinlock for page table updates. The TLB lock is only used to serialize TLB flushes on machines with the Merced bus. 
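In practice the conversion is a one-for-one substitution of the lock calls. A minimal sketch of the resulting pattern, mirroring the set_huge_pte_at() hunk below:

/*
 * PTE updates take the per-pagetable lock instead of the old global
 * purge_tlb_start()/purge_tlb_end() pair; the TLB flush lock is left
 * to purge_tlb_entries() and is only contended on Merced-bus machines.
 */
static void set_huge_pte_sketch(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pte_t entry)
{
	unsigned long flags;

	spin_lock_irqsave(pgd_spinlock(mm->pgd), flags);
	__set_huge_pte_at(mm, addr, ptep, entry);
	spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags);
}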
Signed-off-by: John David Anglin Signed-off-by: Helge Deller --- arch/parisc/mm/hugetlbpage.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index d77479ae3af2..d578809e55cf 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -139,9 +139,9 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, { unsigned long flags; - purge_tlb_start(flags); + spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags); __set_huge_pte_at(mm, addr, ptep, entry); - purge_tlb_end(flags); + spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags); } @@ -151,10 +151,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, unsigned long flags; pte_t entry; - purge_tlb_start(flags); + spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags); entry = *ptep; __set_huge_pte_at(mm, addr, ptep, __pte(0)); - purge_tlb_end(flags); + spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags); return entry; } @@ -166,10 +166,10 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long flags; pte_t old_pte; - purge_tlb_start(flags); + spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags); old_pte = *ptep; __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); - purge_tlb_end(flags); + spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags); } int huge_ptep_set_access_flags(struct vm_area_struct *vma, @@ -178,13 +178,14 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, { unsigned long flags; int changed; + struct mm_struct *mm = vma->vm_mm; - purge_tlb_start(flags); + spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags); changed = !pte_same(*ptep, pte); if (changed) { - __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + __set_huge_pte_at(mm, addr, ptep, pte); } - purge_tlb_end(flags); + spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags); return changed; } -- cgit v1.2.3-59-g8ed1b From 30c8f4e411fb76f752a193bd945d4b9ef06c2d87 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 3 May 2019 20:50:37 +0300 Subject: mips: Dump memblock regions for debugging It is useful to have the whole memblock memory space printed to console when basic memlock initializations are done. It can be performed by ready-to-use method memblock_dump_all(), which prints the available and reserved memory spaces if memblock=debug kernel parameter is specified. Lets call it at the very end of arch_mem_init() function, when all memblock memory and reserved regions are defined, but before any serious allocation is performed. Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: Serge Semin Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 2a1b2e7a1bc9..ca493fdf69b0 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -824,6 +824,8 @@ static void __init arch_mem_init(char **cmdline_p) /* Reserve for hibernation. 
*/ memblock_reserve(__pa_symbol(&__nosave_begin), __pa_symbol(&__nosave_end) - __pa_symbol(&__nosave_begin)); + + memblock_dump_all(); } static void __init resource_init(void) -- cgit v1.2.3-59-g8ed1b From 2f5bd0367e7a9e5f5a150500e016a9cb7042803b Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 3 May 2019 20:50:38 +0300 Subject: mips: Perform early low memory test memblock subsystem provides a method to optionally test the passed memory region in case if it was requested via special kernel boot argument. Lets add the function at the bottom of the arch_mem_init() method. Testing at this point in the boot sequence should be safe since all critical areas are now reserved and a minimum of allocations have been done. Reviewed-by: Matt Redfearn Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: Serge Semin Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index ca493fdf69b0..fbd216b4e929 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -826,6 +826,8 @@ static void __init arch_mem_init(char **cmdline_p) __pa_symbol(&__nosave_end) - __pa_symbol(&__nosave_begin)); memblock_dump_all(); + + early_memtest(PFN_PHYS(min_low_pfn), PFN_PHYS(max_low_pfn)); } static void __init resource_init(void) -- cgit v1.2.3-59-g8ed1b From aa2ecb7c8f9519c8734c5b6fa57570ed6197de5e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 3 May 2019 20:21:33 +0200 Subject: um: vector netdev: adjust to xmit_more API change Replace skb->xmit_more usage by netdev_xmit_more(). Fixes: 4f296edeb9d4 ("drivers: net: aurora: use netdev_xmit_more helper") Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- arch/um/drivers/vector_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 596e7056f376..e190e4ca52e1 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1043,7 +1043,7 @@ static int vector_net_start_xmit(struct sk_buff *skb, struct net_device *dev) vector_send(vp->tx_queue); return NETDEV_TX_OK; } - if (skb->xmit_more) { + if (netdev_xmit_more()) { mod_timer(&vp->tl, vp->coalesce); return NETDEV_TX_OK; } -- cgit v1.2.3-59-g8ed1b From 1829dda0e87f4462782ca81be474c7890efe31ce Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 5 May 2019 23:54:34 +0200 Subject: parisc: Rename LEVEL to PA_ASM_LEVEL to avoid name clash with DRBD code LEVEL is a very common word, and now after many years it suddenly clashed with another LEVEL define in the DRBD code. Rename it to PA_ASM_LEVEL instead. 
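The fix is purely a namespacing one. Condensed from the assembly.h hunk in the patch below (the PA1.1 fallback is omitted here):

/*
 * A vendor prefix keeps the macro out of the global namespace, so a
 * generic LEVEL define elsewhere (here: in the DRBD code) can no
 * longer collide with it.  Assembly users write ".level PA_ASM_LEVEL".
 */
#ifdef CONFIG_64BIT
# define PA_ASM_LEVEL	2.0w
#else
# define PA_ASM_LEVEL	2.0
#endif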
Reported-by: kbuild test robot Signed-off-by: Helge Deller Cc: --- arch/parisc/include/asm/assembly.h | 6 +++--- arch/parisc/kernel/head.S | 4 ++-- arch/parisc/kernel/syscall.S | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/assembly.h b/arch/parisc/include/asm/assembly.h index c17ec0ee6e7c..d85738a7bbe6 100644 --- a/arch/parisc/include/asm/assembly.h +++ b/arch/parisc/include/asm/assembly.h @@ -61,14 +61,14 @@ #define LDCW ldcw,co #define BL b,l # ifdef CONFIG_64BIT -# define LEVEL 2.0w +# define PA_ASM_LEVEL 2.0w # else -# define LEVEL 2.0 +# define PA_ASM_LEVEL 2.0 # endif #else #define LDCW ldcw #define BL bl -#define LEVEL 1.1 +#define PA_ASM_LEVEL 1.1 #endif #ifdef __ASSEMBLY__ diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index fefaba2097b5..d12de2a13753 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -22,7 +22,7 @@ #include #include - .level LEVEL + .level PA_ASM_LEVEL __INITDATA ENTRY(boot_args) @@ -258,7 +258,7 @@ stext_pdc_ret: ldo R%PA(fault_vector_11)(%r10),%r10 $is_pa20: - .level LEVEL /* restore 1.1 || 2.0w */ + .level PA_ASM_LEVEL /* restore 1.1 || 2.0w */ #endif /*!CONFIG_64BIT*/ load32 PA(fault_vector_20),%r10 diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index e2b4c8d81275..e54d5e4d3489 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -48,7 +48,7 @@ registers). */ #define KILL_INSN break 0,0 - .level LEVEL + .level PA_ASM_LEVEL .text -- cgit v1.2.3-59-g8ed1b From bdca5d64ee92abeacd6dada0bc6f6f8e6350dd67 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 5 May 2019 23:55:02 +0200 Subject: parisc: Use PA_ASM_LEVEL in boot code The LEVEL define clashed with the DRBD code. 
Reported-by: kbuild test robot Signed-off-by: Helge Deller Cc: # v4.14+ --- arch/parisc/boot/compressed/head.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/parisc/boot/compressed/head.S b/arch/parisc/boot/compressed/head.S index 5aba20fa48aa..e8b798fd0cf0 100644 --- a/arch/parisc/boot/compressed/head.S +++ b/arch/parisc/boot/compressed/head.S @@ -22,7 +22,7 @@ __HEAD ENTRY(startup) - .level LEVEL + .level PA_ASM_LEVEL #define PSW_W_SM 0x200 #define PSW_W_BIT 36 @@ -63,7 +63,7 @@ $bss_loop: load32 BOOTADDR(decompress_kernel),%r3 #ifdef CONFIG_64BIT - .level LEVEL + .level PA_ASM_LEVEL ssm PSW_W_SM, %r0 /* set W-bit */ depdi 0, 31, 32, %r3 #endif @@ -72,7 +72,7 @@ $bss_loop: startup_continue: #ifdef CONFIG_64BIT - .level LEVEL + .level PA_ASM_LEVEL rsm PSW_W_SM, %r0 /* clear W-bit */ #endif -- cgit v1.2.3-59-g8ed1b From 62217beb394e654bbd2bb87c533dadd2d8bf62c6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 3 May 2019 23:51:00 +0200 Subject: parisc: Add static branch and JUMP_LABEL feature Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 2 ++ arch/parisc/include/asm/jump_label.h | 43 ++++++++++++++++++++++++++++ arch/parisc/kernel/Makefile | 1 + arch/parisc/kernel/jump_label.c | 55 ++++++++++++++++++++++++++++++++++++ arch/parisc/kernel/vmlinux.lds.S | 3 ++ 5 files changed, 104 insertions(+) create mode 100644 arch/parisc/include/asm/jump_label.h create mode 100644 arch/parisc/kernel/jump_label.c (limited to 'arch') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 26c215570adf..c971256a74d2 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -45,6 +45,8 @@ config PARISC select HAVE_DEBUG_STACKOVERFLOW select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HASH + select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/parisc/include/asm/jump_label.h b/arch/parisc/include/asm/jump_label.h new file mode 100644 index 000000000000..7efb1aa2f7f8 --- /dev/null +++ b/arch/parisc/include/asm/jump_label.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PARISC_JUMP_LABEL_H +#define _ASM_PARISC_JUMP_LABEL_H + +#ifndef __ASSEMBLY__ + +#include +#include + +#define JUMP_LABEL_NOP_SIZE 4 + +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) +{ + asm_volatile_goto("1:\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + ".word 1b - ., %l[l_yes] - .\n\t" + __stringify(ASM_ULONG_INSN) " %c0 - .\n\t" + ".popsection\n\t" + : : "i" (&((char *)key)[branch]) : : l_yes); + + return false; +l_yes: + return true; +} + +static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +{ + asm_volatile_goto("1:\n\t" + "b,n %l[l_yes]\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + ".word 1b - ., %l[l_yes] - .\n\t" + __stringify(ASM_ULONG_INSN) " %c0 - .\n\t" + ".popsection\n\t" + : : "i" (&((char *)key)[branch]) : : l_yes); + + return false; +l_yes: + return true; +} + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index b818b28c8a99..fc0df5c44468 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -33,5 +33,6 @@ obj-$(CONFIG_64BIT) += perf.o perf_asm.o $(obj64-y) obj-$(CONFIG_PARISC_CPU_TOPOLOGY) += topology.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o 
obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/arch/parisc/kernel/jump_label.c b/arch/parisc/kernel/jump_label.c new file mode 100644 index 000000000000..d2f3cb12e282 --- /dev/null +++ b/arch/parisc/kernel/jump_label.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Helge Deller + * + * Based on arch/arm64/kernel/jump_label.c + */ +#include +#include +#include +#include +#include + +static inline int reassemble_17(int as17) +{ + return (((as17 & 0x10000) >> 16) | + ((as17 & 0x0f800) << 5) | + ((as17 & 0x00400) >> 8) | + ((as17 & 0x003ff) << 3)); +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + void *addr = (void *)jump_entry_code(entry); + u32 insn; + + if (type == JUMP_LABEL_JMP) { + void *target = (void *)jump_entry_target(entry); + int distance = target - addr; + /* + * Encode the PA1.1 "b,n" instruction with a 17-bit + * displacement. In case we hit the BUG(), we could use + * another branch instruction with a 22-bit displacement on + * 64-bit CPUs instead. But this seems sufficient for now. + */ + distance -= 8; + BUG_ON(distance > 262143 || distance < -262144); + insn = 0xe8000002 | reassemble_17(distance >> 2); + } else { + insn = INSN_NOP; + } + + patch_text(addr, insn); +} + +void arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) +{ + /* + * We use the architected NOP in arch_static_branch, so there's no + * need to patch an identical NOP over the top of it here. The core + * will call arch_jump_label_transform from a module notifier if the + * NOP needs to be replaced by a branch. + */ +} diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index c3b1b9c24ede..a8be7a47fcc0 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -18,6 +18,9 @@ *(.data..vm0.pgd) \ *(.data..vm0.pte) +/* No __ro_after_init data in the .rodata section - which will always be ro */ +#define RO_AFTER_INIT_DATA + #include /* needed for the processor specific cache alignment size */ -- cgit v1.2.3-59-g8ed1b From 93fa5b280761a4dbb14c5330f260380385ab2b49 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 3 May 2019 20:50:40 +0300 Subject: mips: Make sure dt memory regions are valid There are situations when memory regions coming from dts may be too big for the platform physical address space. This especially concerns XPA-capable systems. Bootloader may determine more than 4GB memory available and pass it to the kernel over dts memory node, while kernel is built without XPA/64BIT support. In this case the region may either simply be truncated by add_memory_region() method or by u64->phys_addr_t type casting. But in worst case the method can even drop the memory region if it exceeds PHYS_ADDR_MAX size. So lets make sure the retrieved from dts memory regions are valid, and if some of them aren't, just manually truncate them with a warning printed out. 
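The validation logic is worth spelling out, because the overflow case is easy to miss. A condensed sketch of the check added to early_init_dt_add_memory_arch() (cf. the prom.c hunk below; the warnings are omitted):

void __init add_dt_memory_sketch(u64 base, u64 size)
{
	/* Region starts beyond what phys_addr_t can express: drop it. */
	if (base >= PHYS_ADDR_MAX)
		return;

	/*
	 * "base + size < base" catches u64 wrap-around, which a plain
	 * upper-bound test would miss; truncate instead of silently
	 * type-casting or dropping the region.
	 */
	if (base + size - 1 >= PHYS_ADDR_MAX || base + size < base)
		size = PHYS_ADDR_MAX - base;

	add_memory_region(base, size, BOOT_MEM_RAM);
}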
Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: Serge Semin Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/prom.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index 437a174e3ef9..28bf01961bb2 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -41,7 +41,19 @@ char *mips_get_machine_name(void) #ifdef CONFIG_USE_OF void __init early_init_dt_add_memory_arch(u64 base, u64 size) { - return add_memory_region(base, size, BOOT_MEM_RAM); + if (base >= PHYS_ADDR_MAX) { + pr_warn("Trying to add an invalid memory region, skipped\n"); + return; + } + + /* Truncate the passed memory region instead of type casting */ + if (base + size - 1 >= PHYS_ADDR_MAX || base + size < base) { + pr_warn("Truncate memory region %llx @ %llx to size %llx\n", + size, base, PHYS_ADDR_MAX - base); + size = PHYS_ADDR_MAX - base; + } + + add_memory_region(base, size, BOOT_MEM_RAM); } int __init early_init_dt_reserve_memory_arch(phys_addr_t base, -- cgit v1.2.3-59-g8ed1b From 3751cbda8f223549d7ea28803cbec8ac87e43ed2 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 3 May 2019 20:50:41 +0300 Subject: mips: Manually call fdt_init_reserved_mem() method Since memblock-patchset was introduced the reserved-memory nodes are supported being declared in dt-files. So these nodes are actually parsed during the arch setup procedure when the early_init_fdt_scan_reserved_mem() method is called. But due to the arch-specific boot mem_map container utilization we need to manually call the fdt_init_reserved_mem() method after all the available and reserved memory has been moved to memblock. The first function call performed before bootmem_init() by the early_init_fdt_scan_reserved_mem() routine fails due to the lack of any memblock memory regions to allocate from at that stage. 
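Taken together with the earlier memblock_dump_all() and early_memtest() patches, the tail of arch_mem_init() ends up with a deliberate ordering. A condensed sketch, not a verbatim excerpt:

static void __init arch_mem_init_tail_sketch(void)
{
	/* ... all boot-time reservations are in memblock by now ... */

	/* reserved-memory DT nodes can be processed only from here on */
	fdt_init_reserved_mem();

	/* dump the final region layout when booted with memblock=debug */
	memblock_dump_all();

	/* optional low-memory test: after everything critical is
	 * reserved, before any serious allocation happens */
	early_memtest(PFN_PHYS(min_low_pfn), PFN_PHYS(max_low_pfn));
}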
Signed-off-by: Serge Semin Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: Mike Rapoport Cc: Andrew Morton Cc: Michal Hocko Cc: Greg Kroah-Hartman Cc: Thomas Bogendoerfer Cc: Huacai Chen Cc: Stefan Agner Cc: Stephen Rothwell Cc: Alexandre Belloni Cc: Juergen Gross Cc: Serge Semin Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index fbd216b4e929..ab349d2381c3 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -825,6 +826,8 @@ static void __init arch_mem_init(char **cmdline_p) memblock_reserve(__pa_symbol(&__nosave_begin), __pa_symbol(&__nosave_end) - __pa_symbol(&__nosave_begin)); + fdt_init_reserved_mem(); + memblock_dump_all(); early_memtest(PFN_PHYS(min_low_pfn), PFN_PHYS(max_low_pfn)); -- cgit v1.2.3-59-g8ed1b From c41593a04e3e9c28da46b13274dc9cb1773b9582 Mon Sep 17 00:00:00 2001 From: Petr Štetiar Date: Fri, 3 May 2019 16:27:14 +0200 Subject: ARM: Kirkwood: support of_get_mac_address new ERR_PTR error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was NVMEM support added to of_get_mac_address, so it could now return ERR_PTR encoded error values, so we need to adjust all current users of of_get_mac_address to this new fact. Signed-off-by: Petr Štetiar Signed-off-by: David S. Miller --- arch/arm/mach-mvebu/kirkwood.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/mach-mvebu/kirkwood.c b/arch/arm/mach-mvebu/kirkwood.c index 0aa88105d46e..9b5f4d665374 100644 --- a/arch/arm/mach-mvebu/kirkwood.c +++ b/arch/arm/mach-mvebu/kirkwood.c @@ -92,7 +92,8 @@ static void __init kirkwood_dt_eth_fixup(void) continue; /* skip disabled nodes or nodes with valid MAC address*/ - if (!of_device_is_available(pnp) || of_get_mac_address(np)) + if (!of_device_is_available(pnp) || + !IS_ERR(of_get_mac_address(np))) goto eth_fixup_skip; clk = of_clk_get(pnp, 0); -- cgit v1.2.3-59-g8ed1b From ea168cdf129944a11471dff5af93fc3c81c22c35 Mon Sep 17 00:00:00 2001 From: Petr Štetiar Date: Fri, 3 May 2019 16:27:15 +0200 Subject: powerpc: tsi108: support of_get_mac_address new ERR_PTR error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was NVMEM support added to of_get_mac_address, so it could now return ERR_PTR encoded error values, so we need to adjust all current users of of_get_mac_address to this new fact. Signed-off-by: Petr Štetiar Signed-off-by: David S. 
Miller --- arch/powerpc/sysdev/tsi108_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index 1f1af12f23e2..c92dcac85231 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -105,7 +105,7 @@ static int __init tsi108_eth_of_init(void) } mac_addr = of_get_mac_address(np); - if (mac_addr) + if (!IS_ERR(mac_addr)) memcpy(tsi_eth_data.mac_addr, mac_addr, 6); ph = of_get_property(np, "mdio-handle", NULL); -- cgit v1.2.3-59-g8ed1b From d9c9ce34ed5c892323cbf5b4f9a4c498e036316a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 2 May 2019 19:11:39 +0200 Subject: x86/fpu: Fault-in user stack if copy_fpstate_to_sigframe() fails In the compacted form, XSAVES may save only the XMM+SSE state but skip FP (x87 state). This is denoted by header->xfeatures = 6. The fastpath (copy_fpregs_to_sigframe()) does that but _also_ initialises the FP state (cwd to 0x37f, mxcsr as we do, remaining fields to 0). The slowpath (copy_xstate_to_user()) leaves most of the FP state untouched. Only mxcsr and mxcsr_flags are set due to xfeatures_mxcsr_quirk(). Now that XFEATURE_MASK_FP is set unconditionally, see 04944b793e18 ("x86: xsave: set FP, SSE bits in the xsave header in the user sigcontext"), on return from the signal, random garbage is loaded as the FP state. Instead of utilizing copy_xstate_to_user(), fault-in the user memory and retry the fast path. Ideally, the fast path succeeds on the second attempt but may be retried again if the memory is swapped out due to memory pressure. If the user memory can not be faulted-in then get_user_pages() returns an error so we don't loop forever. Fault in memory via get_user_pages_unlocked() so copy_fpregs_to_sigframe() succeeds without a fault. Fixes: 69277c98f5eef ("x86/fpu: Always store the registers in copy_fpstate_to_sigframe()") Reported-by: Kurt Kanzenbach Suggested-by: Dave Hansen Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: "linux-mm@kvack.org" Cc: Qian Cai Cc: Rik van Riel Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190502171139.mqtegctsg35cir2e@linutronix.de --- arch/x86/kernel/fpu/signal.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 7026f1c4e5e3..5a8d118bc423 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -157,11 +157,9 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct fpu *fpu = ¤t->thread.fpu; - struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); - int ret = -EFAULT; + int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -174,11 +172,12 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; +retry: /* * Load the FPU registers if they are not valid for the current task. * With a valid FPU state we can attempt to save the state directly to - * userland's stack frame which will likely succeed. If it does not, do - * the slowpath. 
+ * userland's stack frame which will likely succeed. If it does not, + * resolve the fault in the user memory and try again. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -187,20 +186,20 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) pagefault_disable(); ret = copy_fpregs_to_sigframe(buf_fx); pagefault_enable(); - if (ret && !test_thread_flag(TIF_NEED_FPU_LOAD)) - copy_fpregs_to_fpstate(fpu); - set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); if (ret) { - if (using_compacted_format()) { - if (copy_xstate_to_user(buf_fx, xsave, 0, size)) - return -1; - } else { - fpstate_sanitize_xstate(fpu); - if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) - return -1; - } + int aligned_size; + int nr_pages; + + aligned_size = offset_in_page(buf_fx) + fpu_user_xstate_size; + nr_pages = DIV_ROUND_UP(aligned_size, PAGE_SIZE); + + ret = get_user_pages_unlocked((unsigned long)buf_fx, nr_pages, + NULL, FOLL_WRITE); + if (ret == nr_pages) + goto retry; + return -EFAULT; } /* Save the fsave header for the 32-bit frames. */ -- cgit v1.2.3-59-g8ed1b From 305d60012304684bd59ea1f67703e51662e4906a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 May 2019 06:21:00 +0000 Subject: powerpc/kasan: add missing/lost Makefile For unknown reason (aka. mpe is a doofus), the new Makefile added via the KASAN support patch didn't land into arch/powerpc/mm/kasan/ This patch restores it. Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/kasan/Makefile | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 arch/powerpc/mm/kasan/Makefile (limited to 'arch') diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile new file mode 100644 index 000000000000..6577897673dd --- /dev/null +++ b/arch/powerpc/mm/kasan/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +KASAN_SANITIZE := n + +obj-$(CONFIG_PPC32) += kasan_init_32.o -- cgit v1.2.3-59-g8ed1b From 471e475c69a1689e059b5e57e893a7da75d2831a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 May 2019 06:21:01 +0000 Subject: powerpc/mm: Fix makefile for KASAN In commit 17312f258cf6 ("powerpc/mm: Move book3s32 specifics in subdirectory mm/book3s64"), ppc_mmu_32.c was moved and renamed. This patch fixes Makefiles to disable KASAN instrumentation on the new name and location. 
Fixes: f072015c7b74 ("powerpc: disable KASAN instrumentation on early/critical files.") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 6 ------ arch/powerpc/mm/book3s32/Makefile | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index d8c0ce9b2557..7a7527116c3a 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,12 +5,6 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -KASAN_SANITIZE_ppc_mmu_32.o := n - -ifdef CONFIG_KASAN -CFLAGS_ppc_mmu_32.o += -DDISABLE_BRANCH_PROFILING -endif - obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o \ diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile index a4e217d0f3b7..1732eaa740a9 100644 --- a/arch/powerpc/mm/book3s32/Makefile +++ b/arch/powerpc/mm/book3s32/Makefile @@ -1,3 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 +KASAN_SANITIZE_mmu.o := n + +ifdef CONFIG_KASAN +CFLAGS_mmu.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += mmu.o hash_low.o mmu_context.o tlb.o -- cgit v1.2.3-59-g8ed1b From c4e31847a5490d52ddd44440a524e8355be11ec1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 May 2019 06:47:55 +0000 Subject: powerpc/mm: fix redundant inclusion of pgtable-frag.o in Makefile The patch identified below added pgtable-frag.o to obj-y but some merge witchery kept it also for obj-CONFIG_PPC_BOOK3S_64 This patch clears the duplication. Fixes: 737b434d3d55 ("powerpc/mm: convert Book3E 64 to pte_fragment") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 7a7527116c3a..0f499db315d6 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -12,7 +12,6 @@ obj-y := fault.o mem.o pgtable.o mmap.o \ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -- cgit v1.2.3-59-g8ed1b From 67d53f30e23ec66aa7bbdd1592d5e64d46876190 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 6 May 2019 08:10:43 +0000 Subject: powerpc/mm: fix section mismatch for setup_kup() commit b28c97505eb1 ("powerpc/64: Setup KUP on secondary CPUs") moved setup_kup() out of the __init section. As stated in that commit, "this is only for 64-bit". But this function is also used on PPC32, where the two functions called by setup_kup() are in the __init section, so setup_kup() has to either be kept in the __init section on PPC32 or marked __ref. This patch marks it __ref, it fixes the below build warnings. MODPOST vmlinux.o WARNING: vmlinux.o(.text+0x169ec): Section mismatch in reference from the function setup_kup() to the function .init.text:setup_kuep() The function setup_kup() references the function __init setup_kuep(). This is often because setup_kup lacks a __init annotation or the annotation of setup_kuep is wrong. WARNING: vmlinux.o(.text+0x16a04): Section mismatch in reference from the function setup_kup() to the function .init.text:setup_kuap() The function setup_kup() references the function __init setup_kuap(). This is often because setup_kup lacks a __init annotation or the annotation of setup_kuap is wrong. 
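The one-word fix, in condensed form (cf. the init-common.c hunk below):

/*
 * __ref tells modpost that this reference from non-__init code into
 * __init code is intentional; it is safe here because the __init
 * callees (the PPC32 variants) are only reached during early boot.
 */
void __ref setup_kup(void)
{
	setup_kuep(disable_kuep);
	setup_kuap(disable_kuap);
}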
Fixes: b28c97505eb1 ("powerpc/64: Setup KUP on secondary CPUs") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 6ea5607fc564..3bcae9e5e954 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -45,7 +45,7 @@ static int __init parse_nosmap(char *p) } early_param("nosmap", parse_nosmap); -void setup_kup(void) +void __ref setup_kup(void) { setup_kuep(disable_kuep); setup_kuap(disable_kuap); -- cgit v1.2.3-59-g8ed1b From 04a1942933ced67d2b73c156017bf13476b7146b Mon Sep 17 00:00:00 2001 From: Sachin Sant Date: Mon, 6 May 2019 17:33:33 +0530 Subject: powerpc/mm: Fix hugetlb page initialization This patch fixes a regression by using correct kernel config variable for HUGETLB_PAGE_SIZE_VARIABLE. Without this huge pages are disabled during kernel boot. [0.309496] hugetlbfs: disabling because there are no supported hugepage sizes Fixes: c5710cd20735 ("powerpc/mm: cleanup HPAGE_SHIFT setup") Reported-by: Sachin Sant Signed-off-by: Michael Ellerman Tested-by: Sachin Sant Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 98db5ec6a1dd..c5c9ff2d7afc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -640,7 +640,7 @@ static int __init hugetlbpage_init(void) pgtable_cache_add(PTE_T_ORDER); } - if (IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE)) + if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE)) hugetlbpage_init_default(); return 0; -- cgit v1.2.3-59-g8ed1b From 13bf5ced93775ffccb53527a9d862e023a9daa03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Mar 2019 15:44:06 +0100 Subject: dma-mapping: add a Kconfig symbol to indicate arch_dma_prep_coherent presence Add a Kconfig symbol that indicates an architecture provides a arch_dma_prep_coherent implementation, and provide a stub otherwise. This will allow the generic dma-iommu code to use it while still allowing to be built for cache coherent architectures. 
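This is the usual pattern for an optional per-arch hook, condensed from the dma-noncoherent.h hunk below:

/*
 * Architectures that implement the hook select the Kconfig symbol;
 * everyone else gets an empty inline stub, so generic code (here: the
 * coming generic dma-iommu code) can call it unconditionally.
 */
#ifdef CONFIG_ARCH_HAS_DMA_PREP_COHERENT
void arch_dma_prep_coherent(struct page *page, size_t size);
#else
static inline void arch_dma_prep_coherent(struct page *page, size_t size)
{
}
#endif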
Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- arch/arm64/Kconfig | 1 + arch/csky/Kconfig | 1 + include/linux/dma-noncoherent.h | 6 ++++++ kernel/dma/Kconfig | 3 +++ 4 files changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7e34b9eba5de..adda078d6df7 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -13,6 +13,7 @@ config ARM64 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_MMAP_PGPROT + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 725a115759c9..2c3178848b7d 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -1,6 +1,7 @@ config CSKY def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_USE_BUILTIN_BSWAP diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index 69b36ed31a99..9741767e400f 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -72,6 +72,12 @@ static inline void arch_sync_dma_for_cpu_all(struct device *dev) } #endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */ +#ifdef CONFIG_ARCH_HAS_DMA_PREP_COHERENT void arch_dma_prep_coherent(struct page *page, size_t size); +#else +static inline void arch_dma_prep_coherent(struct page *page, size_t size) +{ +} +#endif /* CONFIG_ARCH_HAS_DMA_PREP_COHERENT */ #endif /* _LINUX_DMA_NONCOHERENT_H */ diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 52b704e2b97a..83d711f8d665 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -38,6 +38,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL bool +config ARCH_HAS_DMA_PREP_COHERENT + bool + config ARCH_HAS_DMA_COHERENT_TO_PFN bool -- cgit v1.2.3-59-g8ed1b From c5bf68fe0c86a5835bd2e6aead1c49976360753f Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Tue, 26 Mar 2019 23:51:19 +0300 Subject: *: convert stream-like files from nonseekable_open -> stream_open Using scripts/coccinelle/api/stream_open.cocci added in 10dce8af3422 ("fs: stream_open - opener for stream-like files so that read and write can run simultaneously without deadlock"), search and convert to stream_open all in-kernel nonseekable_open users for which read and write actually do not depend on ppos and where there is no other methods in file_operations which assume @offset access. I've verified each generated change manually - that it is correct to convert - and each other nonseekable_open instance left - that it is either not correct to convert there, or that it is not converted due to current stream_open.cocci limitations. The script also does not convert files that should be valid to convert, but that currently have .llseek = noop_llseek or generic_file_llseek for unknown reason despite file being opened with nonseekable_open (e.g. drivers/input/mousedev.c) Among cases converted 14 were potentially vulnerable to read vs write deadlock (see details in 10dce8af3422): drivers/char/pcmcia/cm4000_cs.c:1685:7-23: ERROR: cm4000_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/gnss/core.c:45:1-17: ERROR: gnss_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/hid/uhid.c:635:1-17: ERROR: uhid_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. 
drivers/infiniband/core/user_mad.c:988:1-17: ERROR: umad_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/input/evdev.c:527:1-17: ERROR: evdev_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/input/misc/uinput.c:401:1-17: ERROR: uinput_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/isdn/capi/capi.c:963:8-24: ERROR: capi_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/leds/uleds.c:77:1-17: ERROR: uleds_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/media/rc/lirc_dev.c:198:1-17: ERROR: lirc_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/s390/char/fs3270.c:488:1-17: ERROR: fs3270_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/usb/misc/ldusb.c:310:1-17: ERROR: ld_usb_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. drivers/xen/evtchn.c:667:8-24: ERROR: evtchn_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. net/batman-adv/icmp_socket.c:80:1-17: ERROR: batadv_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. net/rfkill/core.c:1146:8-24: ERROR: rfkill_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. and the rest were just safe to convert to stream_open because their read and write do not use ppos at all and corresponding file_operations do not have methods that assume @offset file access(*): arch/powerpc/platforms/52xx/mpc52xx_gpt.c:631:8-24: WARNING: mpc52xx_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_ibox_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_ibox_stat_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_mbox_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_mbox_stat_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_wbox_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_wbox_stat_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. arch/um/drivers/harddog_kern.c:88:8-24: WARNING: harddog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. arch/x86/kernel/cpu/microcode/core.c:430:33-49: WARNING: microcode_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/char/ds1620.c:215:8-24: WARNING: ds1620_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/char/dtlk.c:301:1-17: WARNING: dtlk_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/char/ipmi/ipmi_watchdog.c:840:9-25: WARNING: ipmi_wdog_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. 
drivers/char/pcmcia/scr24x_cs.c:95:8-24: WARNING: scr24x_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/char/tb0219.c:246:9-25: WARNING: tb0219_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/firewire/nosy.c:306:8-24: WARNING: nosy_ops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/hwmon/fschmd.c:840:8-24: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/hwmon/w83793.c:1344:8-24: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/infiniband/core/ucma.c:1747:8-24: WARNING: ucma_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/infiniband/core/ucm.c:1178:8-24: WARNING: ucm_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/infiniband/core/uverbs_main.c:1086:8-24: WARNING: uverbs_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/input/joydev.c:282:1-17: WARNING: joydev_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/pci/switch/switchtec.c:393:1-17: WARNING: switchtec_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/platform/chrome/cros_ec_debugfs.c:135:8-24: WARNING: cros_ec_console_log_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/rtc/rtc-ds1374.c:470:9-25: WARNING: ds1374_wdt_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/rtc/rtc-m41t80.c:805:9-25: WARNING: wdt_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/s390/char/tape_char.c:293:2-18: WARNING: tape_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/s390/char/zcore.c:194:8-24: WARNING: zcore_reipl_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/s390/crypto/zcrypt_api.c:528:8-24: WARNING: zcrypt_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/spi/spidev.c:594:1-17: WARNING: spidev_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/staging/pi433/pi433_if.c:974:1-17: WARNING: pi433_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. drivers/watchdog/acquirewdt.c:203:8-24: WARNING: acq_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/watchdog/advantechwdt.c:202:8-24: WARNING: advwdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/watchdog/alim1535_wdt.c:252:8-24: WARNING: ali_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/watchdog/alim7101_wdt.c:217:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/watchdog/ar7_wdt.c:166:8-24: WARNING: ar7_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. drivers/watchdog/at91rm9200_wdt.c:113:8-24: WARNING: at91wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open. 
drivers/watchdog/ath79_wdt.c:135:8-24: WARNING: ath79_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/bcm63xx_wdt.c:119:8-24: WARNING: bcm63xx_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/cpu5wdt.c:143:8-24: WARNING: cpu5wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/cpwd.c:397:8-24: WARNING: cpwd_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/eurotechwdt.c:319:8-24: WARNING: eurwdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/f71808e_wdt.c:528:8-24: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/gef_wdt.c:232:8-24: WARNING: gef_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/geodewdt.c:95:8-24: WARNING: geodewdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/ib700wdt.c:241:8-24: WARNING: ibwdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/ibmasr.c:326:8-24: WARNING: asr_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/indydog.c:80:8-24: WARNING: indydog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/intel_scu_watchdog.c:307:8-24: WARNING: intel_scu_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/iop_wdt.c:104:8-24: WARNING: iop_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/it8712f_wdt.c:330:8-24: WARNING: it8712f_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/ixp4xx_wdt.c:68:8-24: WARNING: ixp4xx_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/ks8695_wdt.c:145:8-24: WARNING: ks8695wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/m54xx_wdt.c:88:8-24: WARNING: m54xx_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/machzwd.c:336:8-24: WARNING: zf_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/mixcomwd.c:153:8-24: WARNING: mixcomwd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/mtx-1_wdt.c:121:8-24: WARNING: mtx1_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/mv64x60_wdt.c:136:8-24: WARNING: mv64x60_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/nuc900_wdt.c:134:8-24: WARNING: nuc900wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/nv_tco.c:164:8-24: WARNING: nv_tco_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pc87413_wdt.c:289:8-24: WARNING: pc87413_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pcwd.c:698:8-24: WARNING: pcwd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pcwd.c:737:8-24: WARNING: pcwd_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pcwd_pci.c:581:8-24: WARNING: pcipcwd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pcwd_pci.c:623:8-24: WARNING: pcipcwd_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pcwd_usb.c:488:8-24: WARNING: usb_pcwd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pcwd_usb.c:527:8-24: WARNING: usb_pcwd_temperature_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pika_wdt.c:121:8-24: WARNING: pikawdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/pnx833x_wdt.c:119:8-24: WARNING: pnx833x_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/rc32434_wdt.c:153:8-24: WARNING: rc32434_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/rdc321x_wdt.c:145:8-24: WARNING: rdc321x_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/riowd.c:79:1-17: WARNING: riowd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sa1100_wdt.c:62:8-24: WARNING: sa1100dog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sbc60xxwdt.c:211:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sbc7240_wdt.c:139:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sbc8360.c:274:8-24: WARNING: sbc8360_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sbc_epx_c3.c:81:8-24: WARNING: epx_c3_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sbc_fitpc2_wdt.c:78:8-24: WARNING: fitpc2_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sb_wdog.c:108:1-17: WARNING: sbwdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sc1200wdt.c:181:8-24: WARNING: sc1200wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sc520_wdt.c:261:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/sch311x_wdt.c:319:8-24: WARNING: sch311x_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/scx200_wdt.c:105:8-24: WARNING: scx200_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/smsc37b787_wdt.c:369:8-24: WARNING: wb_smsc_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/w83877f_wdt.c:227:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/w83977f_wdt.c:301:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wafer5823wdt.c:200:8-24: WARNING: wafwdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/watchdog_dev.c:828:8-24: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wdrtas.c:379:8-24: WARNING: wdrtas_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wdrtas.c:445:8-24: WARNING: wdrtas_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wdt285.c:104:1-17: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wdt977.c:276:8-24: WARNING: wdt977_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wdt.c:424:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wdt.c:484:8-24: WARNING: wdt_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wdt_pci.c:464:8-24: WARNING: wdtpci_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
drivers/watchdog/wdt_pci.c:527:8-24: WARNING: wdtpci_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
net/batman-adv/log.c:105:1-17: WARNING: batadv_log_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
sound/core/control.c:57:7-23: WARNING: snd_ctl_f_ops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
sound/core/rawmidi.c:385:7-23: WARNING: snd_rawmidi_f_ops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
sound/core/seq/seq_clientmgr.c:310:7-23: WARNING: snd_seq_f_ops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
sound/core/timer.c:1428:7-23: WARNING: snd_timer_f_ops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.

One can also recheck/review the patch by regenerating it with explanation comments included via

$ make coccicheck MODE=patch COCCI=scripts/coccinelle/api/stream_open.cocci SPFLAGS="-D explain"

(*) This second group also contains cases with read/write deadlocks that stream_open.cocci does not yet detect, but which are still valid to convert to stream_open since ppos is not used. For example, drivers/pci/switch/switchtec.c calls wait_for_completion_interruptible() in its .read, but stream_open.cocci currently detects only "wait_event*" as blocking.

Cc: Michael Kerrisk Cc: Yongzhi Pan Cc: Jonathan Corbet Cc: David Vrabel Cc: Juergen Gross Cc: Miklos Szeredi Cc: Tejun Heo Cc: Kirill Tkhai Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: Julia Lawall Cc: Nikolaus Rath Cc: Han-Wen Nienhuys Cc: Anatolij Gustschin Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "James R.
Van Zandt" Cc: Corey Minyard Cc: Harald Welte Acked-by: Lubomir Rintel [scr24x_cs] Cc: Stefan Richter Cc: Johan Hovold Cc: David Herrmann Cc: Jiri Kosina Cc: Benjamin Tissoires Cc: Jean Delvare Acked-by: Guenter Roeck [watchdog/* hwmon/*] Cc: Rudolf Marek Cc: Dmitry Torokhov Cc: Karsten Keil Cc: Jacek Anaszewski Cc: Pavel Machek Cc: Mauro Carvalho Chehab Cc: Kurt Schwemmer Acked-by: Logan Gunthorpe [drivers/pci/switch/switchtec] Acked-by: Bjorn Helgaas [drivers/pci/switch/switchtec] Cc: Benson Leung Acked-by: Enric Balletbo i Serra [platform/chrome] Cc: Alessandro Zummo Acked-by: Alexandre Belloni [rtc/*] Cc: Mark Brown Cc: Wim Van Sebroeck Cc: Florian Fainelli Cc: bcm-kernel-feedback-list@broadcom.com Cc: Wan ZongShun Cc: Zwane Mwaikambo Cc: Marek Lindner Cc: Simon Wunderlich Cc: Antonio Quartulli Cc: "David S. Miller" Cc: Johannes Berg Cc: Jaroslav Kysela Cc: Takashi Iwai Signed-off-by: Kirill Smelkov --- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 2 +- arch/um/drivers/harddog_kern.c | 2 +- arch/x86/kernel/cpu/microcode/core.c | 2 +- drivers/char/ds1620.c | 2 +- drivers/char/dtlk.c | 2 +- drivers/char/ipmi/ipmi_watchdog.c | 2 +- drivers/char/pcmcia/cm4000_cs.c | 2 +- drivers/char/pcmcia/scr24x_cs.c | 2 +- drivers/char/tb0219.c | 2 +- drivers/firewire/nosy.c | 2 +- drivers/gnss/core.c | 2 +- drivers/hid/uhid.c | 2 +- drivers/hwmon/fschmd.c | 2 +- drivers/hwmon/w83793.c | 2 +- drivers/infiniband/core/ucm.c | 2 +- drivers/infiniband/core/ucma.c | 2 +- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_main.c | 2 +- drivers/input/evdev.c | 2 +- drivers/input/joydev.c | 2 +- drivers/input/misc/uinput.c | 2 +- drivers/isdn/capi/capi.c | 2 +- drivers/leds/uleds.c | 2 +- drivers/media/rc/lirc_dev.c | 2 +- drivers/pci/switch/switchtec.c | 2 +- drivers/platform/chrome/cros_ec_debugfs.c | 2 +- drivers/rtc/rtc-ds1374.c | 2 +- drivers/rtc/rtc-m41t80.c | 2 +- drivers/s390/char/fs3270.c | 2 +- drivers/s390/char/tape_char.c | 2 +- drivers/s390/char/zcore.c | 2 +- drivers/s390/crypto/zcrypt_api.c | 2 +- drivers/spi/spidev.c | 2 +- drivers/staging/pi433/pi433_if.c | 2 +- drivers/usb/misc/ldusb.c | 2 +- drivers/watchdog/acquirewdt.c | 2 +- drivers/watchdog/advantechwdt.c | 2 +- drivers/watchdog/alim1535_wdt.c | 2 +- drivers/watchdog/alim7101_wdt.c | 2 +- drivers/watchdog/ar7_wdt.c | 2 +- drivers/watchdog/at91rm9200_wdt.c | 2 +- drivers/watchdog/ath79_wdt.c | 2 +- drivers/watchdog/bcm63xx_wdt.c | 2 +- drivers/watchdog/cpu5wdt.c | 2 +- drivers/watchdog/cpwd.c | 2 +- drivers/watchdog/eurotechwdt.c | 2 +- drivers/watchdog/f71808e_wdt.c | 2 +- drivers/watchdog/gef_wdt.c | 2 +- drivers/watchdog/geodewdt.c | 2 +- drivers/watchdog/ib700wdt.c | 2 +- drivers/watchdog/ibmasr.c | 2 +- drivers/watchdog/indydog.c | 2 +- drivers/watchdog/intel_scu_watchdog.c | 2 +- drivers/watchdog/iop_wdt.c | 2 +- drivers/watchdog/it8712f_wdt.c | 2 +- drivers/watchdog/ixp4xx_wdt.c | 2 +- drivers/watchdog/ks8695_wdt.c | 2 +- drivers/watchdog/m54xx_wdt.c | 2 +- drivers/watchdog/machzwd.c | 2 +- drivers/watchdog/mixcomwd.c | 2 +- drivers/watchdog/mtx-1_wdt.c | 2 +- drivers/watchdog/mv64x60_wdt.c | 2 +- drivers/watchdog/nuc900_wdt.c | 2 +- drivers/watchdog/nv_tco.c | 2 +- drivers/watchdog/pc87413_wdt.c | 2 +- drivers/watchdog/pcwd.c | 4 ++-- drivers/watchdog/pcwd_pci.c | 4 ++-- drivers/watchdog/pcwd_usb.c | 4 ++-- drivers/watchdog/pika_wdt.c | 2 +- drivers/watchdog/pnx833x_wdt.c | 2 +- drivers/watchdog/rc32434_wdt.c | 2 +- drivers/watchdog/rdc321x_wdt.c | 2 +- 
drivers/watchdog/riowd.c | 2 +- drivers/watchdog/sa1100_wdt.c | 2 +- drivers/watchdog/sb_wdog.c | 2 +- drivers/watchdog/sbc60xxwdt.c | 2 +- drivers/watchdog/sbc7240_wdt.c | 2 +- drivers/watchdog/sbc8360.c | 2 +- drivers/watchdog/sbc_epx_c3.c | 2 +- drivers/watchdog/sbc_fitpc2_wdt.c | 2 +- drivers/watchdog/sc1200wdt.c | 2 +- drivers/watchdog/sc520_wdt.c | 2 +- drivers/watchdog/sch311x_wdt.c | 2 +- drivers/watchdog/scx200_wdt.c | 2 +- drivers/watchdog/smsc37b787_wdt.c | 2 +- drivers/watchdog/w83877f_wdt.c | 2 +- drivers/watchdog/w83977f_wdt.c | 2 +- drivers/watchdog/wafer5823wdt.c | 2 +- drivers/watchdog/watchdog_dev.c | 2 +- drivers/watchdog/wdrtas.c | 4 ++-- drivers/watchdog/wdt.c | 4 ++-- drivers/watchdog/wdt285.c | 2 +- drivers/watchdog/wdt977.c | 2 +- drivers/watchdog/wdt_pci.c | 4 ++-- drivers/xen/evtchn.c | 2 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/log.c | 2 +- net/rfkill/core.c | 2 +- sound/core/control.c | 2 +- sound/core/rawmidi.c | 2 +- sound/core/seq/seq_clientmgr.c | 2 +- sound/core/timer.c | 2 +- 103 files changed, 109 insertions(+), 109 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 17cf249b18ee..3cb2f07ce8eb 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -628,7 +628,7 @@ static int mpc52xx_wdt_open(struct inode *inode, struct file *file) } file->private_data = mpc52xx_gpt_wdt; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int mpc52xx_wdt_release(struct inode *inode, struct file *file) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 48c2477e7e2a..bfb9ca99ac05 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -588,7 +588,7 @@ static int spufs_pipe_open(struct inode *inode, struct file *file) struct spufs_inode_info *i = SPUFS_I(inode); file->private_data = i->i_ctx; - return nonseekable_open(inode, file); + return stream_open(inode, file); } /* diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c index 6d381279b362..000cb69ba0bc 100644 --- a/arch/um/drivers/harddog_kern.c +++ b/arch/um/drivers/harddog_kern.c @@ -85,7 +85,7 @@ static int harddog_open(struct inode *inode, struct file *file) timer_alive = 1; spin_unlock(&lock); mutex_unlock(&harddog_mutex); - return nonseekable_open(inode, file); + return stream_open(inode, file); err: spin_unlock(&lock); mutex_unlock(&harddog_mutex); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5260185cbf7b..639817729ed4 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -427,7 +427,7 @@ static int do_microcode_update(const void __user *buf, size_t size) static int microcode_open(struct inode *inode, struct file *file) { - return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM; + return capable(CAP_SYS_RAWIO) ? 
stream_open(inode, file) : -EPERM; } static ssize_t microcode_write(struct file *file, const char __user *buf, diff --git a/drivers/char/ds1620.c b/drivers/char/ds1620.c index a5ecf6dae02e..373f549525fe 100644 --- a/drivers/char/ds1620.c +++ b/drivers/char/ds1620.c @@ -212,7 +212,7 @@ static void ds1620_read_state(struct therm *therm) static int ds1620_open(struct inode *inode, struct file *file) { - return nonseekable_open(inode, file); + return stream_open(inode, file); } static ssize_t diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c index 669c3311adc4..4fed8fafa0f0 100644 --- a/drivers/char/dtlk.c +++ b/drivers/char/dtlk.c @@ -302,7 +302,7 @@ static int dtlk_open(struct inode *inode, struct file *file) case DTLK_MINOR: if (dtlk_busy) return -EBUSY; - return nonseekable_open(inode, file); + return stream_open(inode, file); default: return -ENXIO; diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 2924a4bc4a32..74c6d1f34132 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -837,7 +837,7 @@ static int ipmi_open(struct inode *ino, struct file *filep) * first heartbeat. */ ipmi_start_timer_on_heartbeat = 1; - return nonseekable_open(ino, filep); + return stream_open(ino, filep); default: return (-ENODEV); diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c index 7a4eb86aedac..15bf585af5d3 100644 --- a/drivers/char/pcmcia/cm4000_cs.c +++ b/drivers/char/pcmcia/cm4000_cs.c @@ -1682,7 +1682,7 @@ static int cmm_open(struct inode *inode, struct file *filp) link->open = 1; /* only one open per device */ DEBUGP(2, dev, "<- cmm_open\n"); - ret = nonseekable_open(inode, filp); + ret = stream_open(inode, filp); out: mutex_unlock(&cmm_mutex); return ret; diff --git a/drivers/char/pcmcia/scr24x_cs.c b/drivers/char/pcmcia/scr24x_cs.c index f6b43d9350f0..04b39c3596cc 100644 --- a/drivers/char/pcmcia/scr24x_cs.c +++ b/drivers/char/pcmcia/scr24x_cs.c @@ -92,7 +92,7 @@ static int scr24x_open(struct inode *inode, struct file *filp) kref_get(&dev->refcnt); filp->private_data = dev; - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } static int scr24x_release(struct inode *inode, struct file *filp) diff --git a/drivers/char/tb0219.c b/drivers/char/tb0219.c index 7c19d9b22785..e8614ea843e2 100644 --- a/drivers/char/tb0219.c +++ b/drivers/char/tb0219.c @@ -243,7 +243,7 @@ static int tanbac_tb0219_open(struct inode *inode, struct file *file) case 16 ... 23: case 32 ... 39: case 48 ... 
55: - return nonseekable_open(inode, file); + return stream_open(inode, file); default: break; } diff --git a/drivers/firewire/nosy.c b/drivers/firewire/nosy.c index a128dd1126ae..515e96db4391 100644 --- a/drivers/firewire/nosy.c +++ b/drivers/firewire/nosy.c @@ -303,7 +303,7 @@ nosy_open(struct inode *inode, struct file *file) file->private_data = client; - return nonseekable_open(inode, file); + return stream_open(inode, file); fail: kfree(client); lynx_put(lynx); diff --git a/drivers/gnss/core.c b/drivers/gnss/core.c index 320cfca80d5f..e6f94501cb28 100644 --- a/drivers/gnss/core.c +++ b/drivers/gnss/core.c @@ -42,7 +42,7 @@ static int gnss_open(struct inode *inode, struct file *file) get_device(&gdev->dev); - nonseekable_open(inode, file); + stream_open(inode, file); file->private_data = gdev; down_write(&gdev->rwsem); diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 840634e0f1e3..dbaead0a5371 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -632,7 +632,7 @@ static int uhid_char_open(struct inode *inode, struct file *file) INIT_WORK(&uhid->worker, uhid_device_add_worker); file->private_data = uhid; - nonseekable_open(inode, file); + stream_open(inode, file); return 0; } diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c index 042a166e1858..8fb54079fac8 100644 --- a/drivers/hwmon/fschmd.c +++ b/drivers/hwmon/fschmd.c @@ -837,7 +837,7 @@ static int watchdog_open(struct inode *inode, struct file *filp) watchdog_trigger(data); filp->private_data = data; - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } static int watchdog_release(struct inode *inode, struct file *filp) diff --git a/drivers/hwmon/w83793.c b/drivers/hwmon/w83793.c index 0af0f6283b35..e94ae1bb3cf0 100644 --- a/drivers/hwmon/w83793.c +++ b/drivers/hwmon/w83793.c @@ -1341,7 +1341,7 @@ static int watchdog_open(struct inode *inode, struct file *filp) /* Store pointer to data into filp's private data */ filp->private_data = data; - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } static int watchdog_close(struct inode *inode, struct file *filp) diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 7541fbaf58a3..65c3230f5663 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1175,7 +1175,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp) file->filp = filp; file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } static int ib_ucm_close(struct inode *inode, struct file *filp) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 7468b26b8a01..140a338a135f 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1744,7 +1744,7 @@ static int ucma_open(struct inode *inode, struct file *filp) filp->private_data = file; file->filp = filp; - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } static int ucma_close(struct inode *inode, struct file *filp) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 02b7947ab215..b58b07c03cfb 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -985,7 +985,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); - nonseekable_open(inode, filp); + stream_open(inode, filp); out: mutex_unlock(&port->file_mutex); return ret; diff --git 
a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index c489f545baae..8b43dd96d3b2 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1132,7 +1132,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) setup_ufile_idr_uobject(file); - return nonseekable_open(inode, filp); + return stream_open(inode, filp); err_module: module_put(ib_dev->owner); diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index f48369d6f3a0..f040d8881ff2 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -524,7 +524,7 @@ static int evdev_open(struct inode *inode, struct file *file) goto err_free_client; file->private_data = client; - nonseekable_open(inode, file); + stream_open(inode, file); return 0; diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index 4c1e427dfabb..d806f6be4788 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -279,7 +279,7 @@ static int joydev_open(struct inode *inode, struct file *file) goto err_free_client; file->private_data = client; - nonseekable_open(inode, file); + stream_open(inode, file); return 0; diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 26ec603fe220..1a6762fc38f9 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -398,7 +398,7 @@ static int uinput_open(struct inode *inode, struct file *file) newdev->state = UIST_NEW_DEVICE; file->private_data = newdev; - nonseekable_open(inode, file); + stream_open(inode, file); return 0; } diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index e1da70a9530c..3c3ad42f22bf 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -960,7 +960,7 @@ static int capi_open(struct inode *inode, struct file *file) list_add_tail(&cdev->list, &capidev_list); mutex_unlock(&capidev_list_lock); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int capi_release(struct inode *inode, struct file *file) diff --git a/drivers/leds/uleds.c b/drivers/leds/uleds.c index 0c43bfac9598..08b6a769ff8f 100644 --- a/drivers/leds/uleds.c +++ b/drivers/leds/uleds.c @@ -74,7 +74,7 @@ static int uleds_open(struct inode *inode, struct file *file) udev->state = ULEDS_STATE_UNKNOWN; file->private_data = udev; - nonseekable_open(inode, file); + stream_open(inode, file); return 0; } diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index f862f1b7f996..92db1e83c192 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -195,7 +195,7 @@ static int ir_lirc_open(struct inode *inode, struct file *file) list_add(&fh->list, &dev->lirc_fh); spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); - nonseekable_open(inode, file); + stream_open(inode, file); return 0; out_kfifo: diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index e22766c79fe9..0f7b80144863 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -390,7 +390,7 @@ static int switchtec_dev_open(struct inode *inode, struct file *filp) return PTR_ERR(stuser); filp->private_data = stuser; - nonseekable_open(inode, filp); + stream_open(inode, filp); dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser); diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c index 71308766e891..2b8e8a01a739 100644 --- a/drivers/platform/chrome/cros_ec_debugfs.c +++ b/drivers/platform/chrome/cros_ec_debugfs.c @@ -132,7 +132,7 @@ static int 
cros_ec_console_log_open(struct inode *inode, struct file *file) { file->private_data = inode->i_private; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static ssize_t cros_ec_console_log_read(struct file *file, char __user *buf, diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c index 38a2e9e684df..225a8df1d4e9 100644 --- a/drivers/rtc/rtc-ds1374.c +++ b/drivers/rtc/rtc-ds1374.c @@ -467,7 +467,7 @@ static int ds1374_wdt_open(struct inode *inode, struct file *file) */ wdt_is_open = 1; mutex_unlock(&ds1374->mutex); - return nonseekable_open(inode, file); + return stream_open(inode, file); } return -ENODEV; } diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index ebf50b1540f2..dd5a8991f75b 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -802,7 +802,7 @@ static int wdt_open(struct inode *inode, struct file *file) */ wdt_is_open = 1; mutex_unlock(&m41t80_rtc_mutex); - return nonseekable_open(inode, file); + return stream_open(inode, file); } return -ENODEV; } diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c index 8b48ba9c598e..4c4683d8784a 100644 --- a/drivers/s390/char/fs3270.c +++ b/drivers/s390/char/fs3270.c @@ -486,7 +486,7 @@ fs3270_open(struct inode *inode, struct file *filp) raw3270_del_view(&fp->view); goto out; } - nonseekable_open(inode, filp); + stream_open(inode, filp); filp->private_data = fp; out: mutex_unlock(&fs3270_mutex); diff --git a/drivers/s390/char/tape_char.c b/drivers/s390/char/tape_char.c index fc206c9d1c56..ea4253939555 100644 --- a/drivers/s390/char/tape_char.c +++ b/drivers/s390/char/tape_char.c @@ -290,7 +290,7 @@ tapechar_open (struct inode *inode, struct file *filp) rc = tape_open(device); if (rc == 0) { filp->private_data = device; - nonseekable_open(inode, filp); + stream_open(inode, filp); } else tape_put_device(device); diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 76d3c50bf078..a57e1c55094f 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -191,7 +191,7 @@ static ssize_t zcore_reipl_write(struct file *filp, const char __user *buf, static int zcore_reipl_open(struct inode *inode, struct file *filp) { - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } static int zcore_reipl_release(struct inode *inode, struct file *filp) diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 689c2af7026a..6bfdc69a13e7 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -525,7 +525,7 @@ static int zcrypt_open(struct inode *inode, struct file *filp) filp->private_data = (void *) perms; atomic_inc(&zcrypt_open_count); - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } /** diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index b0c76e2626ce..7fd0d9943160 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -591,7 +591,7 @@ static int spidev_open(struct inode *inode, struct file *filp) spidev->users++; filp->private_data = spidev; - nonseekable_open(inode, filp); + stream_open(inode, filp); mutex_unlock(&device_list_lock); return 0; diff --git a/drivers/staging/pi433/pi433_if.c b/drivers/staging/pi433/pi433_if.c index b2314636dc89..2299a11b1878 100644 --- a/drivers/staging/pi433/pi433_if.c +++ b/drivers/staging/pi433/pi433_if.c @@ -971,7 +971,7 @@ static int pi433_open(struct inode *inode, struct file *filp) /* instance data as context */ filp->private_data = instance; - nonseekable_open(inode, 
filp); + stream_open(inode, filp); return 0; } diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c index 006762b72ff5..6581774bdfa4 100644 --- a/drivers/usb/misc/ldusb.c +++ b/drivers/usb/misc/ldusb.c @@ -307,7 +307,7 @@ static int ld_usb_open(struct inode *inode, struct file *file) int retval; struct usb_interface *interface; - nonseekable_open(inode, file); + stream_open(inode, file); subminor = iminor(inode); interface = usb_find_interface(&ld_usb_driver, subminor); diff --git a/drivers/watchdog/acquirewdt.c b/drivers/watchdog/acquirewdt.c index d6210d946082..957d1255d4ca 100644 --- a/drivers/watchdog/acquirewdt.c +++ b/drivers/watchdog/acquirewdt.c @@ -200,7 +200,7 @@ static int acq_open(struct inode *inode, struct file *file) /* Activate */ acq_keepalive(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int acq_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/advantechwdt.c b/drivers/watchdog/advantechwdt.c index f61944369c1a..2766af292a71 100644 --- a/drivers/watchdog/advantechwdt.c +++ b/drivers/watchdog/advantechwdt.c @@ -199,7 +199,7 @@ static int advwdt_open(struct inode *inode, struct file *file) */ advwdt_ping(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int advwdt_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/alim1535_wdt.c b/drivers/watchdog/alim1535_wdt.c index 60f0c2eb8531..39a07bb5f6d5 100644 --- a/drivers/watchdog/alim1535_wdt.c +++ b/drivers/watchdog/alim1535_wdt.c @@ -249,7 +249,7 @@ static int ali_open(struct inode *inode, struct file *file) /* Activate */ ali_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /* diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c index 12f7ea62dddd..7e9884960eb9 100644 --- a/drivers/watchdog/alim7101_wdt.c +++ b/drivers/watchdog/alim7101_wdt.c @@ -214,7 +214,7 @@ static int fop_open(struct inode *inode, struct file *file) return -EBUSY; /* Good, fire up the show */ wdt_startup(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int fop_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/ar7_wdt.c b/drivers/watchdog/ar7_wdt.c index ee1ab12ab04f..b9b2d06b3879 100644 --- a/drivers/watchdog/ar7_wdt.c +++ b/drivers/watchdog/ar7_wdt.c @@ -163,7 +163,7 @@ static int ar7_wdt_open(struct inode *inode, struct file *file) ar7_wdt_enable_wdt(); expect_close = 0; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int ar7_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/at91rm9200_wdt.c b/drivers/watchdog/at91rm9200_wdt.c index b45fc0aee667..907a4545dee6 100644 --- a/drivers/watchdog/at91rm9200_wdt.c +++ b/drivers/watchdog/at91rm9200_wdt.c @@ -110,7 +110,7 @@ static int at91_wdt_open(struct inode *inode, struct file *file) return -EBUSY; at91_wdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /* diff --git a/drivers/watchdog/ath79_wdt.c b/drivers/watchdog/ath79_wdt.c index e2209bf5fa8a..4f56b63f9691 100644 --- a/drivers/watchdog/ath79_wdt.c +++ b/drivers/watchdog/ath79_wdt.c @@ -132,7 +132,7 @@ static int ath79_wdt_open(struct inode *inode, struct file *file) clear_bit(WDT_FLAGS_EXPECT_CLOSE, &wdt_flags); ath79_wdt_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int ath79_wdt_release(struct inode *inode, struct file *file) diff --git 
a/drivers/watchdog/bcm63xx_wdt.c b/drivers/watchdog/bcm63xx_wdt.c index d3c1113e774c..e2af37c9a266 100644 --- a/drivers/watchdog/bcm63xx_wdt.c +++ b/drivers/watchdog/bcm63xx_wdt.c @@ -116,7 +116,7 @@ static int bcm63xx_wdt_open(struct inode *inode, struct file *file) return -EBUSY; bcm63xx_wdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int bcm63xx_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/cpu5wdt.c b/drivers/watchdog/cpu5wdt.c index 6cfb102c397c..475360de6e9e 100644 --- a/drivers/watchdog/cpu5wdt.c +++ b/drivers/watchdog/cpu5wdt.c @@ -140,7 +140,7 @@ static int cpu5wdt_open(struct inode *inode, struct file *file) { if (test_and_set_bit(0, &cpu5wdt_device.inuse)) return -EBUSY; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int cpu5wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c index 32156e199c51..b5b078bdebe6 100644 --- a/drivers/watchdog/cpwd.c +++ b/drivers/watchdog/cpwd.c @@ -394,7 +394,7 @@ static int cpwd_open(struct inode *inode, struct file *f) mutex_unlock(&cpwd_mutex); - return nonseekable_open(inode, f); + return stream_open(inode, f); } static int cpwd_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/eurotechwdt.c b/drivers/watchdog/eurotechwdt.c index 47f77a6fdfd6..89129e6fa9b6 100644 --- a/drivers/watchdog/eurotechwdt.c +++ b/drivers/watchdog/eurotechwdt.c @@ -316,7 +316,7 @@ static int eurwdt_open(struct inode *inode, struct file *file) eurwdt_timeout = WDT_TIMEOUT; /* initial timeout */ /* Activate the WDT */ eurwdt_activate_timer(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /** diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c index 9a1c761258ce..021c6ace9462 100644 --- a/drivers/watchdog/f71808e_wdt.c +++ b/drivers/watchdog/f71808e_wdt.c @@ -525,7 +525,7 @@ static int watchdog_open(struct inode *inode, struct file *file) __module_get(THIS_MODULE); watchdog.expect_close = 0; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int watchdog_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/gef_wdt.c b/drivers/watchdog/gef_wdt.c index 006e2348022c..26350b319505 100644 --- a/drivers/watchdog/gef_wdt.c +++ b/drivers/watchdog/gef_wdt.c @@ -229,7 +229,7 @@ static int gef_wdt_open(struct inode *inode, struct file *file) gef_wdt_handler_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int gef_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/geodewdt.c b/drivers/watchdog/geodewdt.c index 88e01238f01b..c5a727da6657 100644 --- a/drivers/watchdog/geodewdt.c +++ b/drivers/watchdog/geodewdt.c @@ -92,7 +92,7 @@ static int geodewdt_open(struct inode *inode, struct file *file) __module_get(THIS_MODULE); geodewdt_ping(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int geodewdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/ib700wdt.c b/drivers/watchdog/ib700wdt.c index cc262284a6aa..30d6cec582af 100644 --- a/drivers/watchdog/ib700wdt.c +++ b/drivers/watchdog/ib700wdt.c @@ -238,7 +238,7 @@ static int ibwdt_open(struct inode *inode, struct file *file) /* Activate */ ibwdt_ping(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int ibwdt_close(struct inode *inode, struct 
file *file) diff --git a/drivers/watchdog/ibmasr.c b/drivers/watchdog/ibmasr.c index 366b0474f278..897f7eda9e6a 100644 --- a/drivers/watchdog/ibmasr.c +++ b/drivers/watchdog/ibmasr.c @@ -323,7 +323,7 @@ static int asr_open(struct inode *inode, struct file *file) asr_toggle(); asr_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int asr_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/indydog.c b/drivers/watchdog/indydog.c index 5d20cdd30efe..5592b975fe3a 100644 --- a/drivers/watchdog/indydog.c +++ b/drivers/watchdog/indydog.c @@ -77,7 +77,7 @@ static int indydog_open(struct inode *inode, struct file *file) pr_info("Started watchdog timer\n"); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int indydog_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/intel_scu_watchdog.c b/drivers/watchdog/intel_scu_watchdog.c index 0caab6241eb7..3181a72c7ddf 100644 --- a/drivers/watchdog/intel_scu_watchdog.c +++ b/drivers/watchdog/intel_scu_watchdog.c @@ -304,7 +304,7 @@ static int intel_scu_open(struct inode *inode, struct file *file) if (watchdog_device.driver_closed) return -EPERM; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int intel_scu_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/iop_wdt.c b/drivers/watchdog/iop_wdt.c index b16013ffacc2..d910a7dec21b 100644 --- a/drivers/watchdog/iop_wdt.c +++ b/drivers/watchdog/iop_wdt.c @@ -101,7 +101,7 @@ static int iop_wdt_open(struct inode *inode, struct file *file) clear_bit(WDT_OK_TO_CLOSE, &wdt_status); wdt_enable(); set_bit(WDT_ENABLED, &wdt_status); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static ssize_t iop_wdt_write(struct file *file, const char *data, size_t len, diff --git a/drivers/watchdog/it8712f_wdt.c b/drivers/watchdog/it8712f_wdt.c index 41b3979a9d87..b1567240a0e6 100644 --- a/drivers/watchdog/it8712f_wdt.c +++ b/drivers/watchdog/it8712f_wdt.c @@ -327,7 +327,7 @@ static int it8712f_wdt_open(struct inode *inode, struct file *file) ret = it8712f_wdt_enable(); if (ret) return ret; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int it8712f_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/ixp4xx_wdt.c b/drivers/watchdog/ixp4xx_wdt.c index f20cc53ff719..dd139cda936c 100644 --- a/drivers/watchdog/ixp4xx_wdt.c +++ b/drivers/watchdog/ixp4xx_wdt.c @@ -65,7 +65,7 @@ static int ixp4xx_wdt_open(struct inode *inode, struct file *file) clear_bit(WDT_OK_TO_CLOSE, &wdt_status); wdt_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static ssize_t diff --git a/drivers/watchdog/ks8695_wdt.c b/drivers/watchdog/ks8695_wdt.c index 1e41818a44bc..0565cf30017b 100644 --- a/drivers/watchdog/ks8695_wdt.c +++ b/drivers/watchdog/ks8695_wdt.c @@ -142,7 +142,7 @@ static int ks8695_wdt_open(struct inode *inode, struct file *file) return -EBUSY; ks8695_wdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /* diff --git a/drivers/watchdog/m54xx_wdt.c b/drivers/watchdog/m54xx_wdt.c index da6fa2b68074..752d03620f0a 100644 --- a/drivers/watchdog/m54xx_wdt.c +++ b/drivers/watchdog/m54xx_wdt.c @@ -85,7 +85,7 @@ static int m54xx_wdt_open(struct inode *inode, struct file *file) clear_bit(WDT_OK_TO_CLOSE, &wdt_status); wdt_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); 
} static ssize_t m54xx_wdt_write(struct file *file, const char *data, diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c index 88d823d87a4b..53759415cf06 100644 --- a/drivers/watchdog/machzwd.c +++ b/drivers/watchdog/machzwd.c @@ -333,7 +333,7 @@ static int zf_open(struct inode *inode, struct file *file) if (nowayout) __module_get(THIS_MODULE); zf_timer_on(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int zf_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/mixcomwd.c b/drivers/watchdog/mixcomwd.c index 3cc07447c655..ece56db0a379 100644 --- a/drivers/watchdog/mixcomwd.c +++ b/drivers/watchdog/mixcomwd.c @@ -150,7 +150,7 @@ static int mixcomwd_open(struct inode *inode, struct file *file) mixcomwd_timer_alive = 0; } } - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int mixcomwd_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/mtx-1_wdt.c b/drivers/watchdog/mtx-1_wdt.c index e028e0a2eca0..25a92857b217 100644 --- a/drivers/watchdog/mtx-1_wdt.c +++ b/drivers/watchdog/mtx-1_wdt.c @@ -118,7 +118,7 @@ static int mtx1_wdt_open(struct inode *inode, struct file *file) { if (test_and_set_bit(0, &mtx1_wdt_device.inuse)) return -EBUSY; - return nonseekable_open(inode, file); + return stream_open(inode, file); } diff --git a/drivers/watchdog/mv64x60_wdt.c b/drivers/watchdog/mv64x60_wdt.c index 315275d7bab6..c785f4f0a196 100644 --- a/drivers/watchdog/mv64x60_wdt.c +++ b/drivers/watchdog/mv64x60_wdt.c @@ -133,7 +133,7 @@ static int mv64x60_wdt_open(struct inode *inode, struct file *file) mv64x60_wdt_handler_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int mv64x60_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/nuc900_wdt.c b/drivers/watchdog/nuc900_wdt.c index 830bd04ff911..8a36350bab7b 100644 --- a/drivers/watchdog/nuc900_wdt.c +++ b/drivers/watchdog/nuc900_wdt.c @@ -131,7 +131,7 @@ static int nuc900_wdt_open(struct inode *inode, struct file *file) nuc900_wdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int nuc900_wdt_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/nv_tco.c b/drivers/watchdog/nv_tco.c index a0fabf6f92b0..98d4f5371cf4 100644 --- a/drivers/watchdog/nv_tco.c +++ b/drivers/watchdog/nv_tco.c @@ -161,7 +161,7 @@ static int nv_tco_open(struct inode *inode, struct file *file) /* Reload and activate timer */ tco_timer_keepalive(); tco_timer_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int nv_tco_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/pc87413_wdt.c b/drivers/watchdog/pc87413_wdt.c index 2ffa39b46970..ca21d6c240a3 100644 --- a/drivers/watchdog/pc87413_wdt.c +++ b/drivers/watchdog/pc87413_wdt.c @@ -286,7 +286,7 @@ static int pc87413_open(struct inode *inode, struct file *file) pr_info("Watchdog enabled. 
Timeout set to %d minute(s).\n", timeout); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /** diff --git a/drivers/watchdog/pcwd.c b/drivers/watchdog/pcwd.c index b72ce68eacd3..a3415cf07c98 100644 --- a/drivers/watchdog/pcwd.c +++ b/drivers/watchdog/pcwd.c @@ -695,7 +695,7 @@ static int pcwd_open(struct inode *inode, struct file *file) /* Activate */ pcwd_start(); pcwd_keepalive(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int pcwd_close(struct inode *inode, struct file *file) @@ -734,7 +734,7 @@ static int pcwd_temp_open(struct inode *inode, struct file *file) if (!pcwd_private.supports_temp) return -ENODEV; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int pcwd_temp_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/pcwd_pci.c b/drivers/watchdog/pcwd_pci.c index 1f78f0908621..5773d2591d3f 100644 --- a/drivers/watchdog/pcwd_pci.c +++ b/drivers/watchdog/pcwd_pci.c @@ -578,7 +578,7 @@ static int pcipcwd_open(struct inode *inode, struct file *file) /* Activate */ pcipcwd_start(); pcipcwd_keepalive(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int pcipcwd_release(struct inode *inode, struct file *file) @@ -620,7 +620,7 @@ static int pcipcwd_temp_open(struct inode *inode, struct file *file) if (!pcipcwd_private.supports_temp) return -ENODEV; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int pcipcwd_temp_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/pcwd_usb.c b/drivers/watchdog/pcwd_usb.c index 4d02f26156f9..5de6182dae33 100644 --- a/drivers/watchdog/pcwd_usb.c +++ b/drivers/watchdog/pcwd_usb.c @@ -485,7 +485,7 @@ static int usb_pcwd_open(struct inode *inode, struct file *file) /* Activate */ usb_pcwd_start(usb_pcwd_device); usb_pcwd_keepalive(usb_pcwd_device); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int usb_pcwd_release(struct inode *inode, struct file *file) @@ -524,7 +524,7 @@ static ssize_t usb_pcwd_temperature_read(struct file *file, char __user *data, static int usb_pcwd_temperature_open(struct inode *inode, struct file *file) { - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int usb_pcwd_temperature_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/pika_wdt.c b/drivers/watchdog/pika_wdt.c index bb97f5b2f7eb..8938b3fb2b2d 100644 --- a/drivers/watchdog/pika_wdt.c +++ b/drivers/watchdog/pika_wdt.c @@ -118,7 +118,7 @@ static int pikawdt_open(struct inode *inode, struct file *file) pikawdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /* diff --git a/drivers/watchdog/pnx833x_wdt.c b/drivers/watchdog/pnx833x_wdt.c index 882fdcb46ad1..312899f39fd2 100644 --- a/drivers/watchdog/pnx833x_wdt.c +++ b/drivers/watchdog/pnx833x_wdt.c @@ -116,7 +116,7 @@ static int pnx833x_wdt_open(struct inode *inode, struct file *file) pr_info("Started watchdog timer\n"); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int pnx833x_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/rc32434_wdt.c b/drivers/watchdog/rc32434_wdt.c index 3a75f3b53452..e74d5cf272ab 100644 --- a/drivers/watchdog/rc32434_wdt.c +++ b/drivers/watchdog/rc32434_wdt.c @@ -150,7 +150,7 @@ static int rc32434_wdt_open(struct inode *inode, struct file *file) rc32434_wdt_start(); 
rc32434_wdt_ping(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int rc32434_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/rdc321x_wdt.c b/drivers/watchdog/rdc321x_wdt.c index a281aa84bfb1..4382e9556860 100644 --- a/drivers/watchdog/rdc321x_wdt.c +++ b/drivers/watchdog/rdc321x_wdt.c @@ -142,7 +142,7 @@ static int rdc321x_wdt_open(struct inode *inode, struct file *file) if (test_and_set_bit(0, &rdc321x_wdt_device.inuse)) return -EBUSY; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int rdc321x_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c index aba53424605e..f7f7a7a62022 100644 --- a/drivers/watchdog/riowd.c +++ b/drivers/watchdog/riowd.c @@ -76,7 +76,7 @@ static void riowd_writereg(struct riowd *p, u8 val, int index) static int riowd_open(struct inode *inode, struct file *filp) { - nonseekable_open(inode, filp); + stream_open(inode, filp); return 0; } diff --git a/drivers/watchdog/sa1100_wdt.c b/drivers/watchdog/sa1100_wdt.c index d3be4f831db5..bfa035e1a75e 100644 --- a/drivers/watchdog/sa1100_wdt.c +++ b/drivers/watchdog/sa1100_wdt.c @@ -59,7 +59,7 @@ static int sa1100dog_open(struct inode *inode, struct file *file) writel_relaxed(OSSR_M3, OSSR); writel_relaxed(OWER_WME, OWER); writel_relaxed(readl_relaxed(OIER) | OIER_E3, OIER); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /* diff --git a/drivers/watchdog/sb_wdog.c b/drivers/watchdog/sb_wdog.c index 3abae50773b8..0692d42e5c67 100644 --- a/drivers/watchdog/sb_wdog.c +++ b/drivers/watchdog/sb_wdog.c @@ -105,7 +105,7 @@ static const struct watchdog_info ident = { */ static int sbwdog_open(struct inode *inode, struct file *file) { - nonseekable_open(inode, file); + stream_open(inode, file); if (test_and_set_bit(0, &sbwdog_gate)) return -EBUSY; __module_get(THIS_MODULE); diff --git a/drivers/watchdog/sbc60xxwdt.c b/drivers/watchdog/sbc60xxwdt.c index 72d15fd1f183..4d127a91cbdc 100644 --- a/drivers/watchdog/sbc60xxwdt.c +++ b/drivers/watchdog/sbc60xxwdt.c @@ -208,7 +208,7 @@ static int fop_open(struct inode *inode, struct file *file) /* Good, fire up the show */ wdt_startup(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int fop_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/sbc7240_wdt.c b/drivers/watchdog/sbc7240_wdt.c index 5f268add17ce..efc81b318939 100644 --- a/drivers/watchdog/sbc7240_wdt.c +++ b/drivers/watchdog/sbc7240_wdt.c @@ -136,7 +136,7 @@ static int fop_open(struct inode *inode, struct file *file) wdt_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int fop_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/sbc8360.c b/drivers/watchdog/sbc8360.c index da60560ca446..3396024e7b76 100644 --- a/drivers/watchdog/sbc8360.c +++ b/drivers/watchdog/sbc8360.c @@ -271,7 +271,7 @@ static int sbc8360_open(struct inode *inode, struct file *file) /* Activate and ping once to start the countdown */ sbc8360_activate(); sbc8360_ping(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int sbc8360_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/sbc_epx_c3.c b/drivers/watchdog/sbc_epx_c3.c index a1c502e0d8ec..783037ffd7d8 100644 --- a/drivers/watchdog/sbc_epx_c3.c +++ b/drivers/watchdog/sbc_epx_c3.c @@ -78,7 +78,7 @@ static int 
epx_c3_open(struct inode *inode, struct file *file) epx_c3_alive = 1; pr_info("Started watchdog timer\n"); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int epx_c3_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/sbc_fitpc2_wdt.c b/drivers/watchdog/sbc_fitpc2_wdt.c index a517d8bae757..3822a60a8d2b 100644 --- a/drivers/watchdog/sbc_fitpc2_wdt.c +++ b/drivers/watchdog/sbc_fitpc2_wdt.c @@ -75,7 +75,7 @@ static int fitpc2_wdt_open(struct inode *inode, struct file *file) wdt_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static ssize_t fitpc2_wdt_write(struct file *file, const char *data, diff --git a/drivers/watchdog/sc1200wdt.c b/drivers/watchdog/sc1200wdt.c index e035a4d4b299..3c2e9355410a 100644 --- a/drivers/watchdog/sc1200wdt.c +++ b/drivers/watchdog/sc1200wdt.c @@ -178,7 +178,7 @@ static int sc1200wdt_open(struct inode *inode, struct file *file) sc1200wdt_start(); pr_info("Watchdog enabled, timeout = %d min(s)", timeout); - return nonseekable_open(inode, file); + return stream_open(inode, file); } diff --git a/drivers/watchdog/sc520_wdt.c b/drivers/watchdog/sc520_wdt.c index 403542f9ed8d..44797414c886 100644 --- a/drivers/watchdog/sc520_wdt.c +++ b/drivers/watchdog/sc520_wdt.c @@ -258,7 +258,7 @@ static int fop_open(struct inode *inode, struct file *file) /* Good, fire up the show */ wdt_startup(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int fop_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/sch311x_wdt.c b/drivers/watchdog/sch311x_wdt.c index 814cdf539b0f..ed6e9fac5d74 100644 --- a/drivers/watchdog/sch311x_wdt.c +++ b/drivers/watchdog/sch311x_wdt.c @@ -316,7 +316,7 @@ static int sch311x_wdt_open(struct inode *inode, struct file *file) * Activate */ sch311x_wdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int sch311x_wdt_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/scx200_wdt.c b/drivers/watchdog/scx200_wdt.c index ec4063ebb41a..85f2d8e06cd0 100644 --- a/drivers/watchdog/scx200_wdt.c +++ b/drivers/watchdog/scx200_wdt.c @@ -102,7 +102,7 @@ static int scx200_wdt_open(struct inode *inode, struct file *file) return -EBUSY; scx200_wdt_enable(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int scx200_wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/smsc37b787_wdt.c b/drivers/watchdog/smsc37b787_wdt.c index c768dcd53034..a22170775273 100644 --- a/drivers/watchdog/smsc37b787_wdt.c +++ b/drivers/watchdog/smsc37b787_wdt.c @@ -366,7 +366,7 @@ static int wb_smsc_wdt_open(struct inode *inode, struct file *file) pr_info("Watchdog enabled. Timeout set to %d %s\n", timeout, (unit == UNIT_SECOND) ? 
"second(s)" : "minute(s)"); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /* close => shut off the timer */ diff --git a/drivers/watchdog/w83877f_wdt.c b/drivers/watchdog/w83877f_wdt.c index db9b6488e388..8dd953f90680 100644 --- a/drivers/watchdog/w83877f_wdt.c +++ b/drivers/watchdog/w83877f_wdt.c @@ -224,7 +224,7 @@ static int fop_open(struct inode *inode, struct file *file) /* Good, fire up the show */ wdt_startup(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int fop_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/w83977f_wdt.c b/drivers/watchdog/w83977f_wdt.c index 672b61a7f9a3..184324c1edae 100644 --- a/drivers/watchdog/w83977f_wdt.c +++ b/drivers/watchdog/w83977f_wdt.c @@ -298,7 +298,7 @@ static int wdt_open(struct inode *inode, struct file *file) __module_get(THIS_MODULE); wdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int wdt_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/wafer5823wdt.c b/drivers/watchdog/wafer5823wdt.c index 93c5b610e264..0a8073b419f8 100644 --- a/drivers/watchdog/wafer5823wdt.c +++ b/drivers/watchdog/wafer5823wdt.c @@ -197,7 +197,7 @@ static int wafwdt_open(struct inode *inode, struct file *file) * Activate */ wafwdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int wafwdt_close(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index f6c24b22b37c..252a7c7b6592 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -825,7 +825,7 @@ static int watchdog_open(struct inode *inode, struct file *file) kref_get(&wd_data->kref); /* dev/watchdog is a virtual (and thus non-seekable) filesystem */ - return nonseekable_open(inode, file); + return stream_open(inode, file); out_mod: module_put(wd_data->wdd->ops->owner); diff --git a/drivers/watchdog/wdrtas.c b/drivers/watchdog/wdrtas.c index 0240c60d14e3..3c3ed512ce1e 100644 --- a/drivers/watchdog/wdrtas.c +++ b/drivers/watchdog/wdrtas.c @@ -376,7 +376,7 @@ static int wdrtas_open(struct inode *inode, struct file *file) wdrtas_timer_start(); wdrtas_timer_keepalive(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /** @@ -442,7 +442,7 @@ static ssize_t wdrtas_temp_read(struct file *file, char __user *buf, */ static int wdrtas_temp_open(struct inode *inode, struct file *file) { - return nonseekable_open(inode, file); + return stream_open(inode, file); } /** diff --git a/drivers/watchdog/wdt.c b/drivers/watchdog/wdt.c index e481fbbc4ae7..3d2f5ed60e88 100644 --- a/drivers/watchdog/wdt.c +++ b/drivers/watchdog/wdt.c @@ -421,7 +421,7 @@ static int wdt_open(struct inode *inode, struct file *file) * Activate */ wdt_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /** @@ -481,7 +481,7 @@ static ssize_t wdt_temp_read(struct file *file, char __user *buf, static int wdt_temp_open(struct inode *inode, struct file *file) { - return nonseekable_open(inode, file); + return stream_open(inode, file); } /** diff --git a/drivers/watchdog/wdt285.c b/drivers/watchdog/wdt285.c index ebbb183be618..68843e7f224d 100644 --- a/drivers/watchdog/wdt285.c +++ b/drivers/watchdog/wdt285.c @@ -101,7 +101,7 @@ static int watchdog_open(struct inode *inode, struct file *file) ret = 0; #endif - nonseekable_open(inode, file); + stream_open(inode, file); return ret; } diff --git 
a/drivers/watchdog/wdt977.c b/drivers/watchdog/wdt977.c index a8e6f87f60c9..59ed644dd4a9 100644 --- a/drivers/watchdog/wdt977.c +++ b/drivers/watchdog/wdt977.c @@ -273,7 +273,7 @@ static int wdt977_open(struct inode *inode, struct file *file) __module_get(THIS_MODULE); wdt977_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int wdt977_release(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/wdt_pci.c b/drivers/watchdog/wdt_pci.c index 10e2cda0ee5a..ff3a41f47127 100644 --- a/drivers/watchdog/wdt_pci.c +++ b/drivers/watchdog/wdt_pci.c @@ -461,7 +461,7 @@ static int wdtpci_open(struct inode *inode, struct file *file) * Activate */ wdtpci_start(); - return nonseekable_open(inode, file); + return stream_open(inode, file); } /** @@ -524,7 +524,7 @@ static ssize_t wdtpci_temp_read(struct file *file, char __user *buf, static int wdtpci_temp_open(struct inode *inode, struct file *file) { - return nonseekable_open(inode, file); + return stream_open(inode, file); } /** diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 6d1a5e58968f..f341b016672f 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -664,7 +664,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) filp->private_data = u; - return nonseekable_open(inode, filp); + return stream_open(inode, filp); } static int evtchn_release(struct inode *inode, struct file *filp) diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 9859ababb82e..3ff32125f4b5 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -77,7 +77,7 @@ static int batadv_socket_open(struct inode *inode, struct file *file) batadv_debugfs_deprecated(file, ""); - nonseekable_open(inode, file); + stream_open(inode, file); socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL); if (!socket_client) { diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 3e610df8debf..e8ff13598c08 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -102,7 +102,7 @@ static int batadv_log_open(struct inode *inode, struct file *file) batadv_debugfs_deprecated(file, "Use tracepoint batadv:batadv_dbg instead\n"); - nonseekable_open(inode, file); + stream_open(inode, file); file->private_data = inode->i_private; return 0; } diff --git a/net/rfkill/core.c b/net/rfkill/core.c index abca57040f37..742e186bfadb 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -1143,7 +1143,7 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) file->private_data = data; - return nonseekable_open(inode, file); + return stream_open(inode, file); free: mutex_unlock(&data->mtx); diff --git a/sound/core/control.c b/sound/core/control.c index fad7db402443..a5cc9a874062 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -54,7 +54,7 @@ static int snd_ctl_open(struct inode *inode, struct file *file) struct snd_ctl_file *ctl; int i, err; - err = nonseekable_open(inode, file); + err = stream_open(inode, file); if (err < 0) return err; diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index c0690d1ecd55..4666bb366c0c 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -382,7 +382,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file) if ((file->f_flags & O_APPEND) && !(file->f_flags & O_NONBLOCK)) return -EINVAL; /* invalid combination */ - err = nonseekable_open(inode, file); + err = stream_open(inode, file); if (err < 0) return err; diff --git a/sound/core/seq/seq_clientmgr.c 
b/sound/core/seq/seq_clientmgr.c index 38e7deab6384..a11bdc0350fc 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -307,7 +307,7 @@ static int snd_seq_open(struct inode *inode, struct file *file) struct snd_seq_user_client *user; int err; - err = nonseekable_open(inode, file); + err = stream_open(inode, file); if (err < 0) return err; diff --git a/sound/core/timer.c b/sound/core/timer.c index 61a0cec6e1f6..b842b61f66c2 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1425,7 +1425,7 @@ static int snd_timer_user_open(struct inode *inode, struct file *file) struct snd_timer_user *tu; int err; - err = nonseekable_open(inode, file); + err = stream_open(inode, file); if (err < 0) return err; -- cgit v1.2.3-59-g8ed1b From fb1b79d88b88370a8177aefcd446b1b3d323e4c8 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sun, 5 May 2019 22:26:06 +0800 Subject: xtensa: fix incorrect fd close in error case of simdisk_setup() dev->fd is opened in the attach operation, so simdisk_setup() should not close dev->fd in its error path. Signed-off-by: Chengguang Xu Message-Id: <1557066367-4783-1-git-send-email-cgxu519@gmail.com> Signed-off-by: Max Filippov --- arch/xtensa/platforms/iss/simdisk.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 026211e7ab09..5fac8b781453 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -297,7 +297,6 @@ out_alloc_disk: blk_cleanup_queue(dev->queue); dev->queue = NULL; out_alloc_queue: - simc_close(dev->fd); return -EIO; } -- cgit v1.2.3-59-g8ed1b From fd58015ca9fed04fd52009e2dbcb7aef6940f734 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sun, 5 May 2019 22:26:07 +0800 Subject: xtensa: set proper error code for simdisk_setup() Change the error code from -EIO to -ENOMEM because the failure is in memory allocation. Signed-off-by: Chengguang Xu Message-Id: <1557066367-4783-2-git-send-email-cgxu519@gmail.com> Signed-off-by: Max Filippov --- arch/xtensa/platforms/iss/simdisk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 5fac8b781453..f9cd45860bee 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -297,7 +297,7 @@ out_alloc_disk: blk_cleanup_queue(dev->queue); dev->queue = NULL; out_alloc_queue: - return -EIO; + return -ENOMEM; } static int __init simdisk_init(void) -- cgit v1.2.3-59-g8ed1b From 8e65986dcae8c49501c1920064dc192e704248bb Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 6 May 2019 13:07:26 -0700 Subject: xtensa: drop ifdef __KERNEL__ from kernel-only headers These headers are not exported to userspace, so they're never used without __KERNEL__ defined. Drop these ifdef statements.
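As a minimal sketch of the pattern being removed (the header and function names below are made up for illustration; they are not among the files touched by this patch), such a kernel-only header currently has this shape:

  #ifndef _XTENSA_EXAMPLE_H
  #define _XTENSA_EXAMPLE_H

  #ifdef __KERNEL__			/* always true: header is never installed to userspace */
  int xtensa_example_setup(void);	/* hypothetical kernel-only declaration */
  #endif /* __KERNEL__ */

  #endif /* _XTENSA_EXAMPLE_H */

After the cleanup only the include guard remains. The #ifdef __KERNEL__ check matters only for headers exported through the uapi/ directories; a header that 'make headers_install' never installs is always compiled with __KERNEL__ defined, so the guard is dead code.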
Signed-off-by: Max Filippov --- arch/xtensa/include/asm/atomic.h | 4 ---- arch/xtensa/include/asm/bitops.h | 4 ---- arch/xtensa/include/asm/futex.h | 3 --- arch/xtensa/include/asm/io.h | 3 --- arch/xtensa/include/asm/pci-bridge.h | 3 --- arch/xtensa/include/asm/pci.h | 4 ---- arch/xtensa/include/asm/pgalloc.h | 3 --- 7 files changed, 24 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index 7de0149e1cf7..4e7311f58ae8 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -15,8 +15,6 @@ #include #include - -#ifdef __KERNEL__ #include #include #include @@ -200,6 +198,4 @@ ATOMIC_OPS(xor) #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n))) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) -#endif /* __KERNEL__ */ - #endif /* _XTENSA_ATOMIC_H */ diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h index d3490189792b..345d28aead65 100644 --- a/arch/xtensa/include/asm/bitops.h +++ b/arch/xtensa/include/asm/bitops.h @@ -13,8 +13,6 @@ #ifndef _XTENSA_BITOPS_H #define _XTENSA_BITOPS_H -#ifdef __KERNEL__ - #ifndef _LINUX_BITOPS_H #error only can be included directly #endif @@ -232,6 +230,4 @@ test_and_change_bit(unsigned int bit, volatile unsigned long *p) #include #include -#endif /* __KERNEL__ */ - #endif /* _XTENSA_BITOPS_H */ diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index 505d09eff184..0fd58b7521aa 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -15,8 +15,6 @@ #ifndef _ASM_XTENSA_FUTEX_H #define _ASM_XTENSA_FUTEX_H -#ifdef __KERNEL__ - #include #include #include @@ -123,5 +121,4 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return ret; } -#endif /* __KERNEL__ */ #endif /* _ASM_XTENSA_FUTEX_H */ diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h index acc5bb2cf1c7..da3e783f896b 100644 --- a/arch/xtensa/include/asm/io.h +++ b/arch/xtensa/include/asm/io.h @@ -11,7 +11,6 @@ #ifndef _XTENSA_IO_H #define _XTENSA_IO_H -#ifdef __KERNEL__ #include #include #include @@ -78,8 +77,6 @@ static inline void iounmap(volatile void __iomem *addr) #endif /* CONFIG_MMU */ -#endif /* __KERNEL__ */ - #include #endif /* _XTENSA_IO_H */ diff --git a/arch/xtensa/include/asm/pci-bridge.h b/arch/xtensa/include/asm/pci-bridge.h index 0b68c76ec1e6..405526912d9a 100644 --- a/arch/xtensa/include/asm/pci-bridge.h +++ b/arch/xtensa/include/asm/pci-bridge.h @@ -11,8 +11,6 @@ #ifndef _XTENSA_PCI_BRIDGE_H #define _XTENSA_PCI_BRIDGE_H -#ifdef __KERNEL__ - struct device_node; struct pci_controller; @@ -84,5 +82,4 @@ int early_write_config_byte(struct pci_controller*, int, int, int, u8); int early_write_config_word(struct pci_controller*, int, int, int, u16); int early_write_config_dword(struct pci_controller*, int, int, int, u32); -#endif /* __KERNEL__ */ #endif /* _XTENSA_PCI_BRIDGE_H */ diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h index 883024054b05..8e2b48a268db 100644 --- a/arch/xtensa/include/asm/pci.h +++ b/arch/xtensa/include/asm/pci.h @@ -11,8 +11,6 @@ #ifndef _XTENSA_PCI_H #define _XTENSA_PCI_H -#ifdef __KERNEL__ - /* Can be used to override the logic in pci_scan_bus for skipping * already-configured bus numbers - to be used for buggy BIOSes * or architectures with incomplete PCI setup by the loader @@ -45,8 +43,6 @@ #define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 #define arch_can_pci_mmap_io() 1 -#endif /* __KERNEL__ */ - /* Generic 
PCI */ #include diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index b3b388ff2f01..368284c972e7 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -11,8 +11,6 @@ #ifndef _XTENSA_PGALLOC_H #define _XTENSA_PGALLOC_H -#ifdef __KERNEL__ - #include #include @@ -79,5 +77,4 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) } #define pmd_pgtable(pmd) pmd_page(pmd) -#endif /* __KERNEL__ */ #endif /* _XTENSA_PGALLOC_H */ -- cgit v1.2.3-59-g8ed1b From 8f8d5745bb520c76b81abef4a2cb3023d0313bfd Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 1 Jan 2019 19:41:55 -0800 Subject: xtensa: replace variant/core.h with asm/core.h Introduce the header arch/xtensa/include/asm/core.h that provides definitions for XCHAL macros missing in older xtensa releases. Use this header instead of variant/core.h Signed-off-by: Max Filippov --- arch/xtensa/boot/boot-redboot/bootstrap.S | 2 +- arch/xtensa/include/asm/asmmacro.h | 2 +- arch/xtensa/include/asm/cache.h | 2 +- arch/xtensa/include/asm/checksum.h | 2 +- arch/xtensa/include/asm/coprocessor.h | 2 +- arch/xtensa/include/asm/core.h | 13 +++++++++++++ arch/xtensa/include/asm/initialize_mmu.h | 4 ---- arch/xtensa/include/asm/irq.h | 2 +- arch/xtensa/include/asm/processor.h | 2 +- arch/xtensa/include/asm/ptrace.h | 2 +- arch/xtensa/include/asm/vectors.h | 2 +- arch/xtensa/kernel/hw_breakpoint.c | 2 +- arch/xtensa/kernel/vmlinux.lds.S | 2 +- arch/xtensa/lib/checksum.S | 2 +- arch/xtensa/lib/memcopy.S | 2 +- arch/xtensa/lib/memset.S | 2 +- arch/xtensa/lib/strncpy_user.S | 2 +- arch/xtensa/lib/strnlen_user.S | 2 +- arch/xtensa/lib/usercopy.S | 2 +- arch/xtensa/platforms/xt2000/include/platform/hardware.h | 2 +- arch/xtensa/platforms/xt2000/include/platform/serial.h | 2 +- 21 files changed, 32 insertions(+), 23 deletions(-) create mode 100644 arch/xtensa/include/asm/core.h (limited to 'arch') diff --git a/arch/xtensa/boot/boot-redboot/bootstrap.S b/arch/xtensa/boot/boot-redboot/bootstrap.S index bbf3b4b080cd..48ba5a232d94 100644 --- a/arch/xtensa/boot/boot-redboot/bootstrap.S +++ b/arch/xtensa/boot/boot-redboot/bootstrap.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include +#include #include #include #include diff --git a/arch/xtensa/include/asm/asmmacro.h b/arch/xtensa/include/asm/asmmacro.h index 7f2ae5872151..8308a9c3abb2 100644 --- a/arch/xtensa/include/asm/asmmacro.h +++ b/arch/xtensa/include/asm/asmmacro.h @@ -11,7 +11,7 @@ #ifndef _XTENSA_ASMMACRO_H #define _XTENSA_ASMMACRO_H -#include +#include /* * Some little helpers for loops. 
Use zero-overhead-loops diff --git a/arch/xtensa/include/asm/cache.h b/arch/xtensa/include/asm/cache.h index d2fd932fdb4d..b21fd133ff62 100644 --- a/arch/xtensa/include/asm/cache.h +++ b/arch/xtensa/include/asm/cache.h @@ -11,7 +11,7 @@ #ifndef _XTENSA_CACHE_H #define _XTENSA_CACHE_H -#include +#include #define L1_CACHE_SHIFT XCHAL_DCACHE_LINEWIDTH #define L1_CACHE_BYTES XCHAL_DCACHE_LINESIZE diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h index f302ef57973a..8b687176ad72 100644 --- a/arch/xtensa/include/asm/checksum.h +++ b/arch/xtensa/include/asm/checksum.h @@ -13,7 +13,7 @@ #include #include -#include +#include /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/xtensa/include/asm/coprocessor.h b/arch/xtensa/include/asm/coprocessor.h index 6712929a27c9..0fbe2a740b8d 100644 --- a/arch/xtensa/include/asm/coprocessor.h +++ b/arch/xtensa/include/asm/coprocessor.h @@ -12,8 +12,8 @@ #ifndef _XTENSA_COPROCESSOR_H #define _XTENSA_COPROCESSOR_H -#include #include +#include #include #ifdef __ASSEMBLY__ diff --git a/arch/xtensa/include/asm/core.h b/arch/xtensa/include/asm/core.h new file mode 100644 index 000000000000..b1f57519df49 --- /dev/null +++ b/arch/xtensa/include/asm/core.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2019 Cadence Design Systems Inc. */ + +#ifndef _ASM_XTENSA_CORE_H +#define _ASM_XTENSA_CORE_H + +#include + +#ifndef XCHAL_SPANNING_WAY +#define XCHAL_SPANNING_WAY 0 +#endif + +#endif diff --git a/arch/xtensa/include/asm/initialize_mmu.h b/arch/xtensa/include/asm/initialize_mmu.h index 10e9852b2fb4..d719785e54f7 100644 --- a/arch/xtensa/include/asm/initialize_mmu.h +++ b/arch/xtensa/include/asm/initialize_mmu.h @@ -33,10 +33,6 @@ #define CA_WRITEBACK (0x4) #endif -#ifndef XCHAL_SPANNING_WAY -#define XCHAL_SPANNING_WAY 0 -#endif - #ifdef __ASSEMBLY__ #define XTENSA_HWVERSION_RC_2009_0 230000 diff --git a/arch/xtensa/include/asm/irq.h b/arch/xtensa/include/asm/irq.h index 6c6ed23e0c79..0f71a51dab25 100644 --- a/arch/xtensa/include/asm/irq.h +++ b/arch/xtensa/include/asm/irq.h @@ -12,7 +12,7 @@ #define _XTENSA_IRQ_H #include -#include +#include #ifdef CONFIG_PLATFORM_NR_IRQS # define PLATFORM_NR_IRQS CONFIG_PLATFORM_NR_IRQS diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 0c14018d1c26..19f6b54e358b 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -10,7 +10,7 @@ #ifndef _XTENSA_PROCESSOR_H #define _XTENSA_PROCESSOR_H -#include +#include #include #include diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h index 62a58d2567e9..b109416dc07e 100644 --- a/arch/xtensa/include/asm/ptrace.h +++ b/arch/xtensa/include/asm/ptrace.h @@ -80,7 +80,7 @@ struct pt_regs { unsigned long areg[16]; }; -#include +#include # define arch_has_single_step() (1) # define task_pt_regs(tsk) ((struct pt_regs*) \ diff --git a/arch/xtensa/include/asm/vectors.h b/arch/xtensa/include/asm/vectors.h index 7111280c8842..79fe3007919e 100644 --- a/arch/xtensa/include/asm/vectors.h +++ b/arch/xtensa/include/asm/vectors.h @@ -18,7 +18,7 @@ #ifndef _XTENSA_VECTORS_H #define _XTENSA_VECTORS_H -#include +#include #include #if XCHAL_HAVE_PTP_MMU diff --git a/arch/xtensa/kernel/hw_breakpoint.c b/arch/xtensa/kernel/hw_breakpoint.c index 4f20416061fb..285fb2942b06 100644 --- a/arch/xtensa/kernel/hw_breakpoint.c +++ b/arch/xtensa/kernel/hw_breakpoint.c @@ -12,7 +12,7 @@ #include #include #include -#include 
+#include /* Breakpoint currently in use for each IBREAKA. */ static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[XCHAL_NUM_IBREAK]); diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index b80a430453b1..943f10639a93 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -18,8 +18,8 @@ #include #include +#include #include -#include OUTPUT_ARCH(xtensa) ENTRY(_start) diff --git a/arch/xtensa/lib/checksum.S b/arch/xtensa/lib/checksum.S index 528fe0dd9339..d82c20c1fb7a 100644 --- a/arch/xtensa/lib/checksum.S +++ b/arch/xtensa/lib/checksum.S @@ -16,8 +16,8 @@ #include #include -#include #include +#include /* * computes a partial checksum, e.g. for TCP/UDP fragments diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S index c0f6981719d6..efecfd7ed8cc 100644 --- a/arch/xtensa/lib/memcopy.S +++ b/arch/xtensa/lib/memcopy.S @@ -10,8 +10,8 @@ */ #include -#include #include +#include /* * void *memcpy(void *dst, const void *src, size_t len); diff --git a/arch/xtensa/lib/memset.S b/arch/xtensa/lib/memset.S index 276747dec300..8632eacbdc80 100644 --- a/arch/xtensa/lib/memset.S +++ b/arch/xtensa/lib/memset.S @@ -12,8 +12,8 @@ */ #include -#include #include +#include /* * void *memset(void *dst, int c, size_t length) diff --git a/arch/xtensa/lib/strncpy_user.S b/arch/xtensa/lib/strncpy_user.S index 5fce16b67dca..c4c6c8578d59 100644 --- a/arch/xtensa/lib/strncpy_user.S +++ b/arch/xtensa/lib/strncpy_user.S @@ -13,8 +13,8 @@ #include #include -#include #include +#include /* * char *__strncpy_user(char *dst, const char *src, size_t len) diff --git a/arch/xtensa/lib/strnlen_user.S b/arch/xtensa/lib/strnlen_user.S index 0b956ce7f386..1f2ca2bb2ab3 100644 --- a/arch/xtensa/lib/strnlen_user.S +++ b/arch/xtensa/lib/strnlen_user.S @@ -12,8 +12,8 @@ */ #include -#include #include +#include /* * size_t __strnlen_user(const char *s, size_t len) diff --git a/arch/xtensa/lib/usercopy.S b/arch/xtensa/lib/usercopy.S index 64ab1971324f..228607e30bc2 100644 --- a/arch/xtensa/lib/usercopy.S +++ b/arch/xtensa/lib/usercopy.S @@ -54,8 +54,8 @@ */ #include -#include #include +#include .text ENTRY(__xtensa_copy_user) diff --git a/arch/xtensa/platforms/xt2000/include/platform/hardware.h b/arch/xtensa/platforms/xt2000/include/platform/hardware.h index 8e5e0d6a81ec..9f213f573330 100644 --- a/arch/xtensa/platforms/xt2000/include/platform/hardware.h +++ b/arch/xtensa/platforms/xt2000/include/platform/hardware.h @@ -15,7 +15,7 @@ #ifndef _XTENSA_XT2000_HARDWARE_H #define _XTENSA_XT2000_HARDWARE_H -#include +#include /* * On-board components. diff --git a/arch/xtensa/platforms/xt2000/include/platform/serial.h b/arch/xtensa/platforms/xt2000/include/platform/serial.h index 7226cf732b47..cde804827626 100644 --- a/arch/xtensa/platforms/xt2000/include/platform/serial.h +++ b/arch/xtensa/platforms/xt2000/include/platform/serial.h @@ -11,7 +11,7 @@ #ifndef _XTENSA_XT2000_SERIAL_H #define _XTENSA_XT2000_SERIAL_H -#include +#include #include /* National-Semi PC16552D DUART: */ -- cgit v1.2.3-59-g8ed1b From f39356261c265a0689d7ee568132d516e8b6cecc Mon Sep 17 00:00:00 2001 From: Rick Lindsley Date: Sun, 5 May 2019 17:20:43 -0700 Subject: powerpc/book3s/64: check for NULL pointer in pgd_alloc() When the memset code was added to pgd_alloc(), it failed to consider that kmem_cache_alloc() can return NULL. It's uncommon, but not impossible under heavy memory contention. 
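The fix reduces to the standard guard on the allocator's return value. As a
sketch, using the names visible in the patch context (the actual hunk follows
below):

	pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
			       pgtable_gfp_flags(mm, GFP_KERNEL));
	if (unlikely(!pgd))	/* allocation failed under memory pressure */
		return pgd;	/* propagate NULL instead of memset()ing it */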
Example oops: Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xc0000000000a4000 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA pSeries CPU: 70 PID: 48471 Comm: entrypoint.sh Kdump: loaded Not tainted 4.14.0-115.6.1.el7a.ppc64le #1 task: c000000334a00000 task.stack: c000000331c00000 NIP: c0000000000a4000 LR: c00000000012f43c CTR: 0000000000000020 REGS: c000000331c039c0 TRAP: 0300 Not tainted (4.14.0-115.6.1.el7a.ppc64le) MSR: 800000010280b033 CR: 44022840 XER: 20040000 CFAR: c000000000008874 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 1 ... NIP [c0000000000a4000] memset+0x68/0x104 LR [c00000000012f43c] mm_init+0x27c/0x2f0 Call Trace: mm_init+0x260/0x2f0 (unreliable) copy_mm+0x11c/0x638 copy_process.isra.28.part.29+0x6fc/0x1080 _do_fork+0xdc/0x4c0 ppc_clone+0x8/0xc Instruction dump: 409e000c b0860000 38c60002 409d000c 90860000 38c60004 78a0d183 78a506a0 7c0903a6 41820034 60000000 60420000 f8860008 f8860010 f8860018 Fixes: fc5c2f4a55a2 ("powerpc/mm/hash64: Zero PGD pages on allocation") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Rick Lindsley Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgalloc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 053a7940504e..d45e4449619f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -59,6 +59,9 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), pgtable_gfp_flags(mm, GFP_KERNEL)); + if (unlikely(!pgd)) + return pgd; + /* * Don't scan the PGD for pointers, it contains references to PUDs but * those references are not full pointers and so can't be recognised by -- cgit v1.2.3-59-g8ed1b From 4c11edfcf70bea4cb0a3f4992ac6a4852e8bdc31 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 7 May 2019 02:28:17 +0900 Subject: Remove MODULE_ALIAS() calls that take undefined macro These files do not define (USBHS_)DRIVER_NAME. Yet, they can be successfully compiled because they are never built as a module by anyone, i.e, the MODULE_ALIAS() calls are always no-op. A problem showed up when a patch "moduleparam: Save information about built-in modules in separate file" was applied. With this new feature, MODULE_*() will be populated even if the callers are built-in. To avoid the build errors, the lines referencing to the undefined macro must be removed. The complete fix is to remove all MODULE_* and #include like many "make ... explicitly non-modular" commits did. For now, I am touching only the offending lines. 
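Why these lines ever compiled is worth spelling out. A reduced sketch of the
mechanism (macro bodies simplified; the real definitions live in
<linux/module.h>, and DRIVER_NAME / USBHS_DRIVER_NAME are never #defined in
the files touched here):

/* Built-in case before the change: the no-op macro discards its argument
 * unexpanded, so the undefined DRIVER_NAME never reaches the compiler. */
#define MODULE_ALIAS(alias)
MODULE_ALIAS("platform:" DRIVER_NAME)		/* expands to nothing: builds */

/* Once built-in MODULE_*() also emits data, the argument is compiled and
 * the missing definition becomes a hard error: */
#undef MODULE_ALIAS
#define MODULE_ALIAS(alias) static const char __mod_alias[] = alias;
MODULE_ALIAS("platform:" DRIVER_NAME)		/* error: 'DRIVER_NAME' undeclared */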
Reported-by: Stephen Rothwell
Signed-off-by: Masahiro Yamada
Acked-by: Daniel Lezcano
Acked-by: Tony Lindgren
---
 arch/arm/plat-omap/dma.c | 1 -
 drivers/clocksource/timer-ti-dm.c | 1 -
 drivers/mfd/omap-usb-tll.c | 1 -
 3 files changed, 3 deletions(-)
(limited to 'arch')

diff --git a/arch/arm/plat-omap/dma.c b/arch/arm/plat-omap/dma.c
index d4012d6c0dcb..5ca4c5fd627a 100644
--- a/arch/arm/plat-omap/dma.c
+++ b/arch/arm/plat-omap/dma.c
@@ -1449,7 +1449,6 @@ static void __exit omap_system_dma_exit(void)
 MODULE_DESCRIPTION("OMAP SYSTEM DMA DRIVER");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:" DRIVER_NAME);
 MODULE_AUTHOR("Texas Instruments Inc");
 /*
diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c
index 3352da6ed61f..2b7cee81bcf3 100644
--- a/drivers/clocksource/timer-ti-dm.c
+++ b/drivers/clocksource/timer-ti-dm.c
@@ -998,5 +998,4 @@ module_platform_driver(omap_dm_timer_driver);
 MODULE_DESCRIPTION("OMAP Dual-Mode Timer Driver");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:" DRIVER_NAME);
 MODULE_AUTHOR("Texas Instruments Inc");
diff --git a/drivers/mfd/omap-usb-tll.c b/drivers/mfd/omap-usb-tll.c
index 446713dbee27..93177d85e7a9 100644
--- a/drivers/mfd/omap-usb-tll.c
+++ b/drivers/mfd/omap-usb-tll.c
@@ -459,7 +459,6 @@ EXPORT_SYMBOL_GPL(omap_tll_disable);
 MODULE_AUTHOR("Keshava Munegowda ");
 MODULE_AUTHOR("Roger Quadros ");
-MODULE_ALIAS("platform:" USBHS_DRIVER_NAME);
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("usb tll driver for TI OMAP EHCI and OHCI controllers");
-- cgit v1.2.3-59-g8ed1b

From d065fcf12c21348cbc125460f75332f467518fb1 Mon Sep 17 00:00:00 2001
From: Max Filippov
Date: Mon, 6 May 2019 12:02:26 -0700
Subject: xtensa: clean up inline assembly in futex.h

Replace numeric inline assembly parameters with named parameters. Drop
unused parameters from futex_atomic_cmpxchg_inatomic(). Use a new
temporary variable to hold the target address in the fixup code of
futex_atomic_cmpxchg_inatomic(). Conditionalize the function bodies so
that only 'return -ENOSYS' is left in configurations without futex
support.
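For readers unfamiliar with named asm operands, the shape of the conversion
in miniature (illustrative only, not a hunk from this patch):

static inline int load_word(const int *addr)
{
	int val;

	/* Positional style: %0 and %1 must track the operand list by
	 * position, which is easy to get wrong when operands change. */
	__asm__ __volatile__("l32i %0, %1, 0"
			     : "=&r" (val)
			     : "r" (addr));

	/* Named style: %[val] and %[addr] stay correct even if the
	 * operand list is extended or reordered. */
	__asm__ __volatile__("l32i %[val], %[addr], 0"
			     : [val] "=&r" (val)
			     : [addr] "r" (addr));
	return val;
}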
Signed-off-by: Max Filippov --- arch/xtensa/include/asm/futex.h | 80 +++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index 0fd58b7521aa..a67f801aa303 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -19,59 +19,58 @@ #include #include -#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \ +#define __futex_atomic_op(insn, ret, old, uaddr, arg) \ __asm__ __volatile( \ - "1: l32i %0, %2, 0\n" \ + "1: l32i %[oldval], %[addr], 0\n" \ insn "\n" \ - " wsr %0, scompare1\n" \ - "2: s32c1i %1, %2, 0\n" \ - " bne %1, %0, 1b\n" \ - " movi %1, 0\n" \ + " wsr %[oldval], scompare1\n" \ + "2: s32c1i %[newval], %[addr], 0\n" \ + " bne %[newval], %[oldval], 1b\n" \ + " movi %[newval], 0\n" \ "3:\n" \ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ " .literal_position\n" \ - "5: movi %0, 3b\n" \ - " movi %1, %3\n" \ - " jx %0\n" \ + "5: movi %[oldval], 3b\n" \ + " movi %[newval], %[fault]\n" \ + " jx %[oldval]\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ - " .long 1b,5b,2b,5b\n" \ + " .long 1b, 5b, 2b, 5b\n" \ " .previous\n" \ - : "=&r" (oldval), "=&r" (ret) \ - : "r" (uaddr), "I" (-EFAULT), "r" (oparg) \ + : [oldval] "=&r" (old), [newval] "=&r" (ret) \ + : [addr] "r" (uaddr), [oparg] "r" (arg), \ + [fault] "I" (-EFAULT) \ : "memory") static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { +#if XCHAL_HAVE_S32C1I int oldval = 0, ret; -#if !XCHAL_HAVE_S32C1I - return -ENOSYS; -#endif - pagefault_disable(); switch (op) { case FUTEX_OP_SET: - __futex_atomic_op("mov %1, %4", ret, oldval, uaddr, oparg); + __futex_atomic_op("mov %[newval], %[oparg]", + ret, oldval, uaddr, oparg); break; case FUTEX_OP_ADD: - __futex_atomic_op("add %1, %0, %4", ret, oldval, uaddr, - oparg); + __futex_atomic_op("add %[newval], %[oldval], %[oparg]", + ret, oldval, uaddr, oparg); break; case FUTEX_OP_OR: - __futex_atomic_op("or %1, %0, %4", ret, oldval, uaddr, - oparg); + __futex_atomic_op("or %[newval], %[oldval], %[oparg]", + ret, oldval, uaddr, oparg); break; case FUTEX_OP_ANDN: - __futex_atomic_op("and %1, %0, %4", ret, oldval, uaddr, - ~oparg); + __futex_atomic_op("and %[newval], %[oldval], %[oparg]", + ret, oldval, uaddr, ~oparg); break; case FUTEX_OP_XOR: - __futex_atomic_op("xor %1, %0, %4", ret, oldval, uaddr, - oparg); + __futex_atomic_op("xor %[newval], %[oldval], %[oparg]", + ret, oldval, uaddr, oparg); break; default: ret = -ENOSYS; @@ -83,42 +82,47 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, *oval = oldval; return ret; +#else + return -ENOSYS; +#endif } static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { +#if XCHAL_HAVE_S32C1I + unsigned long tmp; int ret = 0; if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; -#if !XCHAL_HAVE_S32C1I - return -ENOSYS; -#endif - __asm__ __volatile__ ( " # futex_atomic_cmpxchg_inatomic\n" - " wsr %5, scompare1\n" - "1: s32c1i %1, %4, 0\n" - " s32i %1, %6, 0\n" + " wsr %[oldval], scompare1\n" + "1: s32c1i %[newval], %[addr], 0\n" + " s32i %[newval], %[uval], 0\n" "2:\n" " .section .fixup,\"ax\"\n" " .align 4\n" " .literal_position\n" - "4: movi %1, 2b\n" - " movi %0, %7\n" - " jx %1\n" + "4: movi %[tmp], 2b\n" + " movi %[ret], %[fault]\n" + " jx %[tmp]\n" " .previous\n" " .section __ex_table,\"a\"\n" - " .long 1b,4b\n" + " .long 1b, 4b\n" " .previous\n" - : "+r" (ret), "+r" 
(newval), "+m" (*uaddr), "+m" (*uval) - : "r" (uaddr), "r" (oldval), "r" (uval), "I" (-EFAULT) + : [ret] "+r" (ret), [newval] "+r" (newval), [tmp] "=&r" (tmp) + : [addr] "r" (uaddr), [oldval] "r" (oldval), [uval] "r" (uval), + [fault] "I" (-EFAULT) : "memory"); return ret; +#else + return -ENOSYS; +#endif } #endif /* _ASM_XTENSA_FUTEX_H */ -- cgit v1.2.3-59-g8ed1b From f7c34874f04a80d6c39a32f08da2529e59602d3c Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Thu, 20 Dec 2018 17:18:12 -0800 Subject: xtensa: add exclusive atomics support Implement atomic primitives using exclusive access opcodes available in the recent xtensa cores. Since l32ex/s32ex don't have any memory ordering guarantees don't define __smp_mb__before_atomic/__smp_mb__after_atomic to make them use memw. Signed-off-by: Max Filippov --- arch/xtensa/include/asm/atomic.h | 62 ++++++++++++++++++- arch/xtensa/include/asm/barrier.h | 4 ++ arch/xtensa/include/asm/bitops.h | 121 +++++++++++++++++++++++++++++++++++++- arch/xtensa/include/asm/cmpxchg.h | 36 +++++++++++- arch/xtensa/include/asm/core.h | 4 ++ arch/xtensa/include/asm/futex.h | 43 +++++++++++++- arch/xtensa/kernel/setup.c | 3 + 7 files changed, 267 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index 4e7311f58ae8..7b00d26f472e 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -56,7 +56,67 @@ */ #define atomic_set(v,i) WRITE_ONCE((v)->counter, (i)) -#if XCHAL_HAVE_S32C1I +#if XCHAL_HAVE_EXCLUSIVE +#define ATOMIC_OP(op) \ +static inline void atomic_##op(int i, atomic_t *v) \ +{ \ + unsigned long tmp; \ + int result; \ + \ + __asm__ __volatile__( \ + "1: l32ex %1, %3\n" \ + " " #op " %0, %1, %2\n" \ + " s32ex %0, %3\n" \ + " getex %0\n" \ + " beqz %0, 1b\n" \ + : "=&a" (result), "=&a" (tmp) \ + : "a" (i), "a" (v) \ + : "memory" \ + ); \ +} \ + +#define ATOMIC_OP_RETURN(op) \ +static inline int atomic_##op##_return(int i, atomic_t *v) \ +{ \ + unsigned long tmp; \ + int result; \ + \ + __asm__ __volatile__( \ + "1: l32ex %1, %3\n" \ + " " #op " %0, %1, %2\n" \ + " s32ex %0, %3\n" \ + " getex %0\n" \ + " beqz %0, 1b\n" \ + " " #op " %0, %1, %2\n" \ + : "=&a" (result), "=&a" (tmp) \ + : "a" (i), "a" (v) \ + : "memory" \ + ); \ + \ + return result; \ +} + +#define ATOMIC_FETCH_OP(op) \ +static inline int atomic_fetch_##op(int i, atomic_t *v) \ +{ \ + unsigned long tmp; \ + int result; \ + \ + __asm__ __volatile__( \ + "1: l32ex %1, %3\n" \ + " " #op " %0, %1, %2\n" \ + " s32ex %0, %3\n" \ + " getex %0\n" \ + " beqz %0, 1b\n" \ + : "=&a" (result), "=&a" (tmp) \ + : "a" (i), "a" (v) \ + : "memory" \ + ); \ + \ + return tmp; \ +} + +#elif XCHAL_HAVE_S32C1I #define ATOMIC_OP(op) \ static inline void atomic_##op(int i, atomic_t * v) \ { \ diff --git a/arch/xtensa/include/asm/barrier.h b/arch/xtensa/include/asm/barrier.h index 956596e4d437..d6f8d4ddc2bc 100644 --- a/arch/xtensa/include/asm/barrier.h +++ b/arch/xtensa/include/asm/barrier.h @@ -9,12 +9,16 @@ #ifndef _XTENSA_SYSTEM_H #define _XTENSA_SYSTEM_H +#include + #define mb() ({ __asm__ __volatile__("memw" : : : "memory"); }) #define rmb() barrier() #define wmb() mb() +#if XCHAL_HAVE_S32C1I #define __smp_mb__before_atomic() barrier() #define __smp_mb__after_atomic() barrier() +#endif #include diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h index 345d28aead65..aeb15f4c755b 100644 --- a/arch/xtensa/include/asm/bitops.h +++ b/arch/xtensa/include/asm/bitops.h @@ -96,7 +96,126 @@ 
static inline unsigned long __fls(unsigned long word) #include -#if XCHAL_HAVE_S32C1I +#if XCHAL_HAVE_EXCLUSIVE + +static inline void set_bit(unsigned int bit, volatile unsigned long *p) +{ + unsigned long tmp; + unsigned long mask = 1UL << (bit & 31); + + p += bit >> 5; + + __asm__ __volatile__( + "1: l32ex %0, %2\n" + " or %0, %0, %1\n" + " s32ex %0, %2\n" + " getex %0\n" + " beqz %0, 1b\n" + : "=&a" (tmp) + : "a" (mask), "a" (p) + : "memory"); +} + +static inline void clear_bit(unsigned int bit, volatile unsigned long *p) +{ + unsigned long tmp; + unsigned long mask = 1UL << (bit & 31); + + p += bit >> 5; + + __asm__ __volatile__( + "1: l32ex %0, %2\n" + " and %0, %0, %1\n" + " s32ex %0, %2\n" + " getex %0\n" + " beqz %0, 1b\n" + : "=&a" (tmp) + : "a" (~mask), "a" (p) + : "memory"); +} + +static inline void change_bit(unsigned int bit, volatile unsigned long *p) +{ + unsigned long tmp; + unsigned long mask = 1UL << (bit & 31); + + p += bit >> 5; + + __asm__ __volatile__( + "1: l32ex %0, %2\n" + " xor %0, %0, %1\n" + " s32ex %0, %2\n" + " getex %0\n" + " beqz %0, 1b\n" + : "=&a" (tmp) + : "a" (~mask), "a" (p) + : "memory"); +} + +static inline int +test_and_set_bit(unsigned int bit, volatile unsigned long *p) +{ + unsigned long tmp, value; + unsigned long mask = 1UL << (bit & 31); + + p += bit >> 5; + + __asm__ __volatile__( + "1: l32ex %1, %3\n" + " or %0, %1, %2\n" + " s32ex %0, %3\n" + " getex %0\n" + " beqz %0, 1b\n" + : "=&a" (tmp), "=&a" (value) + : "a" (mask), "a" (p) + : "memory"); + + return value & mask; +} + +static inline int +test_and_clear_bit(unsigned int bit, volatile unsigned long *p) +{ + unsigned long tmp, value; + unsigned long mask = 1UL << (bit & 31); + + p += bit >> 5; + + __asm__ __volatile__( + "1: l32ex %1, %3\n" + " and %0, %1, %2\n" + " s32ex %0, %3\n" + " getex %0\n" + " beqz %0, 1b\n" + : "=&a" (tmp), "=&a" (value) + : "a" (~mask), "a" (p) + : "memory"); + + return value & mask; +} + +static inline int +test_and_change_bit(unsigned int bit, volatile unsigned long *p) +{ + unsigned long tmp, value; + unsigned long mask = 1UL << (bit & 31); + + p += bit >> 5; + + __asm__ __volatile__( + "1: l32ex %1, %3\n" + " xor %0, %1, %2\n" + " s32ex %0, %3\n" + " getex %0\n" + " beqz %0, 1b\n" + : "=&a" (tmp), "=&a" (value) + : "a" (mask), "a" (p) + : "memory"); + + return value & mask; +} + +#elif XCHAL_HAVE_S32C1I static inline void set_bit(unsigned int bit, volatile unsigned long *p) { diff --git a/arch/xtensa/include/asm/cmpxchg.h b/arch/xtensa/include/asm/cmpxchg.h index 22a10c715c1f..7ccc5cbf441b 100644 --- a/arch/xtensa/include/asm/cmpxchg.h +++ b/arch/xtensa/include/asm/cmpxchg.h @@ -23,7 +23,24 @@ static inline unsigned long __cmpxchg_u32(volatile int *p, int old, int new) { -#if XCHAL_HAVE_S32C1I +#if XCHAL_HAVE_EXCLUSIVE + unsigned long tmp, result; + + __asm__ __volatile__( + "1: l32ex %0, %3\n" + " bne %0, %4, 2f\n" + " mov %1, %2\n" + " s32ex %1, %3\n" + " getex %1\n" + " beqz %1, 1b\n" + "2:\n" + : "=&a" (result), "=&a" (tmp) + : "a" (new), "a" (p), "a" (old) + : "memory" + ); + + return result; +#elif XCHAL_HAVE_S32C1I __asm__ __volatile__( " wsr %2, scompare1\n" " s32c1i %0, %1, 0\n" @@ -108,7 +125,22 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, static inline unsigned long xchg_u32(volatile int * m, unsigned long val) { -#if XCHAL_HAVE_S32C1I +#if XCHAL_HAVE_EXCLUSIVE + unsigned long tmp, result; + + __asm__ __volatile__( + "1: l32ex %0, %3\n" + " mov %1, %2\n" + " s32ex %1, %3\n" + " getex %1\n" + " beqz %1, 1b\n" + : "=&a" 
(result), "=&a" (tmp) + : "a" (val), "a" (m) + : "memory" + ); + + return result; +#elif XCHAL_HAVE_S32C1I unsigned long tmp, result; __asm__ __volatile__( "1: l32i %1, %2, 0\n" diff --git a/arch/xtensa/include/asm/core.h b/arch/xtensa/include/asm/core.h index b1f57519df49..ee02e164b673 100644 --- a/arch/xtensa/include/asm/core.h +++ b/arch/xtensa/include/asm/core.h @@ -6,6 +6,10 @@ #include +#ifndef XCHAL_HAVE_EXCLUSIVE +#define XCHAL_HAVE_EXCLUSIVE 0 +#endif + #ifndef XCHAL_SPANNING_WAY #define XCHAL_SPANNING_WAY 0 #endif diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index a67f801aa303..9538b0f7953c 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -19,6 +19,31 @@ #include #include +#if XCHAL_HAVE_EXCLUSIVE +#define __futex_atomic_op(insn, ret, old, uaddr, arg) \ + __asm__ __volatile( \ + "1: l32ex %[oldval], %[addr]\n" \ + insn "\n" \ + "2: s32ex %[newval], %[addr]\n" \ + " getex %[newval]\n" \ + " beqz %[newval], 1b\n" \ + " movi %[newval], 0\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .align 4\n" \ + " .literal_position\n" \ + "5: movi %[oldval], 3b\n" \ + " movi %[newval], %[fault]\n" \ + " jx %[oldval]\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 1b, 5b, 2b, 5b\n" \ + " .previous\n" \ + : [oldval] "=&r" (old), [newval] "=&r" (ret) \ + : [addr] "r" (uaddr), [oparg] "r" (arg), \ + [fault] "I" (-EFAULT) \ + : "memory") +#elif XCHAL_HAVE_S32C1I #define __futex_atomic_op(insn, ret, old, uaddr, arg) \ __asm__ __volatile( \ "1: l32i %[oldval], %[addr], 0\n" \ @@ -42,11 +67,12 @@ : [addr] "r" (uaddr), [oparg] "r" (arg), \ [fault] "I" (-EFAULT) \ : "memory") +#endif static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { -#if XCHAL_HAVE_S32C1I +#if XCHAL_HAVE_S32C1I || XCHAL_HAVE_EXCLUSIVE int oldval = 0, ret; pagefault_disable(); @@ -91,7 +117,7 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { -#if XCHAL_HAVE_S32C1I +#if XCHAL_HAVE_S32C1I || XCHAL_HAVE_EXCLUSIVE unsigned long tmp; int ret = 0; @@ -100,9 +126,19 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, __asm__ __volatile__ ( " # futex_atomic_cmpxchg_inatomic\n" +#if XCHAL_HAVE_EXCLUSIVE + "1: l32ex %[tmp], %[addr]\n" + " s32i %[tmp], %[uval], 0\n" + " bne %[tmp], %[oldval], 2f\n" + " mov %[tmp], %[newval]\n" + "3: s32ex %[tmp], %[addr]\n" + " getex %[tmp]\n" + " beqz %[tmp], 1b\n" +#elif XCHAL_HAVE_S32C1I " wsr %[oldval], scompare1\n" "1: s32c1i %[newval], %[addr], 0\n" " s32i %[newval], %[uval], 0\n" +#endif "2:\n" " .section .fixup,\"ax\"\n" " .align 4\n" @@ -113,6 +149,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, " .previous\n" " .section __ex_table,\"a\"\n" " .long 1b, 4b\n" +#if XCHAL_HAVE_EXCLUSIVE + " .long 3b, 4b\n" +#endif " .previous\n" : [ret] "+r" (ret), [newval] "+r" (newval), [tmp] "=&r" (tmp) : [addr] "r" (uaddr), [oldval] "r" (oldval), [uval] "r" (uval), diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 4ec6fbb696bf..c0ec24349421 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -650,6 +650,9 @@ c_show(struct seq_file *f, void *slot) #endif #if XCHAL_HAVE_S32C1I "s32c1i " +#endif +#if XCHAL_HAVE_EXCLUSIVE + "exclusive " #endif "\n"); -- cgit v1.2.3-59-g8ed1b From a5944195d00a359e28d6e093593609bcee37ed5e Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 6 May 2019 16:47:41 -0700 Subject: xtensa: implement initialize_cacheattr for MPU 
cores Use CONFIG_MEMMAP_CACHEATTR to initialize MPU as described in the Xtensa LSP RM document. Coalesce adjacent regions with the same cacheattr. Update Kconfig help text. Signed-off-by: Max Filippov --- arch/xtensa/Kconfig | 26 ++++++++++++++++++------ arch/xtensa/include/asm/core.h | 4 ++++ arch/xtensa/include/asm/initialize_mmu.h | 34 +++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 4b9aafe766c5..f081751db229 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -256,12 +256,26 @@ config MEMMAP_CACHEATTR region: bits 0..3 -- for addresses 0x00000000..0x1fffffff, bits 4..7 -- for addresses 0x20000000..0x3fffffff, and so on. - Cache attribute values are specific for the MMU type, so e.g. - for region protection MMUs: 2 is cache bypass, 4 is WB cached, - 1 is WT cached, f is illegal. For ful MMU: bit 0 makes it executable, - bit 1 makes it writable, bits 2..3 meaning is 0: cache bypass, - 1: WB cache, 2: WT cache, 3: special (c and e are illegal, f is - reserved). + Cache attribute values are specific for the MMU type. + For region protection MMUs: + 1: WT cached, + 2: cache bypass, + 4: WB cached, + f: illegal. + For ful MMU: + bit 0: executable, + bit 1: writable, + bits 2..3: + 0: cache bypass, + 1: WB cache, + 2: WT cache, + 3: special (c and e are illegal, f is reserved). + For MPU: + 0: illegal, + 1: WB cache, + 2: WB, no-write-allocate cache, + 3: WT cache, + 4: cache bypass. config KSEG_PADDR hex "Physical address of the KSEG mapping" diff --git a/arch/xtensa/include/asm/core.h b/arch/xtensa/include/asm/core.h index ee02e164b673..5b4acb7d1c07 100644 --- a/arch/xtensa/include/asm/core.h +++ b/arch/xtensa/include/asm/core.h @@ -10,6 +10,10 @@ #define XCHAL_HAVE_EXCLUSIVE 0 #endif +#ifndef XCHAL_HAVE_MPU +#define XCHAL_HAVE_MPU 0 +#endif + #ifndef XCHAL_SPANNING_WAY #define XCHAL_SPANNING_WAY 0 #endif diff --git a/arch/xtensa/include/asm/initialize_mmu.h b/arch/xtensa/include/asm/initialize_mmu.h index d719785e54f7..323d05789159 100644 --- a/arch/xtensa/include/asm/initialize_mmu.h +++ b/arch/xtensa/include/asm/initialize_mmu.h @@ -177,11 +177,42 @@ .macro initialize_cacheattr -#if !defined(CONFIG_MMU) && XCHAL_HAVE_TLBS +#if !defined(CONFIG_MMU) && (XCHAL_HAVE_TLBS || XCHAL_HAVE_MPU) #if CONFIG_MEMMAP_CACHEATTR == 0x22222222 && XCHAL_HAVE_PTP_MMU #error Default MEMMAP_CACHEATTR of 0x22222222 does not work with full MMU. 
#endif +#if XCHAL_HAVE_MPU + .data + .align 4 +.Lattribute_table: + .long 0x000000, 0x1fff00, 0x1ddf00, 0x1eef00 + .long 0x006600, 0x000000, 0x000000, 0x000000 + .long 0x000000, 0x000000, 0x000000, 0x000000 + .long 0x000000, 0x000000, 0x000000, 0x000000 + .previous + + movi a3, .Lattribute_table + movi a4, CONFIG_MEMMAP_CACHEATTR + movi a5, 1 + movi a6, XCHAL_MPU_ENTRIES + movi a10, 0x20000000 + movi a11, -1 +1: + sub a5, a5, a10 + extui a8, a4, 28, 4 + beq a8, a11, 2f + addi a6, a6, -1 + mov a11, a8 +2: + addx4 a9, a8, a3 + l32i a9, a9, 0 + or a9, a9, a6 + wptlb a9, a5 + slli a4, a4, 4 + bgeu a5, a10, 1b + +#else movi a5, XCHAL_SPANNING_WAY movi a6, ~_PAGE_ATTRIB_MASK movi a4, CONFIG_MEMMAP_CACHEATTR @@ -203,6 +234,7 @@ bgeu a5, a8, 1b isync +#endif #endif .endm -- cgit v1.2.3-59-g8ed1b From 5c2ffce1e9496477720966e70d79f2da3e4b84e6 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Fri, 4 Jan 2019 15:38:21 +0000 Subject: um: Revert to using stack for pt_regs in signal handling Reverts commit b6024b21fec8367ef961a771cc9dde31f1831965 and adjusts default stack sizing to cope with larger size of floating point save registers on the newer Intel CPUs. b6024b21fec8367ef961a771cc9dde31f1831965 replaced storing the register state on the stack with kmalloc-ed storage. That has a number of issues and a panic if that fails. 1. kmalloc/ATOMIC can fail. There was a latent hard crash in all interrupt and fault handling as a result. 2. kmalloc in the interrupt path introduces a considerable performance penalty for networking ~ 14% on iperf. This commit restores uml to a stable state until a better solution is found. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 10 +++++++--- arch/um/os-Linux/signal.c | 28 ++++++++-------------------- 2 files changed, 15 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/um/Kconfig b/arch/um/Kconfig index ec9711d068b7..41913504a56e 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -142,13 +142,17 @@ config MAGIC_SYSRQ config KERNEL_STACK_ORDER int "Kernel stack size order" - default 1 if 64BIT - range 1 10 if 64BIT - default 0 if !64BIT + default 2 if 64BIT + range 2 10 if 64BIT + default 1 if !64BIT help This option determines the size of UML kernel stacks. They will be 1 << order pages. The default is OK unless you're running Valgrind on UML, in which case, set this to 3. + It is possible to reduce the stack to 1 for 64BIT and 0 for 32BIT on + older (pre-2017) CPUs. It is not recommended on newer CPUs due to the + increase in the size of the state which needs to be saved when handling + signals. config MMAPPER tristate "iomem emulation driver" diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index bf0acb8aad8b..75b10235d369 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -31,29 +31,23 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) { - struct uml_pt_regs *r; + struct uml_pt_regs r; int save_errno = errno; - r = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC); - if (!r) - panic("out of memory"); - - r->is_user = 0; + r.is_user = 0; if (sig == SIGSEGV) { /* For segfaults, we want the data from the sigcontext. 
*/
- get_regs_from_mc(r, mc);
- GET_FAULTINFO_FROM_MC(r->faultinfo, mc);
+ get_regs_from_mc(&r, mc);
+ GET_FAULTINFO_FROM_MC(r.faultinfo, mc);
 }
 /* enable signals if sig isn't IRQ signal */
 if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGALRM))
 unblock_signals();
- (*sig_info[sig])(sig, si, r);
+ (*sig_info[sig])(sig, si, &r);
 errno = save_errno;
-
- free(r);
 }
 /*
@@ -91,17 +85,11 @@ void sig_handler(int sig, struct siginfo *si, mcontext_t *mc)
 static void timer_real_alarm_handler(mcontext_t *mc)
 {
- struct uml_pt_regs *regs;
-
- regs = uml_kmalloc(sizeof(struct uml_pt_regs), UM_GFP_ATOMIC);
- if (!regs)
- panic("out of memory");
+ struct uml_pt_regs regs;
 if (mc != NULL)
- get_regs_from_mc(regs, mc);
- timer_handler(SIGALRM, NULL, regs);
-
- free(regs);
+ get_regs_from_mc(&regs, mc);
+ timer_handler(SIGALRM, NULL, &regs);
 }
 void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
-- cgit v1.2.3-59-g8ed1b

From 37606596d19d53bbc94e8e7f174c4df762682ccb Mon Sep 17 00:00:00 2001
From: "Enrico Weigelt, metux IT consult"
Date: Wed, 6 Mar 2019 22:14:55 +0100
Subject: arch: um: Kconfig: pedantic indentation cleanups

Formatting of Kconfig files doesn't look so pretty, so just take a damp
cloth and clean it up.

Signed-off-by: Enrico Weigelt, metux IT consult
Signed-off-by: Richard Weinberger
---
 arch/um/Kconfig | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)
(limited to 'arch')

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 41913504a56e..6b6eb938fcc1 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -80,46 +80,46 @@ config LD_SCRIPT_DYN
 bool
 default y
 depends on !LD_SCRIPT_STATIC
- select MODULE_REL_CRCS if MODVERSIONS
+ select MODULE_REL_CRCS if MODVERSIONS

config HOSTFS
 tristate "Host filesystem"
 help
- While the User-Mode Linux port uses its own root file system for
- booting and normal file access, this module lets the UML user
- access files stored on the host. It does not require any
- network connection between the Host and UML. An example use of
- this might be:
+ While the User-Mode Linux port uses its own root file system for
+ booting and normal file access, this module lets the UML user
+ access files stored on the host. It does not require any
+ network connection between the Host and UML. An example use of
+ this might be:
- mount none /tmp/fromhost -t hostfs -o /tmp/umlshare
+ mount none /tmp/fromhost -t hostfs -o /tmp/umlshare
- where /tmp/fromhost is an empty directory inside UML and
- /tmp/umlshare is a directory on the host with files the UML user
- wishes to access.
+ where /tmp/fromhost is an empty directory inside UML and
+ /tmp/umlshare is a directory on the host with files the UML user
+ wishes to access.
- For more information, see
- .
+ For more information, see
+ .
- If you'd like to be able to work with files stored on the host,
- say Y or M here; otherwise say N.
+ If you'd like to be able to work with files stored on the host,
+ say Y or M here; otherwise say N.

config MCONSOLE
 bool "Management console"
 depends on PROC_FS
 default y
 help
- The user mode linux management console is a low-level interface to
- the kernel, somewhat like the i386 SysRq interface. Since there is
- a full-blown operating system running under every user mode linux
- instance, there is much greater flexibility possible than with the
- SysRq mechanism.
-
- If you answer 'Y' to this option, to use this feature, you need the
- mconsole client (called uml_mconsole) which is present in CVS in
- 2.4.5-9um and later (path /tools/mconsole), and is also in the
- distribution RPM package in 2.4.6 and later.
-
- It is safe to say 'Y' here.
+ The user mode linux management console is a low-level interface to
+ the kernel, somewhat like the i386 SysRq interface. Since there is
+ a full-blown operating system running under every user mode linux
+ instance, there is much greater flexibility possible than with the
+ SysRq mechanism.
+
+ If you answer 'Y' to this option, to use this feature, you need the
+ mconsole client (called uml_mconsole) which is present in CVS in
+ 2.4.5-9um and later (path /tools/mconsole), and is also in the
+ distribution RPM package in 2.4.6 and later.
+
+ It is safe to say 'Y' here.

config MAGIC_SYSRQ
 bool "Magic SysRq key"
-- cgit v1.2.3-59-g8ed1b

From 75f24f78721048a271e2e50a563f51bcfd6f5c1c Mon Sep 17 00:00:00 2001
From: "Enrico Weigelt, metux IT consult"
Date: Thu, 7 Mar 2019 23:22:37 +0100
Subject: arch: um: drivers: Kconfig: pedantic formatting

Formatting of Kconfig files doesn't look so pretty, so just take a damp
cloth and clean it up. Just indentation changes.

Signed-off-by: Enrico Weigelt, metux IT consult
Signed-off-by: Richard Weinberger
---
 arch/um/drivers/Kconfig | 352 ++++++++++++++++++++++++------------------------
 1 file changed, 176 insertions(+), 176 deletions(-)
(limited to 'arch')

diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 2b1aaf7755aa..2638e46f50cc 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -11,58 +11,58 @@ config STDERR_CONSOLE
config SSL
 bool "Virtual serial line"
 help
- The User-Mode Linux environment allows you to create virtual serial
- lines on the UML that are usually made to show up on the host as
- ttys or ptys.
+ The User-Mode Linux environment allows you to create virtual serial
+ lines on the UML that are usually made to show up on the host as
+ ttys or ptys.
- See for more
- information and command line examples of how to use this facility.
+ See for more
+ information and command line examples of how to use this facility.
- Unless you have a specific reason for disabling this, say Y.
+ Unless you have a specific reason for disabling this, say Y.

config NULL_CHAN
 bool "null channel support"
 help
- This option enables support for attaching UML consoles and serial
- lines to a device similar to /dev/null. Data written to it disappears
- and there is never any data to be read.
+ This option enables support for attaching UML consoles and serial
+ lines to a device similar to /dev/null. Data written to it disappears
+ and there is never any data to be read.

config PORT_CHAN
 bool "port channel support"
 help
- This option enables support for attaching UML consoles and serial
- lines to host portals. They may be accessed with 'telnet
- '. Any number of consoles and serial lines may be
- attached to a single portal, although what UML device you get when
- you telnet to that portal will be unpredictable.
- It is safe to say 'Y' here.
+ This option enables support for attaching UML consoles and serial
+ lines to host portals. They may be accessed with 'telnet
+ '. Any number of consoles and serial lines may be
+ attached to a single portal, although what UML device you get when
+ you telnet to that portal will be unpredictable.
+ It is safe to say 'Y' here.
config PTY_CHAN bool "pty channel support" help - This option enables support for attaching UML consoles and serial - lines to host pseudo-terminals. Access to both traditional - pseudo-terminals (/dev/pty*) and pts pseudo-terminals are controlled - with this option. The assignment of UML devices to host devices - will be announced in the kernel message log. - It is safe to say 'Y' here. + This option enables support for attaching UML consoles and serial + lines to host pseudo-terminals. Access to both traditional + pseudo-terminals (/dev/pty*) and pts pseudo-terminals are controlled + with this option. The assignment of UML devices to host devices + will be announced in the kernel message log. + It is safe to say 'Y' here. config TTY_CHAN bool "tty channel support" help - This option enables support for attaching UML consoles and serial - lines to host terminals. Access to both virtual consoles - (/dev/tty*) and the slave side of pseudo-terminals (/dev/ttyp* and - /dev/pts/*) are controlled by this option. - It is safe to say 'Y' here. + This option enables support for attaching UML consoles and serial + lines to host terminals. Access to both virtual consoles + (/dev/tty*) and the slave side of pseudo-terminals (/dev/ttyp* and + /dev/pts/*) are controlled by this option. + It is safe to say 'Y' here. config XTERM_CHAN bool "xterm channel support" help - This option enables support for attaching UML consoles and serial - lines to xterms. Each UML device so assigned will be brought up in - its own xterm. - It is safe to say 'Y' here. + This option enables support for attaching UML consoles and serial + lines to xterms. Each UML device so assigned will be brought up in + its own xterm. + It is safe to say 'Y' here. config NOCONFIG_CHAN bool @@ -72,43 +72,43 @@ config CON_ZERO_CHAN string "Default main console channel initialization" default "fd:0,fd:1" help - This is the string describing the channel to which the main console - will be attached by default. This value can be overridden from the - command line. The default value is "fd:0,fd:1", which attaches the - main console to stdin and stdout. - It is safe to leave this unchanged. + This is the string describing the channel to which the main console + will be attached by default. This value can be overridden from the + command line. The default value is "fd:0,fd:1", which attaches the + main console to stdin and stdout. + It is safe to leave this unchanged. config CON_CHAN string "Default console channel initialization" default "xterm" help - This is the string describing the channel to which all consoles - except the main console will be attached by default. This value can - be overridden from the command line. The default value is "xterm", - which brings them up in xterms. - It is safe to leave this unchanged, although you may wish to change - this if you expect the UML that you build to be run in environments - which don't have X or xterm available. + This is the string describing the channel to which all consoles + except the main console will be attached by default. This value can + be overridden from the command line. The default value is "xterm", + which brings them up in xterms. + It is safe to leave this unchanged, although you may wish to change + this if you expect the UML that you build to be run in environments + which don't have X or xterm available. 
config SSL_CHAN string "Default serial line channel initialization" default "pty" help - This is the string describing the channel to which the serial lines - will be attached by default. This value can be overridden from the - command line. The default value is "pty", which attaches them to - traditional pseudo-terminals. - It is safe to leave this unchanged, although you may wish to change - this if you expect the UML that you build to be run in environments - which don't have a set of /dev/pty* devices. + This is the string describing the channel to which the serial lines + will be attached by default. This value can be overridden from the + command line. The default value is "pty", which attaches them to + traditional pseudo-terminals. + It is safe to leave this unchanged, although you may wish to change + this if you expect the UML that you build to be run in environments + which don't have a set of /dev/pty* devices. config UML_SOUND tristate "Sound support" help - This option enables UML sound support. If enabled, it will pull in - soundcore and the UML hostaudio relay, which acts as a intermediary - between the host's dsp and mixer devices and the UML sound system. - It is safe to say 'Y' here. + This option enables UML sound support. If enabled, it will pull in + soundcore and the UML hostaudio relay, which acts as a intermediary + between the host's dsp and mixer devices and the UML sound system. + It is safe to say 'Y' here. config SOUND tristate @@ -131,107 +131,107 @@ menu "UML Network Devices" config UML_NET bool "Virtual network device" help - While the User-Mode port cannot directly talk to any physical - hardware devices, this choice and the following transport options - provide one or more virtual network devices through which the UML - kernels can talk to each other, the host, and with the host's help, - machines on the outside world. + While the User-Mode port cannot directly talk to any physical + hardware devices, this choice and the following transport options + provide one or more virtual network devices through which the UML + kernels can talk to each other, the host, and with the host's help, + machines on the outside world. - For more information, including explanations of the networking and - sample configurations, see - . + For more information, including explanations of the networking and + sample configurations, see + . - If you'd like to be able to enable networking in the User-Mode - linux environment, say Y; otherwise say N. Note that you must - enable at least one of the following transport options to actually - make use of UML networking. + If you'd like to be able to enable networking in the User-Mode + linux environment, say Y; otherwise say N. Note that you must + enable at least one of the following transport options to actually + make use of UML networking. config UML_NET_ETHERTAP bool "Ethertap transport" depends on UML_NET help - The Ethertap User-Mode Linux network transport allows a single - running UML to exchange packets with its host over one of the - host's Ethertap devices, such as /dev/tap0. Additional running - UMLs can use additional Ethertap devices, one per running UML. - While the UML believes it's on a (multi-device, broadcast) virtual - Ethernet network, it's in fact communicating over a point-to-point - link with the host. - - To use this, your host kernel must have support for Ethertap - devices. Also, if your host kernel is 2.4.x, it must have - CONFIG_NETLINK_DEV configured as Y or M. 
- - For more information, see - That site - has examples of the UML command line to use to enable Ethertap - networking. - - If you'd like to set up an IP network with the host and/or the - outside world, say Y to this, the Daemon Transport and/or the - Slip Transport. You'll need at least one of them, but may choose - more than one without conflict. If you don't need UML networking, - say N. + The Ethertap User-Mode Linux network transport allows a single + running UML to exchange packets with its host over one of the + host's Ethertap devices, such as /dev/tap0. Additional running + UMLs can use additional Ethertap devices, one per running UML. + While the UML believes it's on a (multi-device, broadcast) virtual + Ethernet network, it's in fact communicating over a point-to-point + link with the host. + + To use this, your host kernel must have support for Ethertap + devices. Also, if your host kernel is 2.4.x, it must have + CONFIG_NETLINK_DEV configured as Y or M. + + For more information, see + That site + has examples of the UML command line to use to enable Ethertap + networking. + + If you'd like to set up an IP network with the host and/or the + outside world, say Y to this, the Daemon Transport and/or the + Slip Transport. You'll need at least one of them, but may choose + more than one without conflict. If you don't need UML networking, + say N. config UML_NET_TUNTAP bool "TUN/TAP transport" depends on UML_NET help - The UML TUN/TAP network transport allows a UML instance to exchange - packets with the host over a TUN/TAP device. This option will only - work with a 2.4 host, unless you've applied the TUN/TAP patch to - your 2.2 host kernel. + The UML TUN/TAP network transport allows a UML instance to exchange + packets with the host over a TUN/TAP device. This option will only + work with a 2.4 host, unless you've applied the TUN/TAP patch to + your 2.2 host kernel. - To use this transport, your host kernel must have support for TUN/TAP - devices, either built-in or as a module. + To use this transport, your host kernel must have support for TUN/TAP + devices, either built-in or as a module. config UML_NET_SLIP bool "SLIP transport" depends on UML_NET help - The slip User-Mode Linux network transport allows a running UML to - network with its host over a point-to-point link. Unlike Ethertap, - which can carry any Ethernet frame (and hence even non-IP packets), - the slip transport can only carry IP packets. - - To use this, your host must support slip devices. - - For more information, see - . - has examples of the UML command line to use to enable slip - networking, and details of a few quirks with it. - - The Ethertap Transport is preferred over slip because of its - limitations. If you prefer slip, however, say Y here. Otherwise - choose the Multicast transport (to network multiple UMLs on - multiple hosts), Ethertap (to network with the host and the - outside world), and/or the Daemon transport (to network multiple - UMLs on a single host). You may choose more than one without - conflict. If you don't need UML networking, say N. + The slip User-Mode Linux network transport allows a running UML to + network with its host over a point-to-point link. Unlike Ethertap, + which can carry any Ethernet frame (and hence even non-IP packets), + the slip transport can only carry IP packets. + + To use this, your host must support slip devices. + + For more information, see + . 
+ has examples of the UML command line to use to enable slip + networking, and details of a few quirks with it. + + The Ethertap Transport is preferred over slip because of its + limitations. If you prefer slip, however, say Y here. Otherwise + choose the Multicast transport (to network multiple UMLs on + multiple hosts), Ethertap (to network with the host and the + outside world), and/or the Daemon transport (to network multiple + UMLs on a single host). You may choose more than one without + conflict. If you don't need UML networking, say N. config UML_NET_DAEMON bool "Daemon transport" depends on UML_NET help - This User-Mode Linux network transport allows one or more running - UMLs on a single host to communicate with each other, but not to - the host. - - To use this form of networking, you'll need to run the UML - networking daemon on the host. - - For more information, see - That site - has examples of the UML command line to use to enable Daemon - networking. - - If you'd like to set up a network with other UMLs on a single host, - say Y. If you need a network between UMLs on multiple physical - hosts, choose the Multicast Transport. To set up a network with - the host and/or other IP machines, say Y to the Ethertap or Slip - transports. You'll need at least one of them, but may choose - more than one without conflict. If you don't need UML networking, - say N. + This User-Mode Linux network transport allows one or more running + UMLs on a single host to communicate with each other, but not to + the host. + + To use this form of networking, you'll need to run the UML + networking daemon on the host. + + For more information, see + That site + has examples of the UML command line to use to enable Daemon + networking. + + If you'd like to set up a network with other UMLs on a single host, + say Y. If you need a network between UMLs on multiple physical + hosts, choose the Multicast Transport. To set up a network with + the host and/or other IP machines, say Y to the Ethertap or Slip + transports. You'll need at least one of them, but may choose + more than one without conflict. If you don't need UML networking, + say N. config UML_NET_VECTOR bool "Vector I/O high performance network devices" @@ -270,26 +270,26 @@ config UML_NET_MCAST bool "Multicast transport" depends on UML_NET help - This Multicast User-Mode Linux network transport allows multiple - UMLs (even ones running on different host machines!) to talk to - each other over a virtual ethernet network. However, it requires - at least one UML with one of the other transports to act as a - bridge if any of them need to be able to talk to their hosts or any - other IP machines. - - To use this, your host kernel(s) must support IP Multicasting. - - For more information, see - That site - has examples of the UML command line to use to enable Multicast - networking, and notes about the security of this approach. - - If you need UMLs on multiple physical hosts to communicate as if - they shared an Ethernet network, say Y. If you need to communicate - with other IP machines, make sure you select one of the other - transports (possibly in addition to Multicast; they're not - exclusive). If you don't need to network UMLs say N to each of - the transports. + This Multicast User-Mode Linux network transport allows multiple + UMLs (even ones running on different host machines!) to talk to + each other over a virtual ethernet network. 
However, it requires + at least one UML with one of the other transports to act as a + bridge if any of them need to be able to talk to their hosts or any + other IP machines. + + To use this, your host kernel(s) must support IP Multicasting. + + For more information, see + That site + has examples of the UML command line to use to enable Multicast + networking, and notes about the security of this approach. + + If you need UMLs on multiple physical hosts to communicate as if + they shared an Ethernet network, say Y. If you need to communicate + with other IP machines, make sure you select one of the other + transports (possibly in addition to Multicast; they're not + exclusive). If you don't need to network UMLs say N to each of + the transports. config UML_NET_PCAP bool "pcap transport" @@ -300,9 +300,9 @@ config UML_NET_PCAP UML act as a network monitor for the host. You must have libcap installed in order to build the pcap transport into UML. - For more information, see - That site - has examples of the UML command line to use to enable this option. + For more information, see + That site + has examples of the UML command line to use to enable this option. If you intend to use UML as a network monitor for the host, say Y here. Otherwise, say N. @@ -311,27 +311,27 @@ config UML_NET_SLIRP bool "SLiRP transport" depends on UML_NET help - The SLiRP User-Mode Linux network transport allows a running UML - to network by invoking a program that can handle SLIP encapsulated - packets. This is commonly (but not limited to) the application - known as SLiRP, a program that can re-socket IP packets back onto - the host on which it is run. Only IP packets are supported, - unlike other network transports that can handle all Ethernet - frames. In general, slirp allows the UML the same IP connectivity - to the outside world that the host user is permitted, and unlike - other transports, SLiRP works without the need of root level - privleges, setuid binaries, or SLIP devices on the host. This - also means not every type of connection is possible, but most - situations can be accommodated with carefully crafted slirp - commands that can be passed along as part of the network device's - setup string. The effect of this transport on the UML is similar - that of a host behind a firewall that masquerades all network - connections passing through it (but is less secure). - - To use this you should first have slirp compiled somewhere - accessible on the host, and have read its documentation. If you - don't need UML networking, say N. - - Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" + The SLiRP User-Mode Linux network transport allows a running UML + to network by invoking a program that can handle SLIP encapsulated + packets. This is commonly (but not limited to) the application + known as SLiRP, a program that can re-socket IP packets back onto + he host on which it is run. Only IP packets are supported, + unlike other network transports that can handle all Ethernet + frames. In general, slirp allows the UML the same IP connectivity + to the outside world that the host user is permitted, and unlike + other transports, SLiRP works without the need of root level + privleges, setuid binaries, or SLIP devices on the host. This + also means not every type of connection is possible, but most + situations can be accommodated with carefully crafted slirp + commands that can be passed along as part of the network device's + setup string. 
The effect of this transport on the UML is similar + to that of a host behind a firewall that masquerades all network + connections passing through it (but is less secure). + + To use this you should first have slirp compiled somewhere + accessible on the host, and have read its documentation. If you + don't need UML networking, say N. + + Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" endmenu -- cgit v1.2.3-59-g8ed1b From 9ca55299f2ee0a2ccf41d99bfc02d5ad3118e03b Mon Sep 17 00:00:00 2001 From: Daniel Walter Date: Tue, 2 Apr 2019 10:43:32 +0200 Subject: um: Do not unlock mutex that is not held. Return error instead of trying to unlock a mutex that is not held. Signed-off-by: Daniel Walter Reviewed-by: Anton Ivanov Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index aca09be2373e..33c1cd6a12ac 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -276,14 +276,14 @@ static int ubd_setup_common(char *str, int *index_out, char **error_out) str++; if(!strcmp(str, "sync")){ global_openflags = of_sync(global_openflags); - goto out1; + return err; } err = -EINVAL; major = simple_strtoul(str, &end, 0); if((*end != '\0') || (end == str)){ *error_out = "Didn't parse major number"; - goto out1; + return err; } mutex_lock(&ubd_lock); -- cgit v1.2.3-59-g8ed1b From 689a58605b63173acb0a8cf954af6a8f60440c93 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Wed, 10 Apr 2019 11:11:23 -0700 Subject: uml: fix a boot splat wrt use of cpu_all_mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory: 509108K/542612K available (3835K kernel code, 919K rwdata, 1028K rodata, 129K init, 211K bss, 33504K reserved, 0K cma-reserved) NR_IRQS: 15 clocksource: timer: mask: 0xffffffffffffffff max_cycles: 0x1cd42e205, max_idle_ns: 881590404426 ns ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/time/clockevents.c:458 clockevents_register_device+0x72/0x140 posix-timer cpumask == cpu_all_mask, using cpu_possible_mask instead Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.1.0-rc4-00048-ged79cc87302b #4 Stack: 604ebda0 603c5370 604ebe20 6046fd17 00000000 6006fcbb 604ebdb0 603c53b5 604ebe10 6003bfc4 604ebdd0 9000001ca Call Trace: [<6006fcbb>] ? printk+0x0/0x94 [<60083160>] ? clockevents_register_device+0x72/0x140 [<6001f16e>] show_stack+0x13b/0x155 [<603c5370>] ? dump_stack_print_info+0xe2/0xeb [<6006fcbb>] ? printk+0x0/0x94 [<603c53b5>] dump_stack+0x2a/0x2c [<6003bfc4>] __warn+0x10e/0x13e [<60070320>] ? vprintk_func+0xc8/0xcf [<60030fd6>] ? block_signals+0x0/0x16 [<6006fcbb>] ? printk+0x0/0x94 [<6003c08b>] warn_slowpath_fmt+0x97/0x99 [<600311a1>] ? set_signals+0x0/0x3f [<6003bff4>] ? warn_slowpath_fmt+0x0/0x99 [<600842cb>] ? tick_oneshot_mode_active+0x44/0x4f [<60030fd6>] ? block_signals+0x0/0x16 [<6006fcbb>] ? printk+0x0/0x94 [<6007d2d5>] ? __clocksource_select+0x20/0x1b1 [<60030fd6>] ? block_signals+0x0/0x16 [<6006fcbb>] ? printk+0x0/0x94 [<60083160>] clockevents_register_device+0x72/0x140 [<60031192>] ? get_signals+0x0/0xf [<60030fd6>] ? block_signals+0x0/0x16 [<6006fcbb>] ? printk+0x0/0x94 [<60002eec>] um_timer_setup+0xc8/0xca [<60001b59>] start_kernel+0x47f/0x57e [<600035bc>] start_kernel_proc+0x49/0x4d [<6006c483>] ? kmsg_dump_register+0x82/0x8a [<6001de62>] new_thread_handler+0x81/0xb2 [<60003571>] ?
kmsg_dumper_stdout_init+0x1a/0x1c [<60020c75>] uml_finishsetup+0x54/0x59 random: get_random_bytes called from init_oops_id+0x27/0x34 with crng_init=0 ---[ end trace 00173d0117a88acb ]--- Calibrating delay loop... 6941.90 BogoMIPS (lpj=34709504) Signed-off-by: Maciej Żenczykowski Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: linux-um@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Richard Weinberger --- arch/um/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 052de4c8acb2..0c572a48158e 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -56,7 +56,7 @@ static int itimer_one_shot(struct clock_event_device *evt) static struct clock_event_device timer_clockevent = { .name = "posix-timer", .rating = 250, - .cpumask = cpu_all_mask, + .cpumask = cpu_possible_mask, .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_state_shutdown = itimer_shutdown, -- cgit v1.2.3-59-g8ed1b From 4b6b4c902947cae374593e601400a7bdd3d7f7a4 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 11 Apr 2019 11:49:41 +0200 Subject: um: remove unused variable The buf variable is unused. Remove it. Signed-off-by: Bartosz Golaszewski Reviewed-by: Anton Ivanov Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/kernel/skas/uaccess.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c index 7f06fdbc7ee1..bd3cb694322c 100644 --- a/arch/um/kernel/skas/uaccess.c +++ b/arch/um/kernel/skas/uaccess.c @@ -59,7 +59,6 @@ static pte_t *maybe_map(unsigned long virt, int is_write) static int do_op_one_page(unsigned long addr, int len, int is_write, int (*op)(unsigned long addr, int len, void *arg), void *arg) { - jmp_buf buf; struct page *page; pte_t *pte; int n; -- cgit v1.2.3-59-g8ed1b From 0d4e5ac7e78035950d564e65c38ce148cb9af681 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 11 Apr 2019 11:49:42 +0200 Subject: um: remove uses of variable length arrays While the affected code is run in user-mode, the build still warns about it. Convert all uses of VLA to dynamic allocations. 
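For illustration only (not part of the patch below, whose hunks are authoritative), a minimal user-space sketch of the VLA-to-heap conversion pattern this commit applies; the function names here are hypothetical:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Hypothetical helper standing in for whatever consumes the buffer. */
	static int use_path(const char *path) { return path[0] ? 0 : -EINVAL; }

	/* Before: a variable length array, which -Wvla flags even in
	 * user-mode code. */
	static int pid_path_vla(const char *dir)
	{
		char file[strlen(dir) + sizeof("/pid")];	/* VLA */

		snprintf(file, sizeof(file), "%s/pid", dir);
		return use_path(file);
	}

	/* After: the same buffer allocated dynamically; every exit path
	 * must now release it, typically via a common cleanup path. */
	static int pid_path_heap(const char *dir)
	{
		size_t len = strlen(dir) + sizeof("/pid");
		char *file = malloc(len);
		int err;

		if (!file)
			return -ENOMEM;
		snprintf(file, len, "%s/pid", dir);
		err = use_path(file);
		free(file);
		return err;
	}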
Signed-off-by: Bartosz Golaszewski Signed-off-by: Richard Weinberger --- arch/um/os-Linux/umid.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index 998fbb445458..e261656fe9d7 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -135,12 +135,18 @@ out: */ static inline int is_umdir_used(char *dir) { - char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; - char pid[sizeof("nnnnn\0")], *end; + char pid[sizeof("nnnnn\0")], *end, *file; int dead, fd, p, n, err; + size_t filelen; - n = snprintf(file, sizeof(file), "%s/pid", dir); - if (n >= sizeof(file)) { + err = asprintf(&file, "%s/pid", dir); + if (err < 0) + return 0; + + filelen = strlen(file); + + n = snprintf(file, filelen, "%s/pid", dir); + if (n >= filelen) { printk(UM_KERN_ERR "is_umdir_used - pid filename too long\n"); err = -E2BIG; goto out; @@ -185,6 +191,7 @@ static inline int is_umdir_used(char *dir) out_close: close(fd); out: + free(file); return 0; } @@ -210,18 +217,21 @@ static int umdir_take_if_dead(char *dir) static void __init create_pid_file(void) { - char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; - char pid[sizeof("nnnnn\0")]; + char pid[sizeof("nnnnn\0")], *file; int fd, n; - if (umid_file_name("pid", file, sizeof(file))) + file = malloc(strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")); + if (!file) return; + if (umid_file_name("pid", file, sizeof(file))) + goto out; + fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644); if (fd < 0) { printk(UM_KERN_ERR "Open of machine pid file \"%s\" failed: " "%s\n", file, strerror(errno)); - return; + goto out; } snprintf(pid, sizeof(pid), "%d\n", getpid()); @@ -231,6 +241,8 @@ static void __init create_pid_file(void) errno); close(fd); +out: + free(file); } int __init set_umid(char *name) @@ -385,13 +397,19 @@ __uml_setup("uml_dir=", set_uml_dir, static void remove_umid_dir(void) { - char dir[strlen(uml_dir) + UMID_LEN + 1], err; + char *dir, err; + + dir = malloc(strlen(uml_dir) + UMID_LEN + 1); + if (!dir) + return; sprintf(dir, "%s%s", uml_dir, umid); err = remove_files_and_dir(dir); if (err) os_warn("%s - remove_files_and_dir failed with err = %d\n", __func__, err); + + free(dir); } __uml_exitcall(remove_umid_dir); -- cgit v1.2.3-59-g8ed1b From ea70d791c1687529edf9bb9a69530a66e1ca4584 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 11 Apr 2019 11:49:43 +0200 Subject: um: define set_pte_at() as a static inline function, not a macro When defined as macro, the mm argument is unused and subsequently the variable passed as mm is considered unused by the compiler. This fixes a build warning. 
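As a hedged aside (illustrative names, not the kernel's), the warning class this commit silences can be reproduced with a standalone snippet: a macro drops its argument at preprocessing time, while a static inline function counts as a real use:

	#include <stdio.h>

	/* Macro form: 'ctx' vanishes during preprocessing, so a variable
	 * passed only as 'ctx' is never "used" as far as the compiler can
	 * tell, and -Wunused-but-set-variable fires at the call site. */
	#define store_macro(ctx, ptr, val)	(*(ptr) = (val))

	/* Inline form: 'ctx' is a real (if ignored) parameter, so passing
	 * a variable to it counts as a use and the warning goes away. */
	static inline void store_inline(void *ctx, int *ptr, int val)
	{
		(void)ctx;	/* intentionally unused, like 'mm' in set_pte_at() */
		*ptr = val;
	}

	int main(void)
	{
		int cell = 0;
		void *ctx = &cell;

		store_macro(ctx, &cell, 1);	/* 'ctx' expands away here */
		store_inline(ctx, &cell, 2);	/* 'ctx' is genuinely passed here */
		printf("%d\n", cell);
		return 0;
	}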
Signed-off-by: Bartosz Golaszewski Reviewed-by: Geert Uytterhoeven Reviewed-by: Anton Ivanov Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/asm/pgtable.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 9c04562310b3..b377df76cc28 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -263,7 +263,12 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) *pteptr = pte_mknewpage(*pteptr); if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); } -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) + +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *pteptr, pte_t pteval) +{ + set_pte(pteptr, pteval); +} #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) -- cgit v1.2.3-59-g8ed1b From 1987b1b8f9f17a06255877e7917d0bb5b5377774 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 11 Apr 2019 11:49:44 +0200 Subject: um: irq: don't set the chip for all irqs Setting a chip for an interrupt marks it as allocated. Since UM doesn't support dynamic interrupt numbers (yet), it means we cannot simply increase NR_IRQS and then use the free irqs between LAST_IRQ and NR_IRQS with gpio-mockup or iio testing drivers, as irq_alloc_descs() will fail when it can neither find an unallocated range of interrupts nor expand the range. Only call irq_set_chip_and_handler() for irqs until LAST_IRQ. Signed-off-by: Bartosz Golaszewski Reviewed-by: Anton Ivanov Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index f4874b7ec503..598d7b3d9355 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -479,7 +479,7 @@ void __init init_IRQ(void) irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq); - for (i = 1; i < NR_IRQS; i++) + for (i = 1; i < LAST_IRQ; i++) irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq); /* Initialize EPOLL Loop */ os_setup_epoll(); -- cgit v1.2.3-59-g8ed1b From a95a9e5f0fdf9ef7c1b1cbf2788cb0df28a97bfb Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Thu, 28 Mar 2019 02:58:45 +0530 Subject: arch:sparc:kernel/uprobes.c : Remove duplicate header Remove duplicate header which is included twice. Signed-off-by: Jagadeesh Pagadala Reviewed-by: Mukesh Ojha Signed-off-by: David S. Miller --- arch/sparc/kernel/uprobes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/kernel/uprobes.c b/arch/sparc/kernel/uprobes.c index d852ae56ddc1..c44bf5b85de8 100644 --- a/arch/sparc/kernel/uprobes.c +++ b/arch/sparc/kernel/uprobes.c @@ -29,7 +29,6 @@ #include #include -#include /* Compute the address of the breakpoint instruction and return it. * -- cgit v1.2.3-59-g8ed1b From 269fe56551c68cde57e477a6810ed57921dfe54f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 3 Apr 2019 17:32:24 +0900 Subject: sparc: vdso: add FORCE to the build rule of %.so $(call if_changed,...) must have FORCE as a prerequisite. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Signed-off-by: David S.
Miller --- arch/sparc/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index 74e97f77e23b..83c4b463cb3d 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -68,7 +68,7 @@ CFLAGS_REMOVE_vdso-note.o = -pg CFLAGS_REMOVE_vclock_gettime.o = -pg $(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg +$(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -- cgit v1.2.3-59-g8ed1b From 031abf0b70cb6804eefb11340463a2277e52f853 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:39 +0200 Subject: sparc/iommu: use !PageHighMem to check if a page has a kernel mapping This deobfuscates the check a bit, and prepares for future changes. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index e8d5d73ca40d..dcdadac03fdf 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -273,7 +273,8 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, * XXX Is this a good assumption? * XXX What if someone else unmaps it here and races us? */ - if ((page = (unsigned long) page_address(sg_page(sg))) != 0) { + if (!PageHighMem(sg_page(sg))) { + page = (unsigned long)page_address(sg_page(sg)); for (i = 0; i < n; i++) { if (page != oldpage) { /* Already flushed? */ flush_page_for_dma(page); -- cgit v1.2.3-59-g8ed1b From a7fce1f7ca2f092fe44a17cb158deda97060aab4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:40 +0200 Subject: sparc/iommu: use sbus_iommu_unmap_page in sbus_iommu_unmap_sg Use the page-level helper instead of duplicating the logic, while also fixing the incorrect handling of larger than page sized offsets in the sg variant. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index dcdadac03fdf..f47a6ce0acaa 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -321,11 +321,11 @@ static void sbus_iommu_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; - int i, n; + int i; for_each_sg(sgl, sg, nents, i) { - n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT; - iommu_release_one(dev, sg->dma_address & PAGE_MASK, n); + sbus_iommu_unmap_page(dev, sg->dma_address, sg->length, dir, + attrs); sg->dma_address = 0x21212121; } } -- cgit v1.2.3-59-g8ed1b From f25b23bc156fef3211fe4adf9692eca5ce2fd082 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:41 +0200 Subject: sparc/iommu: merge iommu_release_one and sbus_iommu_unmap_page There is only one caller of iommu_release_one left, so merge it into that one to clean things up a bit. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S.
Miller --- arch/sparc/mm/iommu.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index f47a6ce0acaa..7cb9ddda7531 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -291,14 +291,17 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, return nents; } -static void iommu_release_one(struct device *dev, u32 busa, int npages) +static void sbus_iommu_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t len, enum dma_data_direction dir, unsigned long attrs) { struct iommu_struct *iommu = dev->archdata.iommu; - int ioptex; - int i; + unsigned int busa = dma_addr & PAGE_MASK; + unsigned long off = dma_addr & ~PAGE_MASK; + unsigned int npages = (off + len + PAGE_SIZE-1) >> PAGE_SHIFT; + unsigned int ioptex = (busa - iommu->start) >> PAGE_SHIFT; + unsigned int i; BUG_ON(busa < iommu->start); - ioptex = (busa - iommu->start) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { iopte_val(iommu->page_table[ioptex + i]) = 0; iommu_invalidate_page(iommu->regs, busa); @@ -307,16 +310,6 @@ static void iommu_release_one(struct device *dev, u32 busa, int npages) bit_map_clear(&iommu->usemap, ioptex, npages); } -static void sbus_iommu_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t len, enum dma_data_direction dir, unsigned long attrs) -{ - unsigned long off = dma_addr & ~PAGE_MASK; - int npages; - - npages = (off + len + PAGE_SIZE-1) >> PAGE_SHIFT; - iommu_release_one(dev, dma_addr & PAGE_MASK, npages); -} - static void sbus_iommu_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { -- cgit v1.2.3-59-g8ed1b From ff5cbec0c3ea8b96c4cb7bcd9f484d8665d394e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:42 +0200 Subject: sparc/iommu: create a common helper for map_sg Share the code for the global and per-page flush map_sg loops using a simple bool parameter to disable the per-page flush for the former variant. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 7cb9ddda7531..f90d943a3a27 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -241,25 +241,9 @@ static dma_addr_t sbus_iommu_map_page_pflush(struct device *dev, return __sbus_iommu_map_page(dev, page, offset, len); } -static int sbus_iommu_map_sg_gflush(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - struct scatterlist *sg; - int i, n; - - flush_page_for_dma(0); - - for_each_sg(sgl, sg, nents, i) { - n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT; - sg->dma_address = iommu_get_one(dev, sg_page(sg), n) + sg->offset; - sg->dma_length = sg->length; - } - - return nents; -} - -static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) +static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs, + bool per_page_flush) { unsigned long page, oldpage = 0; struct scatterlist *sg; @@ -273,7 +257,7 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, * XXX Is this a good assumption? 
* XXX What if someone else unmaps it here and races us? */ - if (!PageHighMem(sg_page(sg))) { + if (per_page_flush && !PageHighMem(sg_page(sg))) { page = (unsigned long)page_address(sg_page(sg)); for (i = 0; i < n; i++) { if (page != oldpage) { /* Already flushed? */ @@ -291,6 +275,19 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, return nents; } +static int sbus_iommu_map_sg_gflush(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + flush_page_for_dma(0); + return __sbus_iommu_map_sg(dev, sgl, nents, dir, attrs, false); +} + +static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + return __sbus_iommu_map_sg(dev, sgl, nents, dir, attrs, true); +} + static void sbus_iommu_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t len, enum dma_data_direction dir, unsigned long attrs) { -- cgit v1.2.3-59-g8ed1b From b82059428c0577c2ec082974d7956291d5eae2cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:43 +0200 Subject: sparc/iommu: pass a physical address to iommu_get_one No need for the page structure, just the paddr / pfn. This is going to simplify fixes to the callers. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index f90d943a3a27..19d9266e4049 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -175,16 +175,17 @@ static void iommu_flush_iotlb(iopte_t *iopte, unsigned int niopte) } } -static u32 iommu_get_one(struct device *dev, struct page *page, int npages) +static u32 iommu_get_one(struct device *dev, phys_addr_t paddr, int npages) { struct iommu_struct *iommu = dev->archdata.iommu; int ioptex; iopte_t *iopte, *iopte0; unsigned int busa, busa0; + unsigned long pfn = __phys_to_pfn(paddr); int i; /* page color = pfn of page */ - ioptex = bit_map_string_get(&iommu->usemap, npages, page_to_pfn(page)); + ioptex = bit_map_string_get(&iommu->usemap, npages, pfn); if (ioptex < 0) panic("iommu out"); busa0 = iommu->start + (ioptex << PAGE_SHIFT); @@ -193,11 +194,11 @@ static u32 iommu_get_one(struct device *dev, struct page *page, int npages) busa = busa0; iopte = iopte0; for (i = 0; i < npages; i++) { - iopte_val(*iopte) = MKIOPTE(page_to_pfn(page), IOPERM); + iopte_val(*iopte) = MKIOPTE(pfn, IOPERM); iommu_invalidate_page(iommu->regs, busa); busa += PAGE_SIZE; iopte++; - page++; + pfn++; } iommu_flush_iotlb(iopte0, npages); @@ -215,7 +216,7 @@ static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, /* XXX So what is maxphys for us and how do drivers know it? 
*/ if (!len || len > 256 * 1024) return DMA_MAPPING_ERROR; - return iommu_get_one(dev, virt_to_page(vaddr), npages) + off; + return iommu_get_one(dev, virt_to_phys(vaddr), npages) + off; } static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev, @@ -268,7 +269,7 @@ static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl, } } - sg->dma_address = iommu_get_one(dev, sg_page(sg), n) + sg->offset; + sg->dma_address = iommu_get_one(dev, sg_phys(sg), n) + sg->offset; sg->dma_length = sg->length; } -- cgit v1.2.3-59-g8ed1b From 8668b38c1c7720baf76da15a7a7eef43ae0c65a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:44 +0200 Subject: sparc/iommu: move per-page flushing into __sbus_iommu_map_page This prepares for reusing __sbus_iommu_map_page in the map_sg path. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 19d9266e4049..7e191c8ae46a 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -207,15 +207,25 @@ static u32 iommu_get_one(struct device *dev, phys_addr_t paddr, int npages) } static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t len) + unsigned long offset, size_t len, bool per_page_flush) { void *vaddr = page_address(page) + offset; unsigned long off = (unsigned long)vaddr & ~PAGE_MASK; unsigned long npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - + /* XXX So what is maxphys for us and how do drivers know it? */ if (!len || len > 256 * 1024) return DMA_MAPPING_ERROR; + + if (per_page_flush) { + unsigned long p = (unsigned long)vaddr & PAGE_MASK; + + while (p < (unsigned long)vaddr + len) { + flush_page_for_dma(p); + p += PAGE_SIZE; + } + } + return iommu_get_one(dev, virt_to_phys(vaddr), npages) + off; } @@ -224,22 +234,14 @@ static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev, enum dma_data_direction dir, unsigned long attrs) { flush_page_for_dma(0); - return __sbus_iommu_map_page(dev, page, offset, len); + return __sbus_iommu_map_page(dev, page, offset, len, false); } static dma_addr_t sbus_iommu_map_page_pflush(struct device *dev, struct page *page, unsigned long offset, size_t len, enum dma_data_direction dir, unsigned long attrs) { - void *vaddr = page_address(page) + offset; - unsigned long p = ((unsigned long)vaddr) & PAGE_MASK; - - while (p < (unsigned long)vaddr + len) { - flush_page_for_dma(p); - p += PAGE_SIZE; - } - - return __sbus_iommu_map_page(dev, page, offset, len); + return __sbus_iommu_map_page(dev, page, offset, len, true); } static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl, -- cgit v1.2.3-59-g8ed1b From 7e996890b88078011bfb55ce072712d464207dad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:45 +0200 Subject: sparc/iommu: fix __sbus_iommu_map_page for highmem pages __sbus_iommu_map_page currently assumes all pages are mapped into the kernel direct mapping. Switch to using physical address instead of virtual ones for all the normal mapping operations, and only use the virtual addresses for cache flushing when not operating on a highmem page. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. 
Miller --- arch/sparc/mm/iommu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 7e191c8ae46a..37b5ce7657f6 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -209,24 +209,23 @@ static u32 iommu_get_one(struct device *dev, phys_addr_t paddr, int npages) static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t len, bool per_page_flush) { - void *vaddr = page_address(page) + offset; - unsigned long off = (unsigned long)vaddr & ~PAGE_MASK; + phys_addr_t paddr = page_to_phys(page) + offset; + unsigned long off = paddr & ~PAGE_MASK; unsigned long npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* XXX So what is maxphys for us and how do drivers know it? */ if (!len || len > 256 * 1024) return DMA_MAPPING_ERROR; - if (per_page_flush) { - unsigned long p = (unsigned long)vaddr & PAGE_MASK; + if (per_page_flush && !PageHighMem(page)) { + unsigned long vaddr, p; - while (p < (unsigned long)vaddr + len) { + vaddr = (unsigned long)page_address(page) + offset; + for (p = vaddr & PAGE_MASK; p < vaddr + len; p += PAGE_SIZE) flush_page_for_dma(p); - p += PAGE_SIZE; - } } - return iommu_get_one(dev, virt_to_phys(vaddr), npages) + off; + return iommu_get_one(dev, paddr, npages) + off; } static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev, -- cgit v1.2.3-59-g8ed1b From edb1f07203ba8856b24bcddf8326386ba6a03291 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:46 +0200 Subject: sparc/iommu: use __sbus_iommu_map_page to implement the map_sg path This means we handle > PAGE_SIZE offsets fine, and grow the size check so far only performed in the map_page path. We lose the optimization to not double flush a page if it appears in multiple consecutive SG list entries. But at least for block I/O those don't happen anymore since we properly merge in higher layers anyway. Signed-off-by: Christoph Hellwig Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 37b5ce7657f6..8fbc08d14836 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -217,6 +217,11 @@ static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, if (!len || len > 256 * 1024) return DMA_MAPPING_ERROR; + /* + * We expect unmapped highmem pages to be not in the cache. + * XXX Is this a good assumption? + * XXX What if someone else unmaps it here and races us? + */ if (per_page_flush && !PageHighMem(page)) { unsigned long vaddr, p; @@ -247,30 +252,14 @@ static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs, bool per_page_flush) { - unsigned long page, oldpage = 0; struct scatterlist *sg; - int i, j, n; + int j; for_each_sg(sgl, sg, nents, j) { - n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT; - - /* - * We expect unmapped highmem pages to be not in the cache. - * XXX Is this a good assumption? - * XXX What if someone else unmaps it here and races us? - */ - if (per_page_flush && !PageHighMem(sg_page(sg))) { - page = (unsigned long)page_address(sg_page(sg)); - for (i = 0; i < n; i++) { - if (page != oldpage) { /* Already flushed?
*/ - flush_page_for_dma(page); - oldpage = page; - } - page += PAGE_SIZE; - } - } - - sg->dma_address = iommu_get_one(dev, sg_phys(sg), n) + sg->offset; + sg->dma_address =__sbus_iommu_map_page(dev, sg_page(sg), + sg->offset, sg->length, per_page_flush); + if (sg->dma_address == DMA_MAPPING_ERROR) + return 0; sg->dma_length = sg->length; } -- cgit v1.2.3-59-g8ed1b From 376b1371a9f29112ae000cc0cade174a9a670053 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Apr 2019 20:23:47 +0200 Subject: sparc/iommu: merge iommu_get_one and __sbus_iommu_map_page There is only one caller of iommu_get_one left, so merge it into that one to clean things up a bit. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- arch/sparc/mm/iommu.c | 56 ++++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 8fbc08d14836..71ac353032b6 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -175,43 +175,17 @@ static void iommu_flush_iotlb(iopte_t *iopte, unsigned int niopte) } } -static u32 iommu_get_one(struct device *dev, phys_addr_t paddr, int npages) -{ - struct iommu_struct *iommu = dev->archdata.iommu; - int ioptex; - iopte_t *iopte, *iopte0; - unsigned int busa, busa0; - unsigned long pfn = __phys_to_pfn(paddr); - int i; - - /* page color = pfn of page */ - ioptex = bit_map_string_get(&iommu->usemap, npages, pfn); - if (ioptex < 0) - panic("iommu out"); - busa0 = iommu->start + (ioptex << PAGE_SHIFT); - iopte0 = &iommu->page_table[ioptex]; - - busa = busa0; - iopte = iopte0; - for (i = 0; i < npages; i++) { - iopte_val(*iopte) = MKIOPTE(pfn, IOPERM); - iommu_invalidate_page(iommu->regs, busa); - busa += PAGE_SIZE; - iopte++; - pfn++; - } - - iommu_flush_iotlb(iopte0, npages); - - return busa0; -} - static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t len, bool per_page_flush) { + struct iommu_struct *iommu = dev->archdata.iommu; phys_addr_t paddr = page_to_phys(page) + offset; unsigned long off = paddr & ~PAGE_MASK; unsigned long npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long pfn = __phys_to_pfn(paddr); + unsigned int busa, busa0; + iopte_t *iopte, *iopte0; + int ioptex, i; /* XXX So what is maxphys for us and how do drivers know it? */ if (!len || len > 256 * 1024) @@ -230,7 +204,25 @@ static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page, flush_page_for_dma(p); } - return iommu_get_one(dev, paddr, npages) + off; + /* page color = pfn of page */ + ioptex = bit_map_string_get(&iommu->usemap, npages, pfn); + if (ioptex < 0) + panic("iommu out"); + busa0 = iommu->start + (ioptex << PAGE_SHIFT); + iopte0 = &iommu->page_table[ioptex]; + + busa = busa0; + iopte = iopte0; + for (i = 0; i < npages; i++) { + iopte_val(*iopte) = MKIOPTE(pfn, IOPERM); + iommu_invalidate_page(iommu->regs, busa); + busa += PAGE_SIZE; + iopte++; + pfn++; + } + + iommu_flush_iotlb(iopte0, npages); + return busa0 + off; } static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev, -- cgit v1.2.3-59-g8ed1b From 8150a153c013aa2dd1ffae43370b89ac1347a7fb Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 8 May 2019 13:06:42 +1000 Subject: powerpc/64s: Use early_mmu_has_feature() in set_kuap() When implementing the KUAP support on Radix we fixed one case where mmu_has_feature() was being called too early in boot via __put_user_size(). 
However since then some new code in linux-next has created a new path via which we can end up calling mmu_has_feature() too early. On P9 this leads to crashes early in boot if we have both PPC_KUAP and CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG enabled. Our early boot code calls printk() which calls probe_kernel_read(), that does a __copy_from_user_inatomic() which calls into set_kuap() and that uses mmu_has_feature(). At that point in boot we haven't patched MMU features yet so the debug code in mmu_has_feature() complains, and calls printk(). At that point we recurse, eg: ... dump_stack+0xdc probe_kernel_read+0x1a4 check_pointer+0x58 ... printk+0x40 dump_stack_print_info+0xbc dump_stack+0x8 probe_kernel_read+0x1a4 probe_kernel_read+0x19c check_pointer+0x58 ... printk+0x40 cpufeatures_process_feature+0xc8 scan_cpufeatures_subnodes+0x380 of_scan_flat_dt_subnodes+0xb4 dt_cpu_ftrs_scan_callback+0x158 of_scan_flat_dt+0xf0 dt_cpu_ftrs_scan+0x3c early_init_devtree+0x360 early_setup+0x9c And so on for infinity, symptom is a dead system. Even more fun is what happens when using the hash MMU (ie. p8 or p9 with Radix disabled), and when we don't have CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG enabled. With the debug disabled we don't check if static keys have been initialised, we just rely on the jump label. But the jump label defaults to true so we just whack the AMR even though Radix is not enabled. Clearing the AMR is fine, but after we've done the user copy we write (0b11 << 62) into AMR. When using hash that makes all pages with key zero no longer readable or writable. All kernel pages implicitly have key zero, and so all of a sudden the kernel can't read or write any of its memory. Again dead system. In the medium term we have several options for fixing this. probe_kernel_read() doesn't need to touch AMR at all, it's not doing a user access after all, but it uses __copy_from_user_inatomic() just because it's easy, we could fix that. It would also be safe to default to not writing to the AMR during early boot, until we've detected features. But it's not clear that flipping all the MMU features to static_key_false won't introduce other bugs. But for now just switch to early_mmu_has_feature() in set_kuap(), that avoids all the problems with jump labels. It adds the overhead of a global lookup and test, but that's probably trivial compared to the writes to the AMR anyway. Fixes: 890274c2dc4c ("powerpc/64s: Implement KUAP for Radix MMU") Signed-off-by: Michael Ellerman Reviewed-by: Russell Currey --- arch/powerpc/include/asm/book3s/64/kup-radix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index 7679bd0c5af0..f254de956d6a 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -65,7 +65,7 @@ static inline void set_kuap(unsigned long value) { - if (!mmu_has_feature(MMU_FTR_RADIX_KUAP)) + if (!early_mmu_has_feature(MMU_FTR_RADIX_KUAP)) return; /* -- cgit v1.2.3-59-g8ed1b From bc0025b6107c011e8f9411a275d8442a56bd573a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 8 Jan 2019 10:13:56 -0600 Subject: sparc: use struct_size() in kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. 
For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- arch/sparc/kernel/cpumap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c index d1d52822603d..1cb62bfeaa1f 100644 --- a/arch/sparc/kernel/cpumap.c +++ b/arch/sparc/kernel/cpumap.c @@ -194,8 +194,7 @@ static struct cpuinfo_tree *build_cpuinfo_tree(void) n = enumerate_cpuinfo_nodes(tmp_level); - new_tree = kzalloc(sizeof(struct cpuinfo_tree) + - (sizeof(struct cpuinfo_node) * n), GFP_ATOMIC); + new_tree = kzalloc(struct_size(new_tree, nodes, n), GFP_ATOMIC); if (!new_tree) return NULL; -- cgit v1.2.3-59-g8ed1b From f4d9a23d3dad0252f375901bf4ff6523a2c97241 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 12 Feb 2019 11:32:36 +0200 Subject: sparc64: simplify reduce_memory() function The reduce_memory() function clamps the available memory to a limit defined by the "mem=" command line parameter. It takes into account the amount of already reserved memory and excludes it from the limit calculations. Rather than traverse memblocks and remove them by hand, use memblock_reserved_size() to account for the reserved memory and memblock_enforce_memory_limit() to clamp the available memory. Signed-off-by: Mike Rapoport Acked-by: David S. Miller Signed-off-by: David S. Miller --- arch/sparc/mm/init_64.c | 42 ++---------------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index f2d70ff7a284..bc2aaa47bc8a 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2269,19 +2269,6 @@ static unsigned long last_valid_pfn; static void sun4u_pgprot_init(void); static void sun4v_pgprot_init(void); -static phys_addr_t __init available_memory(void) -{ - phys_addr_t available = 0ULL; - phys_addr_t pa_start, pa_end; - u64 i; - - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, - &pa_end, NULL) - available = available + (pa_end - pa_start); - - return available; -} - #define _PAGE_CACHE_4U (_PAGE_CP_4U | _PAGE_CV_4U) #define _PAGE_CACHE_4V (_PAGE_CP_4V | _PAGE_CV_4V) #define __DIRTY_BITS_4U (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U) @@ -2295,33 +2282,8 @@ static phys_addr_t __init available_memory(void) */ static void __init reduce_memory(phys_addr_t limit_ram) { - phys_addr_t avail_ram = available_memory(); - phys_addr_t pa_start, pa_end; - u64 i; - - if (limit_ram >= avail_ram) - return; - - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, - &pa_end, NULL) { - phys_addr_t region_size = pa_end - pa_start; - phys_addr_t clip_start = pa_start; - - avail_ram = avail_ram - region_size; - /* Are we consuming too much?
*/ - if (avail_ram < limit_ram) { - phys_addr_t give_back = limit_ram - avail_ram; - - region_size = region_size - give_back; - clip_start = clip_start + give_back; - } - - memblock_remove(clip_start, region_size); - - if (avail_ram <= limit_ram) - break; - i = 0UL; - } + limit_ram += memblock_reserved_size(); + memblock_enforce_memory_limit(limit_ram); } void __init paging_init(void) -- cgit v1.2.3-59-g8ed1b From e6eb5fe9123f05dcbf339ae5c0b6d32fcc0685d5 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 7 May 2019 20:19:05 +0200 Subject: parisc: Drop LDCW barrier in CAS code when running UP When running an SMP kernel on a single-CPU machine, we can speed up the CAS code by replacing the LDCW sync barrier with NOP. Signed-off-by: Helge Deller --- arch/parisc/kernel/syscall.S | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index e54d5e4d3489..97ac707c6bff 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -641,7 +641,8 @@ cas_action: 2: stw %r24, 0(%r26) /* Free lock */ #ifdef CONFIG_SMP - LDCW 0(%sr2,%r20), %r1 /* Barrier */ +98: LDCW 0(%sr2,%r20), %r1 /* Barrier */ +99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif stw %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG @@ -658,7 +659,8 @@ cas_action: /* Error occurred on load or store */ /* Free lock */ #ifdef CONFIG_SMP - LDCW 0(%sr2,%r20), %r1 /* Barrier */ +98: LDCW 0(%sr2,%r20), %r1 /* Barrier */ +99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif stw %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG @@ -862,7 +864,8 @@ cas2_action: cas2_end: /* Free lock */ #ifdef CONFIG_SMP - LDCW 0(%sr2,%r20), %r1 /* Barrier */ +98: LDCW 0(%sr2,%r20), %r1 /* Barrier */ +99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif stw %r20, 0(%sr2,%r20) /* Enable interrupts */ @@ -875,7 +878,8 @@ cas2_end: /* Error occurred on load or store */ /* Free lock */ #ifdef CONFIG_SMP - LDCW 0(%sr2,%r20), %r1 /* Barrier */ +98: LDCW 0(%sr2,%r20), %r1 /* Barrier */ +99: ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP) #endif stw %r20, 0(%sr2,%r20) ssm PSW_SM_I, %r0 -- cgit v1.2.3-59-g8ed1b From 8d0e051cc75e2b1a7e2fd51fc56af332c9619618 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 17:00:01 +0200 Subject: parisc: Enable the ro_after_init feature This patch modifies the initial page mapping functions in the following way: During bootup the init, text and data pages will be mapped RWX and if supported, with huge pages. At final stage of the bootup, the kernel calls free_initmem() and then all pages will be remapped either R-X (for text and read-only data) or RW- (for data). The __init pages will be dropped. This reflects the behaviour of the x86 platform. Signed-off-by: Helge Deller --- arch/parisc/include/asm/cache.h | 3 -- arch/parisc/kernel/vmlinux.lds.S | 3 -- arch/parisc/mm/init.c | 67 +++++++++++++++++++++------------------- 3 files changed, 35 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index 4016fe1c65a9..73ca89a47f49 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -24,9 +24,6 @@ #define __read_mostly __attribute__((__section__(".data..read_mostly"))) -/* Read-only memory is marked before mark_rodata_ro() is called. 
*/ -#define __ro_after_init __read_mostly - void parisc_cache_init(void); /* initializes cache-flushing */ void disable_sr_hashing_asm(int); /* low level support for above */ void disable_sr_hashing(void); /* turns off space register hashing */ diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index a8be7a47fcc0..c3b1b9c24ede 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -18,9 +18,6 @@ *(.data..vm0.pgd) \ *(.data..vm0.pte) -/* No __ro_after_init data in the .rodata section - which will always be ro */ -#define RO_AFTER_INIT_DATA - #include /* needed for the processor specific cache alignment size */ diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3b0f9eab7f2c..b99bcbf1ecdb 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -345,16 +345,7 @@ static void __init setup_bootmem(void) memblock_dump_all(); } -static int __init parisc_text_address(unsigned long vaddr) -{ - static unsigned long head_ptr __initdata; - - if (!head_ptr) - head_ptr = PAGE_MASK & (unsigned long) - dereference_function_descriptor(&parisc_kernel_start); - - return core_kernel_text(vaddr) || vaddr == head_ptr; -} +static bool kernel_set_to_readonly; static void __init map_pages(unsigned long start_vaddr, unsigned long start_paddr, unsigned long size, @@ -372,10 +363,11 @@ static void __init map_pages(unsigned long start_vaddr, unsigned long vaddr; unsigned long ro_start; unsigned long ro_end; - unsigned long kernel_end; + unsigned long kernel_start, kernel_end; ro_start = __pa((unsigned long)_text); ro_end = __pa((unsigned long)&data_start); + kernel_start = __pa((unsigned long)&__init_begin); kernel_end = __pa((unsigned long)&_end); end_paddr = start_paddr + size; @@ -438,26 +430,30 @@ static void __init map_pages(unsigned long start_vaddr, pg_table = (pte_t *) __va(pg_table) + start_pte; for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++, pg_table++) { pte_t pte; - - if (force) - pte = __mk_pte(address, pgprot); - else if (parisc_text_address(vaddr)) { - pte = __mk_pte(address, PAGE_KERNEL_EXEC); - if (address >= ro_start && address < kernel_end) - pte = pte_mkhuge(pte); + pgprot_t prot; + bool huge = false; + + if (force) { + prot = pgprot; + } else if (address < kernel_start || address >= kernel_end) { + /* outside kernel memory */ + prot = PAGE_KERNEL; + } else if (!kernel_set_to_readonly) { + /* still initializing, allow writing to RO memory */ + prot = PAGE_KERNEL_RWX; + huge = true; + } else if (address >= ro_start) { + /* Code (ro) and Data areas */ + prot = (address < ro_end) ? + PAGE_KERNEL_EXEC : PAGE_KERNEL; + huge = true; + } else { + prot = PAGE_KERNEL; } - else -#if defined(CONFIG_PARISC_PAGE_SIZE_4KB) - if (address >= ro_start && address < ro_end) { - pte = __mk_pte(address, PAGE_KERNEL_EXEC); + + pte = __mk_pte(address, prot); + if (huge) pte = pte_mkhuge(pte); - } else -#endif - { - pte = __mk_pte(address, pgprot); - if (address >= ro_start && address < kernel_end) - pte = pte_mkhuge(pte); - } if (address >= end_paddr) break; @@ -493,6 +489,12 @@ void __ref free_initmem(void) { unsigned long init_begin = (unsigned long)__init_begin; unsigned long init_end = (unsigned long)__init_end; + unsigned long kernel_end = (unsigned long)&_end; + + /* Remap kernel text and data, but do not touch init section yet. */ + kernel_set_to_readonly = true; + map_pages(init_end, __pa(init_end), kernel_end - init_end, + PAGE_KERNEL, 0); /* The init text pages are marked R-X. 
We have to * flush the icache and mark them RW- @@ -509,7 +511,7 @@ void __ref free_initmem(void) PAGE_KERNEL, 1); /* force the kernel to see the new TLB entries */ - __flush_tlb_range(0, init_begin, init_end); + __flush_tlb_range(0, init_begin, kernel_end); /* finally dump all the instructions which were cached, since the * pages are no-longer executable */ @@ -527,8 +529,9 @@ void mark_rodata_ro(void) { /* rodata memory was already mapped with KERNEL_RO access rights by pagetable_init() and map_pages(). No need to do additional stuff here */ - printk (KERN_INFO "Write protecting the kernel read-only data: %luk\n", - (unsigned long)(__end_rodata - __start_rodata) >> 10); + unsigned long roai_size = __end_ro_after_init - __start_ro_after_init; + + pr_info("Write protected read-only-after-init data: %luk\n", roai_size >> 10); } #endif -- cgit v1.2.3-59-g8ed1b From 271c29a17fadd3eabbfe63f13bb49dfb6c9d872d Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:47:11 +0200 Subject: parisc: Use __ro_after_init in cache.c Signed-off-by: Helge Deller --- arch/parisc/kernel/cache.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 0338561968a4..a82b3eaa5398 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -29,9 +29,9 @@ #include #include -int split_tlb __read_mostly; -int dcache_stride __read_mostly; -int icache_stride __read_mostly; +int split_tlb __ro_after_init; +int dcache_stride __ro_after_init; +int icache_stride __ro_after_init; EXPORT_SYMBOL(dcache_stride); void flush_dcache_page_asm(unsigned long phys_addr, unsigned long vaddr); @@ -51,12 +51,12 @@ DEFINE_SPINLOCK(pa_tlb_flush_lock); DEFINE_SPINLOCK(pa_swapper_pg_lock); #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -int pa_serialize_tlb_flushes __read_mostly; +int pa_serialize_tlb_flushes __ro_after_init; #endif -struct pdc_cache_info cache_info __read_mostly; +struct pdc_cache_info cache_info __ro_after_init; #ifndef CONFIG_PA20 -static struct pdc_btlb_info btlb_info __read_mostly; +static struct pdc_btlb_info btlb_info __ro_after_init; #endif #ifdef CONFIG_SMP @@ -381,10 +381,10 @@ EXPORT_SYMBOL(flush_data_cache_local); EXPORT_SYMBOL(flush_kernel_icache_range_asm); #define FLUSH_THRESHOLD 0x80000 /* 0.5MB */ -static unsigned long parisc_cache_flush_threshold __read_mostly = FLUSH_THRESHOLD; +static unsigned long parisc_cache_flush_threshold __ro_after_init = FLUSH_THRESHOLD; #define FLUSH_TLB_THRESHOLD (16*1024) /* 16 KiB minimum TLB threshold */ -static unsigned long parisc_tlb_flush_threshold __read_mostly = FLUSH_TLB_THRESHOLD; +static unsigned long parisc_tlb_flush_threshold __ro_after_init = FLUSH_TLB_THRESHOLD; void __init parisc_setup_cache_timing(void) { -- cgit v1.2.3-59-g8ed1b From 9aa8848a751fec5f655f4409875baaee1ca3e3e8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:49:16 +0200 Subject: parisc: Use __ro_after_init in drivers.c Signed-off-by: Helge Deller --- arch/parisc/kernel/drivers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index 15e7b3be7b6b..00a181f1ecc6 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -41,7 +41,7 @@ #include /* See comments in include/asm-parisc/pci.h */ -const struct dma_map_ops *hppa_dma_ops __read_mostly; +const struct dma_map_ops *hppa_dma_ops __ro_after_init; 
EXPORT_SYMBOL(hppa_dma_ops); static struct device root = { -- cgit v1.2.3-59-g8ed1b From 1b69085d4fcfd1df67d86a553c522fa1cb7603b4 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:50:16 +0200 Subject: parisc: Use __ro_after_init in firmware.c Signed-off-by: Helge Deller --- arch/parisc/kernel/firmware.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 7a17551ea31e..f01e102bbfa2 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -87,7 +87,7 @@ extern unsigned long pdc_result2[NUM_PDC_RESULT]; /* Firmware needs to be initially set to narrow to determine the * actual firmware width. */ -int parisc_narrow_firmware __read_mostly = 1; +int parisc_narrow_firmware __ro_after_init = 1; #endif /* On most currently-supported platforms, IODC I/O calls are 32-bit calls -- cgit v1.2.3-59-g8ed1b From dc1b3c0d50eae30c4588f2146fe41b49584e50fb Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:50:46 +0200 Subject: parisc: Use __ro_after_init in head.S Signed-off-by: Helge Deller --- arch/parisc/kernel/head.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index d12de2a13753..951a339369dd 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -376,7 +376,7 @@ smp_slave_stext: ENDPROC(parisc_kernel_start) #ifndef CONFIG_64BIT - .section .data..read_mostly + .section .data..ro_after_init .align 4 .export $global$,data -- cgit v1.2.3-59-g8ed1b From 7c1952b4be68a40d2283b16d8236cb8d2af1272f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:51:44 +0200 Subject: parisc: Use __ro_after_init in inventory.c Signed-off-by: Helge Deller --- arch/parisc/kernel/inventory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/inventory.c b/arch/parisc/kernel/inventory.c index 6f2d611347a1..3f4a91c0b805 100644 --- a/arch/parisc/kernel/inventory.c +++ b/arch/parisc/kernel/inventory.c @@ -39,12 +39,12 @@ */ #undef DEBUG_PAT -int pdc_type __read_mostly = PDC_TYPE_ILLEGAL; +int pdc_type __ro_after_init = PDC_TYPE_ILLEGAL; /* cell number and location (PAT firmware only) */ -unsigned long parisc_cell_num __read_mostly; -unsigned long parisc_cell_loc __read_mostly; -unsigned long parisc_pat_pdc_cap __read_mostly; +unsigned long parisc_cell_num __ro_after_init; +unsigned long parisc_cell_loc __ro_after_init; +unsigned long parisc_pat_pdc_cap __ro_after_init; void __init setup_pdc(void) -- cgit v1.2.3-59-g8ed1b From 874b051923dd757adfeacc71ea2f7f0cfbf267c4 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:52:36 +0200 Subject: parisc: Use __ro_after_init in pci.c Signed-off-by: Helge Deller --- arch/parisc/kernel/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/pci.c b/arch/parisc/kernel/pci.c index ae684ac6efb6..bc41ca243cfe 100644 --- a/arch/parisc/kernel/pci.c +++ b/arch/parisc/kernel/pci.c @@ -45,14 +45,14 @@ * #define pci_post_reset_delay 50 */ -struct pci_port_ops *pci_port __read_mostly; -struct pci_bios_ops *pci_bios __read_mostly; +struct pci_port_ops *pci_port __ro_after_init; +struct pci_bios_ops *pci_bios __ro_after_init; -static int pci_hba_count __read_mostly; +static int pci_hba_count __ro_after_init; /* parisc_pci_hba used by pci_port->in/out() ops to lookup bus data. 
*/ #define PCI_HBA_MAX 32 -static struct pci_hba_data *parisc_pci_hba[PCI_HBA_MAX] __read_mostly; +static struct pci_hba_data *parisc_pci_hba[PCI_HBA_MAX] __ro_after_init; /******************************************************************** -- cgit v1.2.3-59-g8ed1b From 67266fd48f0f6cb42838aeac2ef55417640ad1b5 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:53:46 +0200 Subject: parisc: Use __ro_after_init in perf_images.h Signed-off-by: Helge Deller --- arch/parisc/kernel/perf_images.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/perf_images.h b/arch/parisc/kernel/perf_images.h index 7fef9644df47..c108fee989d9 100644 --- a/arch/parisc/kernel/perf_images.h +++ b/arch/parisc/kernel/perf_images.h @@ -25,7 +25,7 @@ #define PCXU_IMAGE_SIZE 584 -static uint32_t onyx_images[][PCXU_IMAGE_SIZE/sizeof(uint32_t)] __read_mostly = { +static uint32_t onyx_images[][PCXU_IMAGE_SIZE/sizeof(uint32_t)] __ro_after_init = { /* * CPI: * @@ -2093,7 +2093,7 @@ static uint32_t onyx_images[][PCXU_IMAGE_SIZE/sizeof(uint32_t)] __read_mostly = }; #define PCXW_IMAGE_SIZE 576 -static uint32_t cuda_images[][PCXW_IMAGE_SIZE/sizeof(uint32_t)] __read_mostly = { +static uint32_t cuda_images[][PCXW_IMAGE_SIZE/sizeof(uint32_t)] __ro_after_init = { /* * CPI: FROM CPI.IDF (Image 0) * -- cgit v1.2.3-59-g8ed1b From 7e4c65bf0637c089d8981ac7e99081a57ffbcdbc Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:54:31 +0200 Subject: parisc: Use __ro_after_init in process.c Signed-off-by: Helge Deller --- arch/parisc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 97c206734e24..89e4f4497ffb 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -192,7 +192,7 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r) * QEMU idle the host too. 
*/ -int running_on_qemu __read_mostly; +int running_on_qemu __ro_after_init; EXPORT_SYMBOL(running_on_qemu); void __cpuidle arch_cpu_idle_dead(void) -- cgit v1.2.3-59-g8ed1b From d98883690b7bc5faab98795dff694549e4c60556 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:55:08 +0200 Subject: parisc: Use __ro_after_init in processor.c Signed-off-by: Helge Deller --- arch/parisc/kernel/processor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index e0a81dedc366..e715871cd4ac 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -43,10 +43,10 @@ #include /* for struct irq_region */ #include -struct system_cpuinfo_parisc boot_cpu_data __read_mostly; +struct system_cpuinfo_parisc boot_cpu_data __ro_after_init; EXPORT_SYMBOL(boot_cpu_data); #ifdef CONFIG_PA8X00 -int _parisc_requires_coherency __read_mostly; +int _parisc_requires_coherency __ro_after_init; EXPORT_SYMBOL(_parisc_requires_coherency); #endif -- cgit v1.2.3-59-g8ed1b From 34589df6338afc75208cd3f9b612c1ae7738bbd0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:55:31 +0200 Subject: parisc: Use __ro_after_init in time.c Signed-off-by: Helge Deller --- arch/parisc/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index a1e772f909cb..04508158815c 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -40,7 +40,7 @@ #include -static unsigned long clocktick __read_mostly; /* timer cycles per tick */ +static unsigned long clocktick __ro_after_init; /* timer cycles per tick */ /* * We keep time on PA-RISC Linux by using the Interval Timer which is -- cgit v1.2.3-59-g8ed1b From 47293774c49c174c9863a60a757b825299ddf649 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:55:51 +0200 Subject: parisc: Use __ro_after_init in unwind.c Signed-off-by: Helge Deller --- arch/parisc/kernel/unwind.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 2d14f17838d2..87ae476d1c4f 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -40,7 +40,7 @@ static DEFINE_SPINLOCK(unwind_lock); * we can call unwind_init as early in the bootup process as * possible (before the slab allocator is initialized) */ -static struct unwind_table kernel_unwind_table __read_mostly; +static struct unwind_table kernel_unwind_table __ro_after_init; static LIST_HEAD(unwind_tables); static inline const struct unwind_table_entry * -- cgit v1.2.3-59-g8ed1b From 4e617c86ba9be5211643e53f5f69d1195c956d68 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 10 May 2019 20:56:16 +0200 Subject: parisc: Use __ro_after_init in init.c Signed-off-by: Helge Deller --- arch/parisc/mm/init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index b99bcbf1ecdb..5a68185295a9 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -66,7 +66,7 @@ static struct resource pdcdata_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM, }; -static struct resource sysram_resources[MAX_PHYSMEM_RANGES] __read_mostly; +static struct resource sysram_resources[MAX_PHYSMEM_RANGES] __ro_after_init; /* The following array is initialized from the firmware specific * information 
retrieved in kernel/inventory.c. @@ -557,11 +557,11 @@ void mark_rodata_ro(void) #define SET_MAP_OFFSET(x) ((void *)(((unsigned long)(x) + VM_MAP_OFFSET) \ & ~(VM_MAP_OFFSET-1))) -void *parisc_vmalloc_start __read_mostly; +void *parisc_vmalloc_start __ro_after_init; EXPORT_SYMBOL(parisc_vmalloc_start); #ifdef CONFIG_PA11 -unsigned long pcxl_dma_start __read_mostly; +unsigned long pcxl_dma_start __ro_after_init; #endif void __init mem_init(void) @@ -635,7 +635,7 @@ void __init mem_init(void) #endif } -unsigned long *empty_zero_page __read_mostly; +unsigned long *empty_zero_page __ro_after_init; EXPORT_SYMBOL(empty_zero_page); /* -- cgit v1.2.3-59-g8ed1b

From 1be91314532c4a2779fa17665b6d0368972b570b Mon Sep 17 00:00:00 2001 From: Petr Štetiar Date: Fri, 10 May 2019 11:35:16 +0200 Subject: powerpc: tsi108: fix similar warning reported by kbuild test robot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

This patch fixes the following (similar) warning reported by the kbuild test robot:

In function ‘memcpy’, inlined from ‘smsc75xx_init_mac_address’ at drivers/net/usb/smsc75xx.c:778:3, inlined from ‘smsc75xx_bind’ at drivers/net/usb/smsc75xx.c:1501:2: ./include/linux/string.h:355:9: warning: argument 2 null where non-null expected [-Wnonnull] return __builtin_memcpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/net/usb/smsc75xx.c: In function ‘smsc75xx_bind’: ./include/linux/string.h:355:9: note: in a call to built-in function ‘__builtin_memcpy’

I've replaced the offending memcpy with ether_addr_copy, because I'm 100% sure that of_get_mac_address can't return NULL: it returns either a valid pointer or an ERR_PTR-encoded value, nothing else. I'm hesitant to just change the IS_ERR check into IS_ERR_OR_NULL, as this would also make the warning disappear, but it would be confusing to check for an impossible return value just to make the compiler happy.

I'm now changing all occurrences of memcpy to ether_addr_copy after the of_get_mac_address call, as it's very likely that we're going to get similar reports from the kbuild test robot in the future.

Fixes: ea168cdf1299 ("powerpc: tsi108: support of_get_mac_address new ERR_PTR error") Reported-by: kbuild test robot Signed-off-by: Petr Štetiar Signed-off-by: David S. Miller --- arch/powerpc/sysdev/tsi108_dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch')

diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index c92dcac85231..026619c9a8cb 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -18,6 +18,7 @@ #include #include #include +#include <linux/etherdevice.h> #include #include #include @@ -106,7 +107,7 @@ static int __init tsi108_eth_of_init(void) mac_addr = of_get_mac_address(np); if (!IS_ERR(mac_addr)) - memcpy(tsi_eth_data.mac_addr, mac_addr, 6); + ether_addr_copy(tsi_eth_data.mac_addr, mac_addr); ph = of_get_property(np, "mdio-handle", NULL); mdio = of_find_node_by_phandle(*ph); -- cgit v1.2.3-59-g8ed1b

From e602b26ce4758e0eb97252363d32bd294afea530 Mon Sep 17 00:00:00 2001 From: Sabyasachi Gupta Date: Mon, 13 May 2019 17:15:46 -0700 Subject: arch/sh/boards/mach-dreamcast/irq.c: Remove duplicate header

Remove linux/irq.h, which is included more than once.
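The tsi108 conversion above relies on two properties: of_get_mac_address() returns either a valid pointer or an ERR_PTR-encoded error, never NULL, and ether_addr_copy() always copies exactly ETH_ALEN bytes. A minimal sketch of that pattern in a consumer, with a hypothetical helper name (example_set_mac is illustrative, not taken from any patch here):

	#include <linux/err.h>
	#include <linux/etherdevice.h>
	#include <linux/of.h>
	#include <linux/of_net.h>

	/* Hypothetical helper: set a MAC address from the device tree. */
	static void example_set_mac(struct device_node *np, u8 *dest)
	{
		const void *mac = of_get_mac_address(np);

		if (!IS_ERR(mac))
			ether_addr_copy(dest, mac);	/* copies ETH_ALEN == 6 bytes */
		else
			eth_random_addr(dest);		/* no usable address in the DT */
	}

ether_addr_copy() also documents intent better than a bare memcpy() with a magic length of 6, which is why such call sites are being converted wholesale.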
Link: http://lkml.kernel.org/r/5c8682ef.1c69fb81.5a1ea.2e7f@mx.google.com Signed-off-by: Sabyasachi Gupta Acked-by: Souptick Joarder Cc: Yoshinori Sato Cc: Rich Felker Cc: Mukesh Ojha Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/boards/mach-dreamcast/irq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch')

diff --git a/arch/sh/boards/mach-dreamcast/irq.c b/arch/sh/boards/mach-dreamcast/irq.c index a929f764ae04..cc06e4cdb4cd 100644 --- a/arch/sh/boards/mach-dreamcast/irq.c +++ b/arch/sh/boards/mach-dreamcast/irq.c @@ -10,7 +10,6 @@ */ #include #include -#include <linux/irq.h> #include #include #include -- cgit v1.2.3-59-g8ed1b

From 932f4a630a695212bdc7379b05f9bd0dafc5d968 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:03 -0700 Subject: mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM

Patch series "Add FOLL_LONGTERM to GUP fast and use it".

HFI1, qib, and mthca use get_user_pages_fast() due to its performance advantages. These pages can be held for a significant time. But get_user_pages_fast() does not protect against mapping FS DAX pages. Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast(), which retains the performance while also adding the FS DAX checks. XDP has also shown interest in using this functionality.[1] In addition, we change get_user_pages() to use the new FOLL_LONGTERM flag and remove the specialized get_user_pages_longterm() call. [1] https://lkml.org/lkml/2019/3/19/939

"longterm" is a relative thing and at this point is probably a misnomer. This is really flagging a pin which is going to be given to hardware and can't move. I've thought of a couple of alternative names but I think we have to settle on whether we are going to use FL_LAYOUT or something else to solve the "longterm" problem. Then I think we can change the flag to a better name.

Secondly, it depends on how often you are registering memory. I have spoken with some RDMA users who consider MR in the performance path... for the overall application performance. I don't have the numbers as the tests for HFI1 were done a long time ago. But there was a significant advantage. Some of which is probably due to the fact that you don't have to hold mmap_sem.

Finally, architecturally I think it would be good for everyone to use *_fast. There are patches submitted to the RDMA list which would allow the use of *_fast (they rework the use of mmap_sem) and as soon as they are accepted I'll submit a patch to convert the RDMA core as well. Also to this point others are looking to use *_fast.

As an aside, Jason pointed out in my previous submission that *_fast and *_unlocked look very much the same. I agree and I think further cleanup will be coming. But I'm focused on getting the final solution for DAX at the moment.

This patch (of 7):

This patch starts a series which aims to support FOLL_LONGTERM in get_user_pages_fast(). Some callers would like to do a longterm (user controlled) pin of pages with the fast variant of GUP for performance purposes. Rather than have a separate get_user_pages_longterm() call, introduce FOLL_LONGTERM and change the longterm callers to use it. This patch does not change any functionality. In the short term, "longterm" or user controlled pins are unsafe for filesystems, and FS DAX in particular has been blocked. However, callers of get_user_pages_fast() were not "protected". FOLL_LONGTERM can _only_ be supported with get_user_pages[_fast]() as it requires vmas to determine if DAX is in use.
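The call-site conversion itself is mechanical. A minimal sketch with a hypothetical helper name, assuming the contemporary API where callers take mmap_sem themselves (the real conversions in umem.c, qib and io_uring below have the same shape):

	#include <linux/mm.h>
	#include <linux/sched.h>

	/* Hypothetical driver helper: pin user pages for a long-lived mapping. */
	static long example_longterm_pin(unsigned long uaddr, unsigned long npages,
					 struct page **pages)
	{
		long pinned;

		down_read(&current->mm->mmap_sem);
		/* was: get_user_pages_longterm(uaddr, npages, FOLL_WRITE, pages, NULL); */
		pinned = get_user_pages(uaddr, npages, FOLL_WRITE | FOLL_LONGTERM,
					pages, NULL);
		up_read(&current->mm->mmap_sem);

		return pinned;	/* number of pages pinned, or -errno */
	}

A later patch in the series extends the same flag to get_user_pages_fast(); that is also why the *_locked/*_unlocked variants below reject FOLL_LONGTERM with -EINVAL for now.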
NOTE: In merging with the CMA changes we opt to change the get_user_pages() call in check_and_migrate_cma_pages() to a call of __get_user_pages_locked() on the newly migrated pages. This makes the code read better in that we are calling __get_user_pages_locked() on the pages before and after a potential migration. As a side effect some of the interfaces are cleaned up, but this is not the primary purpose of the series.

In review[1] it was asked:

> This I don't get - if you do lock down long term mappings performance
> of the actual get_user_pages call shouldn't matter to start with.
>
> What do I miss?

A couple of points. First, "longterm" is a relative thing and at this point is probably a misnomer. This is really flagging a pin which is going to be given to hardware and can't move. I've thought of a couple of alternative names but I think we have to settle on whether we are going to use FL_LAYOUT or something else to solve the "longterm" problem. Then I think we can change the flag to a better name.

Second, it depends on how often you are registering memory. I have spoken with some RDMA users who consider MR in the performance path... for the overall application performance. I don't have the numbers as the tests for HFI1 were done a long time ago. But there was a significant advantage. Some of which is probably due to the fact that you don't have to hold mmap_sem.

Finally, architecturally I think it would be good for everyone to use *_fast. There are patches submitted to the RDMA list which would allow the use of *_fast (they rework the use of mmap_sem) and as soon as they are accepted I'll submit a patch to convert the RDMA core as well. Also to this point others are looking to use *_fast.

As an aside, Jason pointed out in my previous submission that *_fast and *_unlocked look very much the same. I agree and I think further cleanup will be coming. But I'm focused on getting the final solution for DAX at the moment.

[1] https://lore.kernel.org/lkml/20190220180255.GA12020@iweiny-DESK2.sc.intel.com/T/#md6abad2569f3bf6c1f03686c8097ab6563e94965

[ira.weiny@intel.com: v3] Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-2-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Michal Hocko Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Jason Gunthorpe Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S.
Miller" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rich Felker Cc: Yoshinori Sato Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Ralf Baechle Cc: James Hogan Cc: Dan Williams Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/iommu_api.c | 5 +- drivers/infiniband/core/umem.c | 5 +- drivers/infiniband/hw/qib/qib_user_pages.c | 8 +- drivers/infiniband/hw/usnic/usnic_uiom.c | 9 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 6 +- drivers/vfio/vfio_iommu_type1.c | 3 +- fs/io_uring.c | 5 +- include/linux/mm.h | 41 +++++-- mm/gup.c | 190 ++++++++++++++++++----------- mm/gup_benchmark.c | 5 +- net/xdp/xdp_umem.c | 4 +- 11 files changed, 173 insertions(+), 108 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 8330f135294f..5c521f3924a5 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -141,8 +141,9 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, for (entry = 0; entry < entries; entry += chunk) { unsigned long n = min(entries - entry, chunk); - ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, - FOLL_WRITE, mem->hpages + entry, NULL); + ret = get_user_pages(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE | FOLL_LONGTERM, + mem->hpages + entry, NULL); if (ret == n) { pinned += n; continue; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 0a23048db523..e7ea819fcb11 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -295,10 +295,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, while (npages) { down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(cur_base, + ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, NULL); + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) { up_read(&mm->mmap_sem); goto umem_release; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 123ca8f64f75..f712fb7fa82f 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, down_read(¤t->mm->mmap_sem); for (got = 0; got < num_pages; got += ret) { - ret = get_user_pages_longterm(start_page + got * PAGE_SIZE, - num_pages - got, - FOLL_WRITE | FOLL_FORCE, - p + got, NULL); + ret = get_user_pages(start_page + got * PAGE_SIZE, + num_pages - got, + FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE, + p + got, NULL); if (ret < 0) { up_read(¤t->mm->mmap_sem); goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index da35d6fdfc5e..e312f522a66d 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = 0; while (npages) { - ret = get_user_pages_longterm(cur_base, - min_t(unsigned long, npages, - PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) goto out; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 
08929c087e27..870a2a526e0b 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); - err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, - flags, dma->pages, NULL); + err = get_user_pages(data & PAGE_MASK, dma->nr_pages, + flags | FOLL_LONGTERM, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; - dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err, + dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages); return err < 0 ? err : -EINVAL; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 3be1db3501cc..3ddc375e7063 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -358,7 +358,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); if (mm == current->mm) { - ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas); + ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, + vmas); } else { ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, vmas, NULL); diff --git a/fs/io_uring.c b/fs/io_uring.c index 48ea3977012a..fdc18321d70c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2697,8 +2697,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, - pages, vmas); + pret = get_user_pages(ubuf, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); if (pret == nr_pages) { /* don't support file backed memory */ for (j = 0; j < nr_pages; j++) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 083d7b4863ed..8bc677ce8f01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1505,19 +1505,6 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); -#else -static inline long get_user_pages_longterm(unsigned long start, - unsigned long nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); -} -#endif /* CONFIG_FS_DAX */ - int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); @@ -2583,6 +2570,34 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ + +/* + * NOTE on FOLL_LONGTERM: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is contrasted with + * iov_iter_get_pages() where usages which are transient. 
+ * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. + * Currently only get_user_pages() and get_user_pages_fast() support this flag + * and calls to get_user_pages_[un]locked are specifically not allowed. This + * is due to an incompatibility with the FS DAX check and + * FAULT_FLAG_ALLOW_RETRY + * + * In the CMA case: longterm pins in a CMA region would unnecessarily fragment + * that region. And so CMA attempts to migrate the page before pinning when + * FOLL_LONGTERM is specified. + */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { diff --git a/mm/gup.c b/mm/gup.c index 91819b8ad9cc..25381102e21e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1018,6 +1018,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); @@ -1046,6 +1055,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + down_read(&mm->mmap_sem); ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); @@ -1116,32 +1134,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); -/* - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. 
- */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages); - #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) - -#ifdef CONFIG_FS_DAX static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { long i; @@ -1160,12 +1168,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } return false; } -#else -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - return false; -} -#endif #ifdef CONFIG_CMA static struct page *new_non_cma_page(struct page *page, unsigned long private) @@ -1219,10 +1221,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) return __alloc_pages_node(nid, gfp_mask, 0); } -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + unsigned int gup_flags) { long i; bool drain_allow = true; @@ -1278,10 +1283,14 @@ check_again: putback_movable_pages(&cma_page_list); } /* - * We did migrate all the pages, Try to get the page references again - * migrating any new CMA pages which we failed to isolate earlier. + * We did migrate all the pages, Try to get the page references + * again migrating any new CMA pages which we failed to isolate + * earlier. */ - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if ((nr_pages > 0) && migrate_allow) { drain_allow = true; goto check_again; @@ -1291,66 +1300,101 @@ check_again: return nr_pages; } #else -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, - struct page **pages, - struct vm_area_struct **vmas) +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } #endif /* - * This is the same as get_user_pages() in that it assumes we are - * operating on the current task's mm, but it goes further to validate - * that the vmas associated with the address range are suitable for - * longterm elevated page reference counts. For example, filesystem-dax - * mappings are subject to the lifetime enforced by the filesystem and - * we need guarantees that longterm users like RDMA and V4L2 only - * establish mappings that have a kernel enforced revocation mechanism. - * - * "longterm" == userspace controlled elevated page count lifetime. - * Contrast this to iov_iter_get_pages() usages which are transient. + * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which + * allows us to process the FOLL_LONGTERM flag. 
*/ -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) +static long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { - struct vm_area_struct **vmas = vmas_arg; - unsigned long flags; + struct vm_area_struct **vmas_tmp = vmas; + unsigned long flags = 0; long rc, i; - if (!pages) - return -EINVAL; - - if (!vmas) { - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - return -ENOMEM; + if (gup_flags & FOLL_LONGTERM) { + if (!pages) + return -EINVAL; + + if (!vmas_tmp) { + vmas_tmp = kcalloc(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas_tmp) + return -ENOMEM; + } + flags = memalloc_nocma_save(); } - flags = memalloc_nocma_save(); - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - memalloc_nocma_restore(flags); - if (rc < 0) - goto out; + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + vmas_tmp, NULL, gup_flags); - if (check_dax_vmas(vmas, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; + if (gup_flags & FOLL_LONGTERM) { + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + vmas_tmp, gup_flags); } - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: - if (vmas != vmas_arg) - kfree(vmas); + if (vmas_tmp != vmas) + kfree(vmas_tmp); return rc; } -EXPORT_SYMBOL(get_user_pages_longterm); -#endif /* CONFIG_FS_DAX */ +#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, flags); +} +#endif /* CONFIG_FS_DAX || CONFIG_CMA */ + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); /** * populate_vma_page_range() - populate a range of pages in the vma. 
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6c0279e70cc4..7dd602d7f8db 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, pages + i); break; case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages_longterm(addr, nr, gup->flags & 1, - pages + i, NULL); + nr = get_user_pages(addr, nr, + (gup->flags & 1) | FOLL_LONGTERM, + pages + i, NULL); break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 989e52386c35..2b18223e7eb8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -253,8 +253,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(¤t->mm->mmap_sem); - npgs = get_user_pages_longterm(umem->address, umem->npgs, - gup_flags, &umem->pgs[0], NULL); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(¤t->mm->mmap_sem); if (npgs != umem->npgs) { -- cgit v1.2.3-59-g8ed1b From 73b0140bf0fe9df90fb267c00673c4b9bf285430 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:11 -0700 Subject: mm/gup: change GUP fast to use flags rather than a write 'bool' To facilitate additional options to get_user_pages_fast() change the singular write parameter to be gup_flags. This patch does not change any functionality. New functionality will follow in subsequent patches. Some of the get_user_pages_fast() call sites were unchanged because they already passed FOLL_WRITE or 0 for the write parameter. NOTE: It was suggested to change the ordering of the get_user_pages_fast() arguments to ensure that callers were converted. This breaks the current GUP call site convention of having the returned pages be the final parameter. So the suggestion was rejected. Link: http://lkml.kernel.org/r/20190328084422.29911-4-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Mike Marshall Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 11 ++++++----- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/e500_mmu.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/sh/mm/gup.c | 11 ++++++----- arch/sparc/mm/gup.c | 9 +++++---- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 2 +- drivers/fpga/dfl-afu-dma-region.c | 2 +- drivers/gpu/drm/via/via_dmablit.c | 3 ++- drivers/infiniband/hw/hfi1/user_pages.c | 3 ++- drivers/misc/genwqe/card_utils.c | 2 +- drivers/misc/vmw_vmci/vmci_host.c | 2 +- drivers/misc/vmw_vmci/vmci_queue_pair.c | 6 ++++-- drivers/platform/goldfish/goldfish_pipe.c | 3 ++- drivers/rapidio/devices/rio_mport_cdev.c | 4 +++- drivers/sbus/char/oradax.c | 2 +- drivers/scsi/st.c | 3 ++- drivers/staging/gasket/gasket_page_table.c | 4 ++-- drivers/tee/tee_shm.c | 2 +- drivers/vfio/vfio_iommu_spapr_tce.c | 3 ++- drivers/vhost/vhost.c | 2 +- drivers/video/fbdev/pvr2fb.c | 2 +- drivers/virt/fsl_hypervisor.c | 2 +- drivers/xen/gntdev.c | 2 +- fs/orangefs/orangefs-bufmap.c | 2 +- include/linux/mm.h | 4 ++-- kernel/futex.c | 2 +- lib/iov_iter.c | 7 +++++-- mm/gup.c | 10 +++++----- mm/util.c | 8 ++++---- net/ceph/pagevec.c | 2 +- net/rds/info.c | 2 +- net/rds/rdma.c | 3 ++- 34 files changed, 73 insertions(+), 57 deletions(-) (limited to 'arch') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 0d14e0d8eacf..4c2b4483683c 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -289,7 +290,7 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - pages, write ? 
FOLL_WRITE : 0); + pages, gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index be7bc070eae5..ab3d484c5e2e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -600,7 +600,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* If writing != 0, then the HPTE must allow writing, if we get here */ write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, writing, pages); + npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); if (npages < 1) { /* Check if it's an I/O mapping */ down_read(¤t->mm->mmap_sem); @@ -1193,7 +1193,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) goto err; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, 1, pages); + npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); if (npages < 1) goto err; page = pages[0]; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 24296f4cadc6..e0af53fd78c5 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, if (!pages) return -ENOMEM; - ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages); if (ret < 0) goto free_pages; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 37503ae62486..1fd706f6206c 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2376,7 +2376,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) ret = -EFAULT; goto out; } - ret = get_user_pages_fast(map->addr, 1, 1, &map->page); + ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); if (ret < 0) goto out; BUG_ON(ret != 1); diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 3e27f6d1f1ec..277c882f7489 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -261,7 +262,7 @@ slow_irqon: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? 
FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index aee6dba83d0e..1e770a517d4a 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); @@ -324,7 +325,7 @@ slow: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bdca39829bc..08715034e315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t *table; struct page *page; - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); /* Check if the user is doing something meaningless. */ if (unlikely(npages != 1)) return -EFAULT; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 406b558abfef..6b92eaf4a3b1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1805,7 +1805,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c index e18a786fc943..c438722bf4e1 100644 --- a/drivers/fpga/dfl-afu-dma-region.c +++ b/drivers/fpga/dfl-afu-dma-region.c @@ -102,7 +102,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata, goto unlock_vm; } - pinned = get_user_pages_fast(region->user_addr, npages, 1, + pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE, region->pages); if (pinned < 0) { ret = pinned; diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index 8bf3a7c23ed3..062067438f1d 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -243,7 +243,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) if (NULL == vsg->pages) return -ENOMEM; ret = get_user_pages_fast((unsigned long)xfer->mem_addr, - vsg->num_pages, vsg->direction == DMA_FROM_DEVICE, + vsg->num_pages, + vsg->direction == DMA_FROM_DEVICE ? 
FOLL_WRITE : 0, vsg->pages); if (ret != vsg->num_pages) { if (ret < 0) diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 24b592c6522e..78ccacaf97d0 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -105,7 +105,8 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np { int ret; - ret = get_user_pages_fast(vaddr, npages, writable, pages); + ret = get_user_pages_fast(vaddr, npages, writable ? FOLL_WRITE : 0, + pages); if (ret < 0) return ret; diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index 25265fd0fd6e..89cff9d1012b 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr, /* pin user pages in memory */ rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */ m->nr_pages, - m->write, /* readable/writable */ + m->write ? FOLL_WRITE : 0, /* readable/writable */ m->page_list); /* ptrs to pages */ if (rc < 0) goto fail_get_user_pages; diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c index 997f92543dd4..422d08da3244 100644 --- a/drivers/misc/vmw_vmci/vmci_host.c +++ b/drivers/misc/vmw_vmci/vmci_host.c @@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context, /* * Lock physical page backing a given user VA. */ - retval = get_user_pages_fast(uva, 1, 1, &context->notify_page); + retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page); if (retval != 1) { context->notify_page = NULL; return VMCI_ERROR_GENERIC; diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index f5f1aac9d163..1174735f003d 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -659,7 +659,8 @@ static int qp_host_get_user_memory(u64 produce_uva, int err = VMCI_SUCCESS; retval = get_user_pages_fast((uintptr_t) produce_uva, - produce_q->kernel_if->num_pages, 1, + produce_q->kernel_if->num_pages, + FOLL_WRITE, produce_q->kernel_if->u.h.header_page); if (retval < (int)produce_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(produce) failed (retval=%d)", @@ -671,7 +672,8 @@ static int qp_host_get_user_memory(u64 produce_uva, } retval = get_user_pages_fast((uintptr_t) consume_uva, - consume_q->kernel_if->num_pages, 1, + consume_q->kernel_if->num_pages, + FOLL_WRITE, consume_q->kernel_if->u.h.header_page); if (retval < (int)consume_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(consume) failed (retval=%d)", diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 321bc673c417..cef0133aa47a 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page, *iter_last_page_size = last_page_size; } - ret = get_user_pages_fast(first_page, requested_pages, !is_write, + ret = get_user_pages_fast(first_page, requested_pages, + !is_write ? 
FOLL_WRITE : 0, pages); if (ret <= 0) return -EFAULT; diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 1e1f42e210a0..4a4a75fa26d5 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, pinned = get_user_pages_fast( (unsigned long)xfer->loc_addr & PAGE_MASK, - nr_pages, dir == DMA_FROM_DEVICE, page_list); + nr_pages, + dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0, + page_list); if (pinned != nr_pages) { if (pinned < 0) { diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index acd9ba40eabe..8090dc9a1514 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p) dax_dbg("uva %p", va); - ret = get_user_pages_fast((unsigned long)va, 1, 1, p); + ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p); if (ret == 1) { dax_dbg("locked page %p, for VA %p", *p, va); return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 19c022e66d63..3c6a18ad9a87 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4922,7 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp, /* Try to fault in all of the necessary pages */ /* rw==READ means read from drive, write into memory area */ - res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages); + res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0, + pages); /* Errors and no page mapped should return here */ if (res < nr_pages) diff --git a/drivers/staging/gasket/gasket_page_table.c b/drivers/staging/gasket/gasket_page_table.c index 600928f63577..d35c4fb19e28 100644 --- a/drivers/staging/gasket/gasket_page_table.c +++ b/drivers/staging/gasket/gasket_page_table.c @@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl, ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr + off + i * PAGE_SIZE; } else { - ret = get_user_pages_fast(page_addr - offset, 1, 1, - &page); + ret = get_user_pages_fast(page_addr - offset, 1, + FOLL_WRITE, &page); if (ret <= 0) { dev_err(pg_tbl->device, diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 0b9ab1d0dd45..49fd7312e2aa 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, goto err; } - rc = get_user_pages_fast(start, num_pages, 1, shm->pages); + rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages); if (rc > 0) shm->num_pages = rc; if (rc != num_pages) { diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 6b64e45a5269..40ddc0c5f677 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -532,7 +532,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) enum dma_data_direction direction = iommu_tce_direction(tce); if (get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, &page) != 1) + direction != DMA_TO_DEVICE ? 
FOLL_WRITE : 0, + &page) != 1) return -EFAULT; *hpa = __pa((unsigned long) page_address(page)); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 351af88231ad..1e3ed41ae1f3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1704,7 +1704,7 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, 1, &page); + r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index dfed532ed606..4e4d6a0df978 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c @@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, if (!pages) return -ENOMEM; - ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages); + ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages); if (ret < nr_pages) { nr_pages = ret; ret = -EINVAL; diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 8ba726e600e9..6446bcab4185 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) /* Get the physical addresses of the source buffer */ num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset, - num_pages, param.source != -1, pages); + num_pages, param.source != -1 ? FOLL_WRITE : 0, pages); if (num_pinned != num_pages) { /* get_user_pages() failed */ diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 7cf9c51318aa..02bc815982d4 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, unsigned long xen_pfn; int ret; - ret = get_user_pages_fast(addr, 1, writeable, &page); + ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page); if (ret < 0) return ret; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index d4811f981608..2bb916d68576 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, /* map the pages */ ret = get_user_pages_fast((unsigned long)user_desc->ptr, - bufmap->page_count, 1, bufmap->page_array); + bufmap->page_count, FOLL_WRITE, bufmap->page_array); if (ret < 0) return ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bc677ce8f01..c3c73b3c9adc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1505,8 +1505,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); /* Container for pinned pfns / pages */ struct frame_vector { diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. 
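Every call-site change in this patch follows one mechanical rule: a constant write of 1 becomes FOLL_WRITE, a constant 0 stays 0, and a runtime bool becomes a ternary. A before/after sketch with a hypothetical wrapper name:

	#include <linux/mm.h>
	#include <linux/types.h>

	/* Hypothetical wrapper showing the write-bool to gup_flags conversion. */
	static int example_pin_fast(unsigned long uaddr, int nr, bool writable,
				    struct page **pages)
	{
		/* was: return get_user_pages_fast(uaddr, nr, writable, pages); */
		return get_user_pages_fast(uaddr, nr, writable ? FOLL_WRITE : 0,
					   pages);
	}

The __weak generic get_user_pages_fast() in mm/util.c then forwards gup_flags straight to get_user_pages_unlocked(), as the mm/util.c hunk further down shows.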
diff --git a/lib/iov_iter.c b/lib/iov_iter.c index b396d328a764..f74fa832f3aa 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1293,7 +1293,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, + pages); if (unlikely(res < 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; @@ -1374,7 +1376,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); if (unlikely(res < 0)) { kvfree(p); return res; diff --git a/mm/gup.c b/mm/gup.c index 113c18a98cf5..3dde6a8da670 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2062,7 +2062,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -2074,8 +2074,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; @@ -2093,7 +2093,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); - gup_pgd_range(addr, end, write ? FOLL_WRITE : 0, pages, &nr); + gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); ret = nr; } @@ -2104,7 +2104,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pages += nr; ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/util.c b/mm/util.c index 43a2984bccaa..05a464929b3e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) + int nr_pages, unsigned int gup_flags, + struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, pages, - write ? 
FOLL_WRITE : 0); + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index d3736f5bffec..74cafc0142ea 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, pages + got); + num_pages - got, write_page ? FOLL_WRITE : 0, pages + got); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/rds/info.c b/net/rds/info.c index e367a97a18c8..03f6fd56d237 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, 1, pages); + ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 182ab8430594..b340ed4fc43a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, { int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, + pages); if (ret >= 0 && ret < nr_pages) { while (ret--) -- cgit v1.2.3-59-g8ed1b From d8ae8a3765bfa1f9bf977e2496fcc9cf64fbfabd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:30 -0700 Subject: initramfs: move the legacy keepinitrd parameter to core code No need to handle the freeing disable in arch code when we already have a core hook (and a different name for the option) for it. Link: http://lkml.kernel.org/r/20190213174621.29297-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Catalin Marinas [arm64] Acked-by: Mike Rapoport Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/arm/Kconfig | 1 + arch/arm/mm/init.c | 25 ++++++------------------- arch/arm64/Kconfig | 1 + arch/arm64/mm/init.c | 17 ++--------------- arch/unicore32/Kconfig | 1 + arch/unicore32/mm/init.c | 14 +------------- init/initramfs.c | 9 +++++++++ 8 files changed, 28 insertions(+), 47 deletions(-) (limited to 'arch') diff --git a/arch/Kconfig b/arch/Kconfig index 5e43fcbad4ca..f11f0698b148 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -245,6 +245,13 @@ config ARCH_HAS_FORTIFY_SOURCE An architecture should select this when it can successfully build and run with CONFIG_FORTIFY_SOURCE. 
+# +# Select if the arch provides a historic keepinit alias for the retain_initrd +# command line option +# +config ARCH_HAS_KEEPINITRD + bool + # Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h config ARCH_HAS_SET_MEMORY bool diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index dc9855c4a3b4..a11dfcc2a130 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -9,6 +9,7 @@ config ARM select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_KEEPINITRD select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index c2daabbe0af0..68dcd5f8d7c6 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -695,27 +695,14 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd; - void free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) { - if (start == initrd_start) - start = round_down(start, PAGE_SIZE); - if (end == initrd_end) - end = round_up(end, PAGE_SIZE); + if (start == initrd_start) + start = round_down(start, PAGE_SIZE); + if (end == initrd_end) + end = round_up(end, PAGE_SIZE); - poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area((void *)start, (void *)end, -1, "initrd"); - } + poison_init_mem((void *)start, PAGE_ALIGN(end) - start); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; -} - -__setup("keepinitrd", keepinitrd_setup); #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3f957443f286..e24dc16453aa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -21,6 +21,7 @@ config ARM64 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 40e2d7e5efcb..007c05a4cce0 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -578,24 +578,11 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd __initdata; - void __init free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); - memblock_free(__virt_to_phys(start), end - start); - } -} - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; + free_reserved_area((void *)start, (void *)end, 0, "initrd"); + memblock_free(__virt_to_phys(start), end - start); } - -__setup("keepinitrd", keepinitrd_setup); #endif /* diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 2445dfcf6444..afe4949cfc2d 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -3,6 +3,7 @@ config UNICORE32 def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_KEEPINITRD select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_KERNEL_GZIP diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 74b6a2e29809..c0573feb5064 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -294,20 +294,8 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd; - void free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) - free_reserved_area((void 
*)start, (void *)end, -1, "initrd"); -} - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } - -__setup("keepinitrd", keepinitrd_setup); #endif diff --git a/init/initramfs.c b/init/initramfs.c index 32f940473d67..cb3d17735c66 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -513,6 +513,15 @@ static int __init retain_initrd_param(char *str) } __setup("retain_initrd", retain_initrd_param); +#ifdef CONFIG_ARCH_HAS_KEEPINITRD +static int __init keepinitrd_setup(char *__unused) +{ + do_retain_initrd = 1; + return 1; +} +__setup("keepinitrd", keepinitrd_setup); +#endif + extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include -- cgit v1.2.3-59-g8ed1b From 4afd58e14dd415e456fd236755373f52e6055ec7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:34 -0700 Subject: initramfs: provide a generic free_initrd_mem implementation For most architectures free_initrd_mem just expands to the same free_reserved_area call. Provide that as a generic implementation marked __weak. Link: http://lkml.kernel.org/r/20190213174621.29297-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Geert Uytterhoeven [m68k] Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 8 -------- arch/arc/mm/init.c | 7 ------- arch/c6x/mm/init.c | 7 ------- arch/h8300/mm/init.c | 8 -------- arch/m68k/mm/init.c | 7 ------- arch/microblaze/mm/init.c | 7 ------- arch/nds32/mm/init.c | 7 ------- arch/nios2/mm/init.c | 7 ------- arch/openrisc/mm/init.c | 7 ------- arch/parisc/mm/init.c | 7 ------- arch/powerpc/mm/mem.c | 7 ------- arch/sh/mm/init.c | 7 ------- arch/um/kernel/mem.c | 7 ------- arch/unicore32/mm/init.c | 7 ------- init/initramfs.c | 5 +++++ 15 files changed, 5 insertions(+), 100 deletions(-) (limited to 'arch') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index a42fc5c4db89..97f4940f11e3 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -291,11 +291,3 @@ free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void -free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index e1ab2d7f1d64..c357a3bd1532 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -214,10 +214,3 @@ void __ref free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index fe582c3a1794..6fd43ec53507 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -69,13 +69,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __init free_initmem(void) { free_initmem_default(-1); diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 0f04a5e9aa4f..ea635c9025fe 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -103,14 +103,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -#ifdef CONFIG_BLK_DEV_INITRD -void 
free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 8868a4c9adae..778cacb7d57b 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -147,10 +147,3 @@ void __init mem_init(void) init_pointer_tables(); mem_init_print_info(NULL); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 7e97d44f6538..b675bc666e68 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -186,13 +186,6 @@ void __init setup_memory(void) paging_init(); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { free_initmem_default(-1); diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 1d03633f89a9..9a7065c1fb83 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -257,13 +257,6 @@ void free_initmem(void) free_initmem_default(-1); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 16cea5776b87..60736a725883 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,13 +82,6 @@ void __init mmu_init(void) flush_tlb_all(); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __ref free_initmem(void) { free_initmem_default(-1); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index caeb4184e8a6..08df7f0b1d96 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -224,13 +224,6 @@ void __init mem_init(void) return; } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { free_initmem_default(-1); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3b0f9eab7f2c..11ec1f1221a6 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -917,10 +917,3 @@ void flush_tlb_all(void) spin_unlock(&sid_lock); } #endif - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index cd525d709072..20266898f3a8 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -338,13 +338,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* * This is called when a page has been modified by the kernel. * It just marks the page as not i-cache clean. 
We do the i-cache diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 70621324db41..3e68d98af1bd 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -408,13 +408,6 @@ void free_initmem(void) free_initmem_default(-1); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 99aa11bf53d1..a9c9a94c096f 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -188,13 +188,6 @@ void free_initmem(void) { } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* Allocate and free page tables. */ pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index c0573feb5064..6e352de80038 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -292,10 +292,3 @@ void free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/init/initramfs.c b/init/initramfs.c index cb3d17735c66..fcb759a106be 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -527,6 +527,11 @@ extern unsigned long __initramfs_size; #include #include +void __weak free_initrd_mem(unsigned long start, unsigned long end) +{ + free_reserved_area((void *)start, (void *)end, -1, "initrd"); +} + #ifdef CONFIG_KEXEC_CORE static bool kexec_free_initrd(void) { -- cgit v1.2.3-59-g8ed1b From f94f7434cbbb02f7eb55ed5ad66284023c47968f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:37 -0700 Subject: initramfs: poison freed initrd memory Various architectures including x86 poison the freed initrd memory. Do the same in the generic free_initrd_mem implementation and switch a few more architectures that are identical to the generic code over to it now. 
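To make the poisoning concrete, here is a minimal userspace sketch of the poison-then-free step. It is not the kernel code: free_reserved_area() also clears page state and hands the pages back to the buddy allocator, and the helper and variable names below are purely illustrative. The 0xcc fill is the same value the kernel uses for POISON_FREE_INITMEM, so a use-after-free of the initrd range reads back an obvious pattern rather than plausible stale data.

/*
 * Userspace model of the "poison before free" step; illustrative only.
 * The kernel's free_reserved_area() additionally clears page state,
 * adjusts the managed-page counters and returns the pages to the
 * buddy allocator.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define POISON_FREE_INITMEM 0xcc	/* mirrors the kernel's value */

static void poison_and_free(void *start, size_t len, const char *what)
{
	/* fill the range so a use-after-free reads 0xcc.., not stale data */
	memset(start, POISON_FREE_INITMEM, len);
	printf("Freeing %s memory: %zuK\n", what, len >> 10);
	free(start);
}

int main(void)
{
	size_t len = 16 << 10;
	unsigned char *initrd = malloc(len);

	if (!initrd)
		return 1;
	memset(initrd, 0xaa, len);	/* pretend: unpacked initrd contents */
	poison_and_free(initrd, len, "initrd");
	return 0;
}

Zeroing instead would hide such bugs, since an all-zero page often looks like legitimately NULL-initialized state; a loud poison value fails fast.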
Link: http://lkml.kernel.org/r/20190213174621.29297-9-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/init.c | 8 -------- arch/s390/mm/init.c | 8 -------- arch/sparc/mm/init_32.c | 8 -------- arch/sparc/mm/init_64.c | 8 -------- init/initramfs.c | 3 ++- 5 files changed, 2 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index bbb196ad5f26..8a038b30d3c4 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -504,14 +504,6 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - void (*free_init_pages_eva)(void *begin, void *end) = NULL; void __ref free_initmem(void) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 7cf48eefec8f..5f48fc7e61d5 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -157,14 +157,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - unsigned long memory_block_size_bytes(void) { /* diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index a8ff29821bdb..417f89d5e0b2 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -299,14 +299,6 @@ void free_initmem (void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index bc2aaa47bc8a..4b099dd7a767 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2572,14 +2572,6 @@ void free_initmem(void) } } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - pgprot_t PAGE_KERNEL __read_mostly; EXPORT_SYMBOL(PAGE_KERNEL); diff --git a/init/initramfs.c b/init/initramfs.c index fcb759a106be..435a428c2af1 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -529,7 +529,8 @@ extern unsigned long __initramfs_size; void __weak free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, -1, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #ifdef CONFIG_KEXEC_CORE -- cgit v1.2.3-59-g8ed1b From 997aef68af3ef1f2cb97da1c0b41a5afa87f63e2 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:40 -0700 Subject: init: provide a generic free_initmem implementation Patch series "provide a generic free_initmem implementation", v2. 
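Before the per-architecture details, the linker mechanism that this series (and the free_initrd_mem patches above) relies on is worth a sketch: a __weak definition survives only if no strong definition of the same symbol is linked in. A minimal userspace model with illustrative names follows; the kernel spells the attribute __weak, which expands to the same thing.

/*
 * Userspace sketch of the __weak override pattern; names illustrative.
 *
 * generic.c, the common default, analogous to init/initramfs.c:
 */
#include <stdio.h>

__attribute__((weak)) void free_initrd_mem(void)
{
	puts("generic free_initrd_mem");
}

int main(void)
{
	free_initrd_mem();
	return 0;
}

/*
 * An "architecture" overrides it simply by linking in a strong
 * definition from another object file, e.g. arch.c:
 *
 *	void free_initrd_mem(void) { puts("arch free_initrd_mem"); }
 *
 *	cc generic.c         ->  prints "generic free_initrd_mem"
 *	cc generic.c arch.c  ->  prints "arch free_initrd_mem"
 */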
Many architectures implement free_initmem() in exactly the same or very similar way: they wrap the call to free_initmem_default(), sometimes with a different 'poison' parameter. These patches switch those architectures to use a generic implementation that does free_initmem_default(POISON_FREE_INITMEM). This was inspired by Christoph's patches for free_initrd_mem [1] and I shamelessly copied changelog entries from his patches :) [1] https://lore.kernel.org/lkml/20190213174621.29297-1-hch@lst.de/ This patch (of 2): For most architectures free_initmem is just a wrapper for the same free_initmem_default(-1) call. Provide that as a generic implementation marked __weak. Link: http://lkml.kernel.org/r/1550515285-17446-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 6 ------ arch/arc/mm/init.c | 8 -------- arch/c6x/mm/init.c | 5 ----- arch/h8300/mm/init.c | 6 ------ arch/microblaze/mm/init.c | 5 ----- arch/nds32/mm/init.c | 5 ----- arch/nios2/mm/init.c | 5 ----- arch/openrisc/mm/init.c | 5 ----- arch/sh/mm/init.c | 5 ----- arch/unicore32/mm/init.c | 5 ----- arch/xtensa/mm/init.c | 5 ----- init/main.c | 5 +++++ 12 files changed, 5 insertions(+), 60 deletions(-) (limited to 'arch') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 97f4940f11e3..e2cbec3789e8 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -285,9 +285,3 @@ mem_init(void) memblock_free_all(); mem_init_print_info(NULL); } - -void -free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index c357a3bd1532..02b7a3b20d7c 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -206,11 +206,3 @@ void __init mem_init(void) memblock_free_all(); mem_init_print_info(NULL); } - -/* - * free_initmem: Free all the __init memory. 
- */ -void __ref free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 6fd43ec53507..573242b160e1 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -68,8 +68,3 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -void __init free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index ea635c9025fe..1eab16b1a0bc 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -102,9 +102,3 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -void -free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index b675bc666e68..a015a951c8b7 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -186,11 +186,6 @@ void __init setup_memory(void) paging_init(); } -void free_initmem(void) -{ - free_initmem_default(-1); -} - void __init mem_init(void) { high_memory = (void *)__va(memory_start + lowmem_size - 1); diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 9a7065c1fb83..1a4ab1b7525f 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -252,11 +252,6 @@ void __init mem_init(void) return; } -void free_initmem(void) -{ - free_initmem_default(-1); -} - void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 60736a725883..2c609c2516b2 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,11 +82,6 @@ void __init mmu_init(void) flush_tlb_all(); } -void __ref free_initmem(void) -{ - free_initmem_default(-1); -} - #define __page_aligned(order) __aligned(PAGE_SIZE << (order)) pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER); pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 08df7f0b1d96..abe87e54e231 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -223,8 +223,3 @@ void __init mem_init(void) mem_init_done = 1; return; } - -void free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3e68d98af1bd..aeb9f45c7a39 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -403,11 +403,6 @@ void __init mem_init(void) mem_init_done = 1; } -void free_initmem(void) -{ - free_initmem_default(-1); -} - #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 6e352de80038..b4442f3060ce 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -287,8 +287,3 @@ void __init mem_init(void) sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; } } - -void free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index d49861099684..b51746f2b80b 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -216,11 +216,6 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void free_initmem(void) -{ - free_initmem_default(-1); -} - static void __init parse_memmap_one(char *p) { char *oldp; diff --git a/init/main.c b/init/main.c index 33c87e91dc37..26234570a324 100644 --- a/init/main.c +++ b/init/main.c @@ -1074,6 +1074,11 @@ static inline void mark_readonly(void) } #endif +void __weak free_initmem(void) +{ + free_initmem_default(-1); +} + static int __ref kernel_init(void *unused) { int ret; -- 
cgit v1.2.3-59-g8ed1b From 522c99194549e50a9bd76427a06922d7a68237d6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:43 -0700 Subject: hexagon: switch over to generic free_initmem() hexagon implementation of free_initmem() is currently empty and marked with a comment * Todo: free pages between __init_begin and __init_end; possibly * some devtree related stuff as well. Switch it to the generic implementation. Link: http://lkml.kernel.org/r/1550515285-17446-3-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/mm/init.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch') diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 1719ede9e9bd..41cf34243ea1 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -84,16 +84,6 @@ void __init mem_init(void) init_mm.context.ptbase = __pa(init_mm.pgd); } -/* - * free_initmem - frees memory used by stuff declared with __init - * - * Todo: free pages between __init_begin and __init_end; possibly - * some devtree related stuff as well. - */ -void __ref free_initmem(void) -{ -} - /* * free_initrd_mem - frees... initrd memory. * @start - start of init memory -- cgit v1.2.3-59-g8ed1b From f40399992a245c852ad446e265d1567010db5e10 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:46 -0700 Subject: init: free_initmem: poison freed init memory Various architectures including x86 poison the freed init memory. Do the same in the generic free_initmem implementation and switch the sparc32 architecture, which is identical to the generic code, over to it now. Link: http://lkml.kernel.org/r/1550515285-17446-4-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/init_32.c | 5 ----- init/main.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 417f89d5e0b2..046ab116cc8c 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -294,11 +294,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -void free_initmem (void) -{ - free_initmem_default(POISON_FREE_INITMEM); -} - void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); diff --git a/init/main.c b/init/main.c index 26234570a324..5a2c69b4d7b3 100644 --- a/init/main.c +++ b/init/main.c @@ -1076,7 +1076,7 @@ static inline void mark_readonly(void) void __weak free_initmem(void) { - free_initmem_default(-1); + free_initmem_default(POISON_FREE_INITMEM); } static int __ref kernel_init(void *unused) -- cgit v1.2.3-59-g8ed1b From 0d7b4a607d8f0f95f1ce49e993a04317d10a4ecd Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:49 -0700 Subject: riscv: switch over to generic free_initmem() The riscv version of free_initmem() differs from the generic one only in that it sets the freed memory to zero. Make riscv use the generic version and poison the freed memory. 
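The generic version being adopted here frees whatever lies between the linker-provided __init_begin and __init_end markers. The same trick works in userspace on ELF targets, where the linker synthesizes __start_<sec> and __stop_<sec> symbols for any section whose name is a valid C identifier; the sketch below illustrates that mechanism only, and the section and variable names are made up, not kernel code.

/*
 * Userspace model of walking a linker-delimited range, the way
 * free_initmem() walks [__init_begin, __init_end); ELF-specific,
 * illustrative only.
 */
#include <stdio.h>

__attribute__((used, section("initish"))) static int boot_only_a = 1;
__attribute__((used, section("initish"))) static int boot_only_b = 2;

extern int __start_initish[];	/* synthesized by the linker */
extern int __stop_initish[];

int main(void)
{
	printf("init-like section spans %zu bytes\n",
	       (size_t)((char *)__stop_initish - (char *)__start_initish));
	for (int *p = __start_initish; p < __stop_initish; p++)
		printf("would poison and free the object holding %d\n", *p);
	return 0;
}

No table of addresses is needed: the linker script alone defines the range that free_initmem_default() poisons and hands back.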
Link: http://lkml.kernel.org/r/1550515285-17446-5-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Palmer Dabbelt Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/riscv/mm/init.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index bc7b77e34d09..8bf6f9c2d48c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -66,11 +66,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -void free_initmem(void) -{ - free_initmem_default(0); -} - #ifdef CONFIG_BLK_DEV_INITRD static void __init setup_initrd(void) { -- cgit v1.2.3-59-g8ed1b From a861bbce27634160ae0330126b4ef001d6941c8f Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:18:53 -0700 Subject: sh: advertise gigantic page support Patch series "Fix free/allocation of runtime gigantic pages", v8. This series fixes sh and sparc that did not advertise their gigantic page support and then were not able to allocate and free those pages at runtime. It renames MEMORY_ISOLATION && COMPACTION || CMA condition into the more accurate CONTIG_ALLOC, since it allows the definition of alloc_contig_range function. Finally, it then fixes the wrong definition of ARCH_HAS_GIGANTIC_PAGE config that, without MEMORY_ISOLATION && COMPACTION || CMA defined, did not allow architectures to free boottime allocated gigantic pages although unrelated. This patch (of 4): sh actually supports gigantic pages and selecting ARCH_HAS_GIGANTIC_PAGE allows it to allocate and free gigantic pages at runtime. At least sdk7786_defconfig exposes such a configuration with huge pages of 64MB, pages of 4KB and MAX_ORDER = 11: HPAGE_SHIFT (26) - PAGE_SHIFT (12) = 14 >= MAX_ORDER (11) Link: http://lkml.kernel.org/r/20190327063626.18421-2-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Michael Ellerman Cc: Vlastimil Babka Cc: Catalin Marinas Cc: Will Deacon Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: Dave Hansen Cc: Andy Lutomirsky Cc: Peter Zijlstra Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 0be08d586d40..6349396317a9 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -53,6 +53,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA help The SuperH is a RISC processor targeted for use in embedded systems -- cgit v1.2.3-59-g8ed1b From b53f45695449f692d5d2ab89cecc3316cdb636e8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:18:56 -0700 Subject: sparc: advertise gigantic page support sparc actually supports gigantic pages and selecting ARCH_HAS_GIGANTIC_PAGE allows it to allocate and free gigantic pages at runtime. sparc allows configuration such as huge pages of 16GB, pages of 8KB and MAX_ORDER = 13 (default): HPAGE_SHIFT (34) - PAGE_SHIFT (13) = 21 >= MAX_ORDER (13) Link: http://lkml.kernel.org/r/20190327063626.18421-3-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: David S. 
Miller Cc: Andy Lutomirsky Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Heiko Carstens Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Mike Kravetz Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index f6421c9ce5d3..b848c8ddd92e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA config ARCH_DEFCONFIG string -- cgit v1.2.3-59-g8ed1b From 8df995f6bde01de96ce93373785f41c3bd13ad1c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:19:00 -0700 Subject: mm: simplify MEMORY_ISOLATION && COMPACTION || CMA into CONTIG_ALLOC This condition allows to define alloc_contig_range, so simplify it into a more accurate naming. Link: http://lkml.kernel.org/r/20190327063626.18421-4-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Andy Lutomirsky Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Mike Kravetz Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/powerpc/platforms/Kconfig.cputype | 2 +- arch/s390/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/mm/hugetlbpage.c | 2 +- include/linux/gfp.h | 2 +- mm/Kconfig | 3 +++ mm/page_alloc.c | 3 +-- 10 files changed, 12 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e24dc16453aa..7f7fbd8bd9d5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -19,7 +19,7 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d0e172d47574..3a31d4289ea4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 && HUGETLB_PAGE - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select PPC_HAVE_KUEP select PPC_HAVE_KUAP default y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 07485582d027..724dbc6b7d33 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -63,7 +63,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC 
select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6349396317a9..2a5ec643fec0 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -53,7 +53,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC help The SuperH is a RISC processor targeted for use in embedded systems diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index b848c8ddd92e..566de738e487 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,7 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC config ARCH_DEFCONFIG string diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e7212731cffb..526d95abfe5e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 92e4c4b85bba..fab095362c50 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -203,7 +203,7 @@ static __init int setup_hugepagesz(char *opt) } __setup("hugepagesz=", setup_hugepagesz); -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fdab7de7490d..e77ab30e9328 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -585,7 +585,7 @@ static inline bool pm_suspended_storage(void) } #endif /* CONFIG_PM_SLEEP */ -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..137eadc18732 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -258,6 +258,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config CONTIG_ALLOC + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + config PHYS_ADDR_T_64BIT def_bool 64BIT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a0d722d481..2efb6525d932 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8137,8 +8137,7 @@ unmovable: return true; } -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) - +#ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, -- cgit v1.2.3-59-g8ed1b From 4eb0716e868eed963967adb0b1b11d9bd8ca1d01 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:19:04 -0700 Subject: hugetlb: allow to free gigantic pages regardless of the configuration On systems without CONTIG_ALLOC activated but that support gigantic pages, boottime reserved gigantic pages can not be freed at all. 
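The patch can leave the free path available unconditionally because the capability check moves from the preprocessor into a plain C conditional on IS_ENABLED(CONFIG_CONTIG_ALLOC): the code stays compiled in, and only the "grow the pool" case is refused at runtime. A simplified sketch of the preprocessor trick behind IS_ENABLED() follows; the real macro in include/linux/kconfig.h also handles the =m (module) case, and CONFIG_HYPOTHETICAL below is a made-up option used to show the disabled side.

/*
 * Simplified model of the kernel's IS_ENABLED(). If CONFIG_X is
 * defined as 1 it evaluates to 1, otherwise to 0, and it remains a
 * valid C expression even when CONFIG_X is not defined at all.
 */
#include <stdio.h>

#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED(option) __is_defined(option)

#define CONFIG_CONTIG_ALLOC 1
/* CONFIG_HYPOTHETICAL is deliberately left undefined */

int main(void)
{
	printf("CONTIG_ALLOC:  %d\n", IS_ENABLED(CONFIG_CONTIG_ALLOC));
	printf("HYPOTHETICAL:  %d\n", IS_ENABLED(CONFIG_HYPOTHETICAL));

	if (!IS_ENABLED(CONFIG_HYPOTHETICAL))
		puts("runtime allocation refused, frees still compiled in");
	return 0;
}

Since a disabled option is simply undefined, IS_ENABLED() folds to a constant 0 and the compiler discards the dead branch while still type-checking it, which is exactly what an #ifdef would lose.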
This patch simply enables the possibility to hand back those pages to memory allocator. Link: http://lkml.kernel.org/r/20190327063626.18421-5-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: David S. Miller [sparc] Reviewed-by: Mike Kravetz Cc: Andy Lutomirsky Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Heiko Carstens Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 4 --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 5 ++- arch/powerpc/platforms/Kconfig.cputype | 2 +- arch/s390/Kconfig | 2 +- arch/s390/include/asm/hugetlb.h | 8 +++-- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 4 --- include/asm-generic/hugetlb.h | 7 ++++ include/linux/gfp.h | 2 +- mm/hugetlb.c | 54 +++++++++++++++++++--------- mm/page_alloc.c | 4 +-- 14 files changed, 61 insertions(+), 39 deletions(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7f7fbd8bd9d5..7a1aa53d188d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -19,7 +19,7 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index c6a07a3b433e..4aad6382f631 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -70,8 +70,4 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, #include -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif - #endif /* __ASM_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 56140d19c85f..12e150e615b7 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -36,8 +36,8 @@ static inline int hstate_get_psize(struct hstate *hstate) } } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) +#define __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +static inline bool gigantic_page_runtime_supported(void) { /* * We used gigantic page reservation with hypervisor assist in some case. 
@@ -49,7 +49,6 @@ static inline bool gigantic_page_supported(void) return true; } -#endif /* hugepd entry valid bit */ #define HUGEPD_VAL_BITS (0x8000000000000000UL) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3a31d4289ea4..2794235e9d3e 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 && HUGETLB_PAGE - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select PPC_HAVE_KUEP select PPC_HAVE_KUAP default y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 724dbc6b7d33..d0c046af65fa 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -63,7 +63,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d1afa58a4b6..bb59dd964590 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -116,7 +116,9 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif +static inline bool gigantic_page_runtime_supported(void) +{ + return true; +} + #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2a5ec643fec0..2a77033e1e7c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -53,7 +53,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE help The SuperH is a RISC processor targeted for use in embedded systems diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 566de738e487..7c93f3121ee6 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,7 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE config ARCH_DEFCONFIG string diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 526d95abfe5e..f21bc56e5d7b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 7469d321f072..f65cfb48cfdd 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -17,8 +17,4 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif - #endif /* _ASM_X86_HUGETLB_H */ diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 71d7b77eea50..822f433ac95c 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -126,4 +126,11 @@ static inline pte_t huge_ptep_get(pte_t *ptep) } #endif +#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +static inline 
bool gigantic_page_runtime_supported(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); +} +#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ + #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e77ab30e9328..fb07b503dc45 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -589,8 +589,8 @@ static inline bool pm_suspended_storage(void) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); -extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif +void free_contig_range(unsigned long pfn, unsigned int nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dffe5d9d03ae..2f901a6e13d2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1059,6 +1059,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) free_contig_range(page_to_pfn(page), 1 << order); } +#ifdef CONFIG_CONTIG_ALLOC static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { @@ -1143,11 +1144,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); +#else /* !CONFIG_CONTIG_ALLOC */ +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + return NULL; +} +#endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static inline bool gigantic_page_supported(void) { return false; } static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) { return NULL; } + int nid, nodemask_t *nodemask) +{ + return NULL; +} static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } @@ -1157,7 +1167,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; h->nr_huge_pages--; @@ -2278,13 +2288,27 @@ found: } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, - nodemask_t *nodes_allowed) +static int set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) - return h->max_huge_pages; + spin_lock(&hugetlb_lock); + + /* + * Gigantic pages runtime allocation depend on the capability for large + * page range allocation. + * If the system does not provide this feature, return an error when + * the user tries to allocate gigantic pages but let the user free the + * boottime allocated gigantic pages. + */ + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { + if (count > persistent_huge_pages(h)) { + spin_unlock(&hugetlb_lock); + return -EINVAL; + } + /* Fall through to decrease pool */ + } /* * Increase the pool size @@ -2297,7 +2321,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. 
*/ - spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; @@ -2352,9 +2375,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, break; } out: - ret = persistent_huge_pages(h); + h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); - return ret; + + return 0; } #define HSTATE_ATTR_RO(_name) \ @@ -2406,7 +2430,7 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, int err; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - if (hstate_is_gigantic(h) && !gigantic_page_supported()) { + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) { err = -EINVAL; goto out; } @@ -2430,15 +2454,13 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, } else nodes_allowed = &node_states[N_MEMORY]; - h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); + err = set_max_huge_pages(h, count, nodes_allowed); +out: if (nodes_allowed != &node_states[N_MEMORY]) NODEMASK_FREE(nodes_allowed); - return len; -out: - NODEMASK_FREE(nodes_allowed); - return err; + return err ? err : len; } static ssize_t nr_hugepages_store_common(bool obey_mempolicy, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2efb6525d932..4ea71bc70413 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8346,8 +8346,9 @@ done: pfn_max_align_up(end), migratetype); return ret; } +#endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned nr_pages) +void free_contig_range(unsigned long pfn, unsigned int nr_pages) { unsigned int count = 0; @@ -8359,7 +8360,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } WARN(count != 0, "%d pages are still in use!\n", count); } -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* -- cgit v1.2.3-59-g8ed1b From 5470dea49f5382257c242ac617d908267727f1a8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 13 May 2019 17:21:10 -0700 Subject: mm: use mm_zero_struct_page from SPARC on all 64b architectures Patch series "Deferred page init improvements", v7. This patchset is essentially a refactor of the page initialization logic that is meant to provide for better code reuse while providing a significant improvement in deferred page initialization performance. In my testing on an x86_64 system with 384GB of RAM I have seen the following. In the case of regular memory initialization the deferred init time was decreased from 3.75s to 1.38s on average. This amounts to a 172% improvement for the deferred memory initialization performance. I have called out the improvement observed with each patch. This patch (of 4): Use the same approach that was already in use on Sparc on all the architectures that support a 64b long. This is mostly motivated by the fact that 7 to 10 store/move instructions are likely always going to be faster than having to call into a function that is not specialized for handling page init. An added advantage to doing it this way is that the compiler can get away with combining writes in the __init_single_page call. As a result the memset call will be reduced to only about 4 write operations, or at least that is what I am seeing with GCC 6.2 as the flags, LRU pointers, and count/mapcount seem to be cancelling out at least 4 of the 8 assignments on my system. One change I had to make to the function was to reduce the minimum page size to 56 to support some powerpc64 configurations. 
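In other words, the smallest supported sizeof(struct page) becomes 56 bytes. Below is a standalone model of the unrolled-switch zeroing from the patch; struct fake_page is an illustrative stand-in for struct page, and the assertion enforces the same 56..80-byte, multiple-of-8 envelope.

/*
 * Standalone model of the switch-based zeroing; compile and run to
 * check it writes exactly sizeof(struct fake_page) bytes of zeroes.
 * Every case falls through, so a 64-byte struct gets eight 8-byte
 * stores that the compiler may merge into wider moves.
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

struct fake_page {
	unsigned long words[8];	/* 64 bytes on LP64, a common struct page size */
};

static inline void zero_struct_page(struct fake_page *page)
{
	unsigned long *_pp = (void *)page;

	_Static_assert((sizeof(struct fake_page) & 7) == 0 &&
		       sizeof(struct fake_page) >= 56 &&
		       sizeof(struct fake_page) <= 80,
		       "size must be 56..80 and a multiple of 8");

	switch (sizeof(struct fake_page)) {
	case 80:
		_pp[9] = 0;	/* fallthrough */
	case 72:
		_pp[8] = 0;	/* fallthrough */
	case 64:
		_pp[7] = 0;	/* fallthrough */
	case 56:
		_pp[6] = 0;
		_pp[5] = 0;
		_pp[4] = 0;
		_pp[3] = 0;
		_pp[2] = 0;
		_pp[1] = 0;
		_pp[0] = 0;
	}
}

int main(void)
{
	struct fake_page page, zeroes;

	memset(&page, 0xff, sizeof(page));
	memset(&zeroes, 0, sizeof(zeroes));
	zero_struct_page(&page);
	assert(memcmp(&page, &zeroes, sizeof(page)) == 0);
	puts("switch-based zeroing matches memset");
	return 0;
}

With a constant sizeof() the unreachable cases are dropped and the remaining stores can be combined, which is where the measured win over an out-of-line memset() comes from.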
This change should introduce no change on SPARC since it already had this code. In the case of x86_64 I saw a reduction from 3.75s to 2.80s when initializing 384GB of RAM per node. Pavel Tatashin tested on a system with Broadcom's Stingray CPU and 48GB of RAM and found that __init_single_page() takes 19.30ns / 64-byte struct page before this patch and with this patch it takes 17.33ns / 64-byte struct page. Mike Rapoport ran a similar test on an OpenPower (S812LC 8348-21C) with Power8 processor and 128GB of RAM. His results per 64-byte struct page were 4.68ns before, and 4.59ns after this patch. Link: http://lkml.kernel.org/r/20190405221213.12227.9392.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Mike Rapoport Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 30 --------------------------- include/linux/mm.h | 41 ++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1393a8ac596b..22500c3be7a9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) -/* This macro must be updated when the size of struct page grows above 80 - * or reduces below 64. - * The idea that compiler optimizes out switch() statement, and only - * leaves clrx instructions - */ -#define mm_zero_struct_page(pp) do { \ - unsigned long *_pp = (void *)(pp); \ - \ - /* Check that struct page is either 64, 72, or 80 bytes */ \ - BUILD_BUG_ON(sizeof(struct page) & 7); \ - BUILD_BUG_ON(sizeof(struct page) < 64); \ - BUILD_BUG_ON(sizeof(struct page) > 80); \ - \ - switch (sizeof(struct page)) { \ - case 80: \ - _pp[9] = 0; /* fallthrough */ \ - case 72: \ - _pp[8] = 0; /* fallthrough */ \ - default: \ - _pp[7] = 0; \ - _pp[6] = 0; \ - _pp[5] = 0; \ - _pp[4] = 0; \ - _pp[3] = 0; \ - _pp[2] = 0; \ - _pp[1] = 0; \ - _pp[0] = 0; \ - } \ -} while (0) - /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. This is to handle systems where * the first physical page in the machine is at some huge physical address, diff --git a/include/linux/mm.h b/include/linux/mm.h index e6b6be15609e..abb7eb7ef0f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -124,10 +124,45 @@ extern int mmap_rnd_compat_bits __read_mostly; /* * On some architectures it is expensive to call memset() for small sizes. - * Those architectures should provide their own implementation of "struct page" - * zeroing by defining this macro in . + * If an architecture decides to implement their own version of + * mm_zero_struct_page they should wrap the defines below in a #ifndef and + * define their own version of this macro in */ -#ifndef mm_zero_struct_page +#if BITS_PER_LONG == 64 +/* This function must be updated when the size of struct page grows above 80 + * or reduces below 56. The idea that compiler optimizes out switch() + * statement, and only leaves move/store instructions. 
Also the compiler can + * combine write statments if they are both assignments and can be reordered, + * this can result in several of the writes here being dropped. + */ +#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) +static inline void __mm_zero_struct_page(struct page *page) +{ + unsigned long *_pp = (void *)page; + + /* Check that struct page is either 56, 64, 72, or 80 bytes */ + BUILD_BUG_ON(sizeof(struct page) & 7); + BUILD_BUG_ON(sizeof(struct page) < 56); + BUILD_BUG_ON(sizeof(struct page) > 80); + + switch (sizeof(struct page)) { + case 80: + _pp[9] = 0; /* fallthrough */ + case 72: + _pp[8] = 0; /* fallthrough */ + case 64: + _pp[7] = 0; /* fallthrough */ + case 56: + _pp[6] = 0; + _pp[5] = 0; + _pp[4] = 0; + _pp[3] = 0; + _pp[2] = 0; + _pp[1] = 0; + _pp[0] = 0; + } +} +#else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif -- cgit v1.2.3-59-g8ed1b From 940519f0c8b757fdcbc5d14c93cdaada20ded14c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:26 -0700 Subject: mm, memory_hotplug: provide a more generic restrictions for memory hotplug arch_add_memory, __add_pages take a want_memblock which controls whether the newly added memory should get the sysfs memblock user API (e.g. ZONE_DEVICE users do not want/need this interface). Some callers even want to control where do we allocate the memmap from by configuring altmap. Add a more generic hotplug context for arch_add_memory and __add_pages. struct mhp_restrictions contains flags which contains additional features to be enabled by the memory hotplug (MHP_MEMBLOCK_API currently) and altmap for alternative memmap allocator. This patch shouldn't introduce any functional change. [akpm@linux-foundation.org: build fix] Link: http://lkml.kernel.org/r/20190408082633.2864-3-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 6 +++--- arch/ia64/mm/init.c | 6 +++--- arch/powerpc/mm/mem.c | 6 +++--- arch/s390/mm/init.c | 6 +++--- arch/sh/mm/init.c | 6 +++--- arch/x86/mm/init_32.c | 6 +++--- arch/x86/mm/init_64.c | 10 +++++----- include/linux/memory_hotplug.h | 31 ++++++++++++++++++++++++------- kernel/memremap.c | 12 +++++++++--- mm/memory_hotplug.c | 11 +++++++---- 10 files changed, 63 insertions(+), 37 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ef82312860ac..ef32d4839c3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1065,8 +1065,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { int flags = 0; @@ -1077,6 +1077,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, - altmap, want_memblock); + restrictions); } #endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index e49200e31750..379eb1f9adc9 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -666,14 +666,14 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct 
mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 20266898f3a8..de5c591a550d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int __ref arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -127,7 +127,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altm } flush_inval_dcache_range(start, start + size); - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 5f48fc7e61d5..06bd05137a00 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -219,8 +219,8 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -230,7 +230,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock); + rc = __add_pages(nid, start_pfn, size_pages, restrictions); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index aeb9f45c7a39..d3cd07bd2dc1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -404,15 +404,15 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. 
*/ - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 85c94f9a87f8..755dbed85531 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -850,13 +850,13 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bccff68e3267..db42c11b48fb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) } int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock) + struct mhp_restrictions *restrictions) { int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ @@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, return ret; } -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size); - return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return add_pages(nid, start_pfn, nr_pages, restrictions); } #define PAGE_INUSE 0xFD diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3c8cf347804c..b24aca54353e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -53,6 +53,16 @@ enum { MMOP_ONLINE_MOVABLE, }; +/* + * Restrictions for the memory hotplug: + * flags: MHP_ flags + * altmap: alternative allocator for memmap array + */ +struct mhp_restrictions { + unsigned long flags; + struct vmem_altmap *altmap; +}; + /* * Zone resizing functions * @@ -101,6 +111,8 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); +extern int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions); extern u64 max_mem_size; extern bool memhp_auto_online; @@ -118,20 +130,27 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ +/* + * Do we want sysfs memblock files created. This will allow userspace to online + * and offline memory explicitly. Lack of this bit means that the caller has to + * call move_pfn_range_to_zone to finish the initialization. 
+ */ + +#define MHP_MEMBLOCK_API (1<<0) + /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #ifndef CONFIG_ARCH_HAS_ADD_PAGES static inline int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #else /* ARCH_HAS_ADD_PAGES */ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #endif /* ARCH_HAS_ADD_PAGES */ #ifdef CONFIG_NUMA @@ -332,8 +351,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); -extern int arch_add_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern bool is_memblock_offlined(struct memory_block *mem); diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 75f9f6590677..339d5a62d5d5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -273,12 +273,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, * add the new pages. 
*/ int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap = restrictions->altmap; /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); @@ -299,7 +299,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, section_nr_to_pfn(i), altmap, - want_memblock); + restrictions->flags & MHP_MEMBLOCK_API); /* * EEXIST is finally dealt with by ioresource collision @@ -1097,6 +1097,9 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { + struct mhp_restrictions restrictions = { + .flags = MHP_MEMBLOCK_API, + }; u64 start, size; bool new_node = false; int ret; @@ -1124,7 +1127,7 @@ int __ref add_memory_resource(int nid, struct resource *res) new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, NULL, true); + ret = arch_add_memory(nid, start, size, &restrictions); if (ret < 0) goto error; -- cgit v1.2.3-59-g8ed1b From ac5c94264580f498e484c854031d0226b3c1038f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:21:46 -0700 Subject: mm/memory_hotplug: make __remove_pages() and arch_remove_memory() never fail All callers of arch_remove_memory() ignore errors. And we should really try to remove any errors from the memory removal path. No more errors are reported from __remove_pages(). BUG() in s390x code in case arch_remove_memory() is triggered. We may implement that properly later. WARN in case powerpc code failed to remove the section mapping, which is better than ignoring the error completely right now. Link: http://lkml.kernel.org/r/20190409100148.24703-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: "Kirill A. Shutemov" Cc: Christophe Leroy Cc: Stefan Agner Cc: Nicholas Piggin Cc: Pavel Tatashin Cc: Vasily Gorbik Cc: Arun KS Cc: Geert Uytterhoeven Cc: Masahiro Yamada Cc: Rob Herring Cc: Joonsoo Kim Cc: Wei Yang Cc: Qian Cai Cc: Mathieu Malaterre Cc: Andrew Banman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Mike Travis Cc: Oscar Salvador Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 11 +++-------- arch/powerpc/mm/mem.c | 9 +++------ arch/s390/mm/init.c | 5 +++-- arch/sh/mm/init.c | 11 +++-------- arch/x86/mm/init_32.c | 5 +++-- arch/x86/mm/init_64.c | 10 +++------- include/linux/memory_hotplug.h | 8 ++++---- mm/memory_hotplug.c | 5 ++--- 8 files changed, 24 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 379eb1f9adc9..d28e29103bdb 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -682,20 +682,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - if (ret) - pr_warn("%s: Problem encountered in __remove_pages() as" - " ret=%d\n", __func__, ret); - - return ret; + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index de5c591a550d..e885fe2aafcc 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -131,7 +131,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int __ref arch_remove_memory(int nid, u64 start, u64 size, +void __ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; @@ -147,14 +147,13 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, if (altmap) page += vmem_altmap_offset(altmap); - ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); - if (ret) - return ret; + __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); flush_inval_dcache_range(start, start + size); ret = remove_section_mapping(start, start + size); + WARN_ON_ONCE(ret); /* Ensure all vmalloc mappings are flushed in case they also * hit that section of memory @@ -163,8 +162,6 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) pr_warn("Hash collision while resizing HPT\n"); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 06bd05137a00..14d1eae9fe43 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -237,14 +237,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { /* * There is no hardware or firmware interface which could trigger a * hot memory remove on s390. So there is nothing that needs to be * implemented. 
*/ - return -EBUSY; + BUG(); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index d3cd07bd2dc1..b95e343e3c9d 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -429,20 +429,15 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - if (unlikely(ret)) - pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, - ret); - - return ret; + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 755dbed85531..075e568098f2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -860,14 +860,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index db42c11b48fb..20d14254b686 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1141,24 +1141,20 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -int __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(start_pfn); struct zone *zone; - int ret; /* With altmap the first mapped page is offset from @start */ if (altmap) page += vmem_altmap_offset(altmap); zone = page_zone(page); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - WARN_ON_ONCE(ret); + __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index b24aca54353e..ae892eef8b82 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -124,10 +124,10 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE -extern int arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap); -extern int __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); +extern void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap); +extern void __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3512bba20e2b..6c0c4f48638e 100644 --- a/mm/memory_hotplug.c +++ 
b/mm/memory_hotplug.c @@ -547,8 +547,8 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) +void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; unsigned long map_offset = 0; @@ -579,7 +579,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } set_zone_contiguous(zone); - return 0; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3-59-g8ed1b From 6248461d2168233321601cd6bf2528179b6ac3d1 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:00 -0700 Subject: arm: mm: dma-mapping: convert to use vm_map_pages() Convert to use vm_map_pages() to map range of kernel memory to user vma. Link: http://lkml.kernel.org/r/936e5e107c746a7310e3a3c471188ca3ac8f9754.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Cc: Boris Ostrovsky Cc: David Airlie Cc: Heiko Stuebner Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Oleksandr Andrushchenko Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 43f46aa7ef33..0a75058c11f3 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1577,31 +1577,21 @@ static int __arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; struct page **pages = __iommu_get_pages(cpu_addr, attrs); unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; + int err; if (!pages) return -ENXIO; - if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off) + if (vma->vm_pgoff >= nr_pages) return -ENXIO; - pages += off; - - do { - int ret = vm_insert_page(vma, uaddr, *pages++); - if (ret) { - pr_err("Remapping memory failed: %d\n", ret); - return ret; - } - uaddr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); + err = vm_map_pages(vma, pages, nr_pages); + if (err) + pr_err("Remapping memory failed: %d\n", err); - return 0; + return err; } static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, -- cgit v1.2.3-59-g8ed1b From 350e88bad4964da6feabee02a1a70381bcdb087e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:22:59 -0700 Subject: mm: memblock: make keeping memblock memory opt-in rather than opt-out Most architectures do not need the memblock memory after the page allocator is initialized, but only few enable ARCH_DISCARD_MEMBLOCK in the arch Kconfig. 
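The replacement described next boils down to one inverted guard; condensed from the include/linux/memblock.h hunk in this patch (a sketch, not the complete header):

  #ifndef CONFIG_ARCH_KEEP_MEMBLOCK	/* was: #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK */
  #define __init_memblock __meminit	/* memblock code/data freed after boot */
  #define __initdata_memblock __meminitdata
  void memblock_discard(void);
  #else
  #define __init_memblock		/* memblock stays resident */
  #define __initdata_memblock
  static inline void memblock_discard(void) {}
  #endif

An architecture that previously forgot to select ARCH_DISCARD_MEMBLOCK silently kept the memblock arrays around; keeping them now requires an explicit opt-in.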
Replacing ARCH_DISCARD_MEMBLOCK with ARCH_KEEP_MEMBLOCK and inverting the logic makes it clear which architectures actually use memblock after system initialization and skips the necessity to add ARCH_DISCARD_MEMBLOCK to the architectures that are still missing that option. Link: http://lkml.kernel.org/r/1556102150-32517-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michael Ellerman (powerpc) Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 1 + arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 - arch/x86/Kconfig | 1 - include/linux/memblock.h | 3 ++- kernel/kexec_file.c | 16 ++++++++-------- mm/Kconfig | 2 +- mm/memblock.c | 6 +++--- mm/page_alloc.c | 3 +-- 16 files changed, 19 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a11dfcc2a130..5fd344bd06b9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -4,7 +4,6 @@ config ARM default y select ARCH_32BIT_OFF_T select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE @@ -22,6 +21,7 @@ config ARM select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7a1aa53d188d..69a59a5d1143 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -60,6 +60,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 3e54a53208d5..b7d404bbaa0f 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -22,7 +22,6 @@ config HEXAGON select GENERIC_IRQ_SHOW select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select ARCH_DISCARD_MEMBLOCK select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 73a26f04644e..7468d8e50467 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -33,7 +33,6 @@ config IA64 select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select VIRT_TO_BUS - select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index fe5cc2da6d10..218e037ef901 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -26,7 +26,6 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select 
OLD_SIGACTION - select ARCH_DISCARD_MEMBLOCK select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ff8cff9fcf54..677e5bfeff47 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -5,7 +5,6 @@ config MIPS select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index ea37394ff3ea..26a9c760a98b 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -23,7 +23,6 @@ config NIOS2 select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS - select ARCH_DISCARD_MEMBLOCK select MMU_GATHER_NO_RANGE if MMU config GENERIC_CSUM diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d7996cfaceca..8c1c636308c8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d0c046af65fa..109243fdb6ec 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -100,6 +100,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_KEEP_MEMBLOCK select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2a77033e1e7c..b77f512bb176 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -10,7 +10,6 @@ config SUPERH select DMA_DECLARE_COHERENT select HAVE_IDE if HAS_IOPORT_MAP select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK select HAVE_OPROFILE select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f21bc56e5d7b..818b361094ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,7 +47,6 @@ config X86 select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 47e3c0612592..676d3900e1bd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -96,13 +96,14 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock +static inline void memblock_discard(void) {} #endif #define memblock_dbg(fmt, ...) 
\ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); diff --git a/mm/Kconfig b/mm/Kconfig index 71e697e693df..c5124c2cb0b2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -136,7 +136,7 @@ config HAVE_MEMBLOCK_PHYS_MAP config HAVE_GENERIC_GUP bool -config ARCH_DISCARD_MEMBLOCK +config ARCH_KEEP_MEMBLOCK bool config MEMORY_ISOLATION diff --git a/mm/memblock.c b/mm/memblock.c index f315eca9f4a1..6bbad46f4d2c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ * :c:func:`mem_init` function frees all the memory to the buddy page * allocator. * - * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures will be discarded after the system * initialization compltes. */ @@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK /** * memblock_discard - discard memory and reserved arrays if they were allocated */ @@ -1987,7 +1987,7 @@ unsigned long __init memblock_free_all(void) return pages; } -#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbda9aea0bf5..f2f3fb4921d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1896,10 +1896,9 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ memblock_discard(); -#endif for_each_populated_zone(zone) set_zone_contiguous(zone); -- cgit v1.2.3-59-g8ed1b From 687a3e4d8e6129f064711291f1564d95472dba3e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:41:45 -0700 Subject: treewide: remove SPDX "WITH Linux-syscall-note" from kernel-space headers The "WITH Linux-syscall-note" should be added to headers exported to the user-space. Some kernel-space headers have "WITH Linux-syscall-note", which seems a mistake. [1] arch/x86/include/asm/hyperv-tlfs.h Commit 5a4858032217 ("x86/hyper-v: move hyperv.h out of uapi") moved this file out of uapi, but missed to update the SPDX License tag. 
[2] include/asm-generic/shmparam.h Commit 76ce2a80a28e ("Rename include/{uapi => }/asm-generic/shmparam.h really") moved this file out of uapi, but missed to update the SPDX License tag. [3] include/linux/qcom-geni-se.h Commit eddac5af0654 ("soc: qcom: Add GENI based QUP Wrapper driver") added this file, but I do not see a good reason why its license tag must include "WITH Linux-syscall-note". Link: http://lkml.kernel.org/r/1554196104-3522-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/hyperv-tlfs.h | 2 +- include/asm-generic/shmparam.h | 2 +- include/linux/qcom-geni-se.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2bdbbbcfa393..cdf44aa9a501 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file contains definitions from Hyper-V Hypervisor Top-Level Functional diff --git a/include/asm-generic/shmparam.h b/include/asm-generic/shmparam.h index 8b78c0ba08b1..b8f9035ffc2c 100644 --- a/include/asm-generic/shmparam.h +++ b/include/asm-generic/shmparam.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_SHMPARAM_H #define __ASM_GENERIC_SHMPARAM_H diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index 3bcd67fd5548..dd464943f717 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved. */ -- cgit v1.2.3-59-g8ed1b From be167862ae7dd85c56d385209a4890678e1b0488 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 May 2019 15:41:48 -0700 Subject: ARM: prevent tracing IPI_CPU_BACKTRACE Patch series "compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING", v3. This patch (of 11): When function tracing for IPIs is enabled, we get a warning for an overflow of the ipi_types array with the IPI_CPU_BACKTRACE type as triggered by raise_nmi(): arch/arm/kernel/smp.c: In function 'raise_nmi': arch/arm/kernel/smp.c:489:2: error: array subscript is above array bounds [-Werror=array-bounds] trace_ipi_raise(target, ipi_types[ipinr]); This is a correct warning as we actually overflow the array here. This patch raise_nmi() to call __smp_cross_call() instead of smp_cross_call(), to avoid calling into ftrace. For clarification, I'm also adding a two new code comments describing how this one is special. The warning appears to have shown up after commit e7273ff49acf ("ARM: 8488/1: Make IPI_CPU_BACKTRACE a "non-secure" SGI"), which changed the number assignment from '15' to '8', but as far as I can tell has existed since the IPI tracepoints were first introduced. If we decide to backport this patch to stable kernels, we probably need to backport e7273ff49acf as well. 
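To make the overflow concrete, the traced call path looks roughly like this (paraphrased from arch/arm/kernel/smp.c; ipi_types[] is the NR_IPI-sized table of trace strings):

  static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
  {
  	/* ipinr == IPI_CPU_BACKTRACE == NR_IPI indexes one past the end */
  	trace_ipi_raise(target, ipi_types[ipinr]);
  	__smp_cross_call(target, ipinr);	/* untraced low-level SGI */
  }

Routing raise_nmi() directly to __smp_cross_call() bypasses the tracepoint and with it the out-of-bounds table lookup.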
[yamada.masahiro@socionext.com: rebase on v5.1-rc1] Link: http://lkml.kernel.org/r/20190423034959.13525-2-yamada.masahiro@socionext.com Fixes: e7273ff49acf ("ARM: 8488/1: Make IPI_CPU_BACKTRACE a "non-secure" SGI") Fixes: 365ec7b17327 ("ARM: add IPI tracepoints") # v3.17 Signed-off-by: Arnd Bergmann Signed-off-by: Masahiro Yamada Cc: Heiko Carstens Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Christophe Leroy Cc: Mathieu Malaterre Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ralf Baechle Cc: Stefan Agner Cc: Boris Brezillon Cc: Miquel Raynal Cc: Richard Weinberger Cc: David Woodhouse Cc: Brian Norris Cc: Marek Vasut Cc: Russell King Cc: Borislav Petkov Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hardirq.h | 1 + arch/arm/kernel/smp.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h index cba23eaa6072..7a88f160b1fb 100644 --- a/arch/arm/include/asm/hardirq.h +++ b/arch/arm/include/asm/hardirq.h @@ -6,6 +6,7 @@ #include #include +/* number of IPIS _not_ including IPI_CPU_BACKTRACE */ #define NR_IPI 7 typedef struct { diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index facd4240ca02..c93fe0f256de 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -70,6 +70,10 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_IRQ_WORK, IPI_COMPLETION, + /* + * CPU_BACKTRACE is special and not included in NR_IPI + * or tracable with trace_ipi_* + */ IPI_CPU_BACKTRACE, /* * SGI8-15 can be reserved by secure firmware, and thus may @@ -797,7 +801,7 @@ core_initcall(register_cpufreq_notifier); static void raise_nmi(cpumask_t *mask) { - smp_cross_call(mask, IPI_CPU_BACKTRACE); + __smp_cross_call(mask, IPI_CPU_BACKTRACE); } void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) -- cgit v1.2.3-59-g8ed1b From 02166b88d376f21ea5caac26e7e8f74ab5578986 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:41:52 -0700 Subject: arm64: mark (__)cpus_have_const_cap as __always_inline This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common place. We need to eliminate potential issues beforehand. If it is enabled for arm64, the following errors are reported: In file included from include/linux/compiler_types.h:68, from : arch/arm64/include/asm/jump_label.h: In function 'cpus_have_const_cap': include/linux/compiler-gcc.h:120:38: warning: asm operand 0 probably doesn't match constraints #define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) ^~~ arch/arm64/include/asm/jump_label.h:32:2: note: in expansion of macro 'asm_volatile_goto' asm_volatile_goto( ^~~~~~~~~~~~~~~~~ include/linux/compiler-gcc.h:120:38: error: impossible constraint in 'asm' #define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) ^~~ arch/arm64/include/asm/jump_label.h:32:2: note: in expansion of macro 'asm_volatile_goto' asm_volatile_goto( ^~~~~~~~~~~~~~~~~ Link: http://lkml.kernel.org/r/20190423034959.13525-3-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Tested-by: Mark Rutland Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Borislav Petkov Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/cpufeature.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f210bcf096f7..bc895c869892 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -401,7 +401,7 @@ unsigned long cpu_get_elf_hwcap2(void); #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) /* System capability check for constant caps */ -static inline bool __cpus_have_const_cap(int num) +static __always_inline bool __cpus_have_const_cap(int num) { if (num >= ARM64_NCAPS) return false; @@ -415,7 +415,7 @@ static inline bool cpus_have_cap(unsigned int num) return test_bit(num, cpu_hwcaps); } -static inline bool cpus_have_const_cap(int num) +static __always_inline bool cpus_have_const_cap(int num) { if (static_branch_likely(&arm64_const_caps_ready)) return __cpus_have_const_cap(num); -- cgit v1.2.3-59-g8ed1b From 1221a5854d4375b8b9fdf1340e6e2679b869ccd0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:41:56 -0700 Subject: MIPS: mark mult_sh_align_mod() as __always_inline This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common place. We need to eliminate potential issues beforehand. If it is enabled for mips, the following error is reported: arch/mips/kernel/cpu-bugs64.c: In function 'mult_sh_align_mod.constprop': arch/mips/kernel/cpu-bugs64.c:33:2: error: asm operand 1 probably doesn't match constraints [-Werror] asm volatile( ^~~ arch/mips/kernel/cpu-bugs64.c:33:2: error: asm operand 1 probably doesn't match constraints [-Werror] asm volatile( ^~~ arch/mips/kernel/cpu-bugs64.c:33:2: error: impossible constraint in 'asm' asm volatile( ^~~ arch/mips/kernel/cpu-bugs64.c:33:2: error: impossible constraint in 'asm' asm volatile( ^~~ Link: http://lkml.kernel.org/r/20190423034959.13525-4-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Borislav Petkov Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/cpu-bugs64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/cpu-bugs64.c b/arch/mips/kernel/cpu-bugs64.c index bada74af7641..c04b97aace4a 100644 --- a/arch/mips/kernel/cpu-bugs64.c +++ b/arch/mips/kernel/cpu-bugs64.c @@ -42,8 +42,8 @@ static inline void align_mod(const int align, const int mod) : "n"(align), "n"(mod)); } -static inline void mult_sh_align_mod(long *v1, long *v2, long *w, - const int align, const int mod) +static __always_inline void mult_sh_align_mod(long *v1, long *v2, long *w, + const int align, const int mod) { unsigned long flags; int m1, m2; -- cgit v1.2.3-59-g8ed1b From e60fb8bf68d40ec32f8ea34950623ea570c9090e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:41:59 -0700 Subject: s390/cpacf: mark scpacf_query() as __always_inline This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common place. We need to eliminate potential issues beforehand. If it is enabled for s390, the following error is reported: In file included from arch/s390/crypto/des_s390.c:19: arch/s390/include/asm/cpacf.h: In function 'cpacf_query': arch/s390/include/asm/cpacf.h:170:2: warning: asm operand 3 probably doesn't match constraints asm volatile( ^~~ arch/s390/include/asm/cpacf.h:170:2: error: impossible constraint in 'asm' Link: http://lkml.kernel.org/r/20190423034959.13525-5-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Borislav Petkov Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/cpacf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index 3cc52e37b4b2..f316de40e51b 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -202,7 +202,7 @@ static inline int __cpacf_check_opcode(unsigned int opcode) } } -static inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +static __always_inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask) { if (__cpacf_check_opcode(opcode)) { __cpacf_query(opcode, mask); -- cgit v1.2.3-59-g8ed1b From e9ea596c2c6dedf9438176f4e844f1c8b68a35ac Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:42:07 -0700 Subject: MIPS: mark __fls() and __ffs() as __always_inline This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common place. We need to eliminate potential issues beforehand. 
If it is enabled for mips, the following errors are reported: arch/mips/mm/sc-mips.o: In function `mips_sc_prefetch_enable.part.2': sc-mips.c:(.text+0x98): undefined reference to `mips_gcr_base' sc-mips.c:(.text+0x9c): undefined reference to `mips_gcr_base' sc-mips.c:(.text+0xbc): undefined reference to `mips_gcr_base' sc-mips.c:(.text+0xc8): undefined reference to `mips_gcr_base' sc-mips.c:(.text+0xdc): undefined reference to `mips_gcr_base' arch/mips/mm/sc-mips.o:sc-mips.c:(.text.unlikely+0x44): more undefined references to `mips_gcr_base' Link: http://lkml.kernel.org/r/20190423034959.13525-7-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Borislav Petkov Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/bitops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index 830c93a010c3..9a466dde9b96 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -482,7 +482,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long * * Return the bit position (0..63) of the most significant 1 bit in a word * Returns -1 if no 1 bit exists */ -static inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long __fls(unsigned long word) { int num; @@ -548,7 +548,7 @@ static inline unsigned long __fls(unsigned long word) * Returns 0..SZLONG-1 * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long __ffs(unsigned long word) { return __fls(word & -word); } -- cgit v1.2.3-59-g8ed1b From 2e0168a714586fb0fa600157e837e90569cbd8ec Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:42:10 -0700 Subject: ARM: mark setup_machine_tags() stub as __init __noreturn This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common place. We need to eliminate potential issues beforehand. If it is enabled for arm, Clang build results in the following modpost warning: WARNING: vmlinux.o(.text+0x1124): Section mismatch in reference from the function setup_machine_tags() to the function .init.text:early_print() The function setup_machine_tags() references the function __init early_print(). This is often because setup_machine_tags lacks a __init annotation or the annotation of early_print is wrong. Link: http://lkml.kernel.org/r/20190423034959.13525-8-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Borislav Petkov Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/atags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/kernel/atags.h b/arch/arm/kernel/atags.h index 201100226301..067e12edc341 100644 --- a/arch/arm/kernel/atags.h +++ b/arch/arm/kernel/atags.h @@ -5,7 +5,7 @@ void convert_to_tag_list(struct tag *tags); const struct machine_desc *setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr); #else -static inline const struct machine_desc * +static inline const struct machine_desc * __init __noreturn setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr) { early_print("no ATAGS support: can't continue\n"); -- cgit v1.2.3-59-g8ed1b From 480795a095343c9aaab49cc4d499c41a966be164 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:42:14 -0700 Subject: powerpc/prom_init: mark prom_getprop() and prom_getproplen() as __init This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common place. We need to eliminate potential issues beforehand. If it is enabled for powerpc, the following modpost warnings are reported: WARNING: vmlinux.o(.text.unlikely+0x20): Section mismatch in reference from the function .prom_getprop() to the function .init.text:.call_prom() The function .prom_getprop() references the function __init .call_prom(). This is often because .prom_getprop lacks a __init annotation or the annotation of .call_prom is wrong. WARNING: vmlinux.o(.text.unlikely+0x3c): Section mismatch in reference from the function .prom_getproplen() to the function .init.text:.call_prom() The function .prom_getproplen() references the function __init .call_prom(). This is often because .prom_getproplen lacks a __init annotation or the annotation of .call_prom is wrong. Link: http://lkml.kernel.org/r/20190423034959.13525-9-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Borislav Petkov Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/prom_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 523bb99d7676..00682b8df330 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -628,14 +628,14 @@ static int __init prom_next_node(phandle *nodep) } } -static inline int prom_getprop(phandle node, const char *pname, - void *value, size_t valuelen) +static inline int __init prom_getprop(phandle node, const char *pname, + void *value, size_t valuelen) { return call_prom("getprop", 4, 1, node, ADDR(pname), (u32)(unsigned long) value, (u32) valuelen); } -static inline int prom_getproplen(phandle node, const char *pname) +static inline int __init prom_getproplen(phandle node, const char *pname) { return call_prom("getproplen", 2, 1, node, ADDR(pname)); } -- cgit v1.2.3-59-g8ed1b From e12d6d7d46a62ccf62f4c60eb6ea6f0f121797c7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:42:17 -0700 Subject: powerpc/mm/radix: mark __radix__flush_tlb_range_psize() as __always_inline This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common place. We need to eliminate potential issues beforehand. If it is enabled for powerpc, the following error is reported: arch/powerpc/mm/tlb-radix.c: In function '__radix__flush_tlb_range_psize': arch/powerpc/mm/tlb-radix.c:104:2: error: asm operand 3 probably doesn't match constraints [-Werror] asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) ^~~ arch/powerpc/mm/tlb-radix.c:104:2: error: impossible constraint in 'asm' Link: http://lkml.kernel.org/r/20190423034959.13525-10-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Borislav Petkov Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/radix_tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 6a23b9ebd2a1..a2b2848f0ae3 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -928,7 +928,7 @@ void radix__tlb_flush(struct mmu_gather *tlb) tlb->need_flush_all = 0; } -static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, +static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize, bool also_pwc) { -- cgit v1.2.3-59-g8ed1b From efc344c57e39754416731e24e1eadd7d9b9f335e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:42:21 -0700 Subject: powerpc/mm/radix: mark as __tlbie_pid() and friends as__always_inline This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common place. We need to eliminate potential issues beforehand. 
If it is enabled for powerpc, the following errors are reported: arch/powerpc/mm/tlb-radix.c: In function '__tlbie_lpid': arch/powerpc/mm/tlb-radix.c:148:2: warning: asm operand 3 probably doesn't match constraints asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) ^~~ arch/powerpc/mm/tlb-radix.c:148:2: error: impossible constraint in 'asm' arch/powerpc/mm/tlb-radix.c: In function '__tlbie_pid': arch/powerpc/mm/tlb-radix.c:118:2: warning: asm operand 3 probably doesn't match constraints asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) ^~~ arch/powerpc/mm/tlb-radix.c: In function '__tlbiel_pid': arch/powerpc/mm/tlb-radix.c:104:2: warning: asm operand 3 probably doesn't match constraints asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) ^~~ Link: http://lkml.kernel.org/r/20190423034959.13525-11-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Borislav Petkov Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index a2b2848f0ae3..4d841369399f 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -90,7 +90,7 @@ void radix__tlbiel_all(unsigned int action) asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } -static inline void __tlbiel_pid(unsigned long pid, int set, +static __always_inline void __tlbiel_pid(unsigned long pid, int set, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -106,7 +106,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set, trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -120,7 +120,7 @@ static inline void __tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void __tlbiel_lpid(unsigned long lpid, int set, +static __always_inline void __tlbiel_lpid(unsigned long lpid, int set, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -136,7 +136,7 @@ static inline void __tlbiel_lpid(unsigned long lpid, int set, trace_tlbie(lpid, 1, rb, rs, ric, prs, r); } -static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) +static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; -- cgit v1.2.3-59-g8ed1b From 9012d011660ea5cf2a623e1de207a2bc0ca6936d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:42:25 -0700 Subject: compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING Commit 60a3cdd06394 ("x86: add optimized inlining") introduced CONFIG_OPTIMIZE_INLINING, but it has been available only for x86. The idea is obviously arch-agnostic. This commit moves the config entry from arch/x86/Kconfig.debug to lib/Kconfig.debug so that all architectures can benefit from it. This can make a huge difference in kernel image size especially when CONFIG_OPTIMIZE_FOR_SIZE is enabled. For example, I got 3.5% smaller arm64 kernel for v5.1-rc1. 
dec file 18983424 arch/arm64/boot/Image.before 18321920 arch/arm64/boot/Image.after This also slightly improves the "Kernel hacking" Kconfig menu as e61aca5158a8 ("Merge branch 'kconfig-diet' from Dave Hansen') suggested; this config option would be a good fit in the "compiler option" menu. Link: http://lkml.kernel.org/r/20190423034959.13525-12-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Borislav Petkov Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 3 --- arch/x86/Kconfig.debug | 14 -------------- include/linux/compiler_types.h | 3 +-- lib/Kconfig.debug | 14 ++++++++++++++ 4 files changed, 15 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 818b361094ed..326b2d5bab9d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -305,9 +305,6 @@ config ZONE_DMA32 config AUDIT_ARCH def_bool y if X86_64 -config ARCH_SUPPORTS_OPTIMIZED_INLINING - def_bool y - config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 15d0fbe27872..f730680dc818 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -266,20 +266,6 @@ config CPA_DEBUG ---help--- Do change_page_attr() self-tests every 30 seconds. -config OPTIMIZE_INLINING - bool "Allow gcc to uninline functions marked 'inline'" - ---help--- - This option determines if the kernel forces gcc to inline the functions - developers have marked 'inline'. Doing so takes away freedom from gcc to - do what it thinks is best, which is desirable for the gcc 3.x series of - compilers. The gcc 4.x series have a rewritten inlining algorithm and - enabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc 4.x and above to make the - decision will become the default in the future. Until then this option - is there to test gcc for this. - - If unsure, say N. - config DEBUG_ENTRY bool "Debug low-level entry code" depends on DEBUG_KERNEL diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index ba814f18cb4c..19e58b9138a0 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -140,8 +140,7 @@ struct ftrace_likely_data { * Do not use __always_inline here, since currently it expands to inline again * (which would break users of __always_inline). */ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ - !defined(CONFIG_OPTIMIZE_INLINING) +#if !defined(CONFIG_OPTIMIZE_INLINING) #define inline inline __attribute__((__always_inline__)) __gnu_inline \ __maybe_unused notrace #else diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d695ec1477f3..d5411a7484f6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -318,6 +318,20 @@ config HEADERS_CHECK exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in your build tree), to make sure they're suitable. +config OPTIMIZE_INLINING + bool "Allow compiler to uninline functions marked 'inline'" + help + This option determines if the kernel forces gcc to inline the functions + developers have marked 'inline'. 
Doing so takes away freedom from gcc to + do what it thinks is best, which is desirable for the gcc 3.x series of + compilers. The gcc 4.x series have a rewritten inlining algorithm and + enabling this option will generate a smaller kernel there. Hopefully + this algorithm is so good that allowing gcc 4.x and above to make the + decision will become the default in the future. Until then this option + is there to test gcc for this. + + If unsure, say N. + config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" help -- cgit v1.2.3-59-g8ed1b From efb463cc16555450d5e3ec7adc4a19b416ae0f21 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Tue, 14 May 2019 15:44:04 -0700 Subject: powerpc: replace CONFIG_DEBUG_KERNEL with CONFIG_DEBUG_MISC CONFIG_DEBUG_KERNEL should not impact code generation. Use the newly defined CONFIG_DEBUG_MISC instead to keep the current code. Link: http://lkml.kernel.org/r/20190413224438.10802-3-okaya@kernel.org Signed-off-by: Sinan Kaya Reviewed-by: Josh Triplett Reviewed-by: Kees Cook Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Christophe Leroy Cc: Anders Roxell Cc: Chris Zankel Cc: "David S. Miller" Cc: Florian Westphal Cc: Greg Kroah-Hartman Cc: James Hogan Cc: Jozsef Kadlecsik Cc: Max Filippov Cc: Michal Hocko Cc: Mike Rapoport Cc: Pablo Neira Ayuso Cc: Paul Burton Cc: Ralf Baechle Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/sysfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e8e93c2c7d03..7a1708875d27 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -610,7 +610,7 @@ SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2); SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3); SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4); SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5); -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC SYSFS_SPRSETUP(hid0, SPRN_HID0); SYSFS_SPRSETUP(hid1, SPRN_HID1); SYSFS_SPRSETUP(hid4, SPRN_HID4); @@ -639,7 +639,7 @@ SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0); SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1); SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2); SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3); -#endif /* CONFIG_DEBUG_KERNEL */ +#endif /* CONFIG_DEBUG_MISC */ #endif /* HAS_PPC_PMC_PA6T */ #ifdef HAS_PPC_PMC_IBM @@ -680,7 +680,7 @@ static struct device_attribute pa6t_attrs[] = { __ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3), __ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4), __ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5), -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC __ATTR(hid0, 0600, show_hid0, store_hid0), __ATTR(hid1, 0600, show_hid1, store_hid1), __ATTR(hid4, 0600, show_hid4, store_hid4), @@ -709,7 +709,7 @@ static struct device_attribute pa6t_attrs[] = { __ATTR(tsr1, 0600, show_tsr1, store_tsr1), __ATTR(tsr2, 0600, show_tsr2, store_tsr2), __ATTR(tsr3, 0600, show_tsr3, store_tsr3), -#endif /* CONFIG_DEBUG_KERNEL */ +#endif /* CONFIG_DEBUG_MISC */ }; #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ -- cgit v1.2.3-59-g8ed1b From 900f492836df432f1b5a9078a5e4838583643d93 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Tue, 14 May 2019 15:44:07 -0700 Subject: xtensa: replace CONFIG_DEBUG_KERNEL with CONFIG_DEBUG_MISC CONFIG_DEBUG_KERNEL should not impact code generation. Use the newly defined CONFIG_DEBUG_MISC instead to keep the current code. 
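The conversion pattern is the same as in the powerpc patch above, sketched here with an invented helper (the real sites are arch/xtensa/include/asm/irqflags.h and arch/xtensa/kernel/smp.c):

  static void report_secondary_boot(int cpu)
  {
  #ifdef CONFIG_DEBUG_MISC	/* was: CONFIG_DEBUG_KERNEL */
  	pr_debug("secondary cpu %d waiting to boot\n", cpu);
  #endif
  }

CONFIG_DEBUG_KERNEL only unlocks the debugging menu in Kconfig, so toggling it must not change the compiled kernel; ifdefs that do change generated code now key off the dedicated CONFIG_DEBUG_MISC option.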
Link: http://lkml.kernel.org/r/20190413224438.10802-5-okaya@kernel.org
Signed-off-by: Sinan Kaya
Reviewed-by: Josh Triplett
Reviewed-by: Kees Cook
Acked-by: Max Filippov
Cc: Chris Zankel
Cc: Anders Roxell
Cc: Benjamin Herrenschmidt
Cc: Christophe Leroy
Cc: "David S. Miller"
Cc: Florian Westphal
Cc: Greg Kroah-Hartman
Cc: James Hogan
Cc: Jozsef Kadlecsik
Cc: Michael Ellerman
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Pablo Neira Ayuso
Cc: Paul Burton
Cc: Paul Mackerras
Cc: Ralf Baechle
Cc: Thomas Bogendoerfer
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/xtensa/include/asm/irqflags.h | 2 +-
 arch/xtensa/kernel/smp.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/xtensa/include/asm/irqflags.h b/arch/xtensa/include/asm/irqflags.h
index 9b5e8526afe5..12890681587b 100644
--- a/arch/xtensa/include/asm/irqflags.h
+++ b/arch/xtensa/include/asm/irqflags.h
@@ -27,7 +27,7 @@ static inline unsigned long arch_local_irq_save(void)
 {
 	unsigned long flags;
 #if XTENSA_FAKE_NMI
-#if defined(CONFIG_DEBUG_KERNEL) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL
+#if defined(CONFIG_DEBUG_MISC) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL
 	unsigned long tmp;

 	asm volatile("rsr %0, ps\t\n"
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 3699d6d3e479..83b244ce61ee 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -126,7 +126,7 @@ void secondary_start_kernel(void)

 	init_mmu();

-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
 	if (boot_secondary_processors == 0) {
 		pr_debug("%s: boot_secondary_processors:%d; Hanging cpu:%d\n",
 			__func__, boot_secondary_processors, cpu);
--
cgit v1.2.3-59-g8ed1b


From 87dfb311b707cd4c4b666c9af0fa15acbe6eee99 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada
Date: Tue, 14 May 2019 15:46:51 -0700
Subject: treewide: replace #include <asm/sizes.h> with #include <linux/sizes.h>

Since commit dccd2304cc90 ("ARM: 7430/1: sizes.h: move from asm-generic
to <linux/sizes.h>"), <asm/sizes.h> and <asm-generic/sizes.h> are just
wrappers of <linux/sizes.h>. This commit replaces all <asm/sizes.h> and
<asm-generic/sizes.h> to prepare for the removal.
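At a usage site the replacement is mechanical; a small editorial sketch (the SZ_* values below are the ones defined in include/linux/sizes.h, while the EXAMPLE_SRAM_* names are made up for illustration):

#include <linux/sizes.h>	/* included directly, not via <asm/sizes.h> */

/* SZ_* are plain size constants: SZ_4K == 0x00001000, SZ_256K == 0x00040000. */
#define EXAMPLE_SRAM_BASE	0x40000000	/* hypothetical base address */
#define EXAMPLE_SRAM_SIZE	SZ_256K		/* same pattern as TEGRA_IRAM_SIZE in the diff below */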
Link: http://lkml.kernel.org/r/1553267665-27228-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/common/sa1111.c | 2 +- arch/arm/mach-imx/devices/platform-fec.c | 2 +- arch/arm/mach-imx/devices/platform-gpio_keys.c | 2 +- arch/arm/mach-imx/devices/platform-imx2-wdt.c | 2 +- arch/arm/mach-imx/devices/platform-mxc_nand.c | 2 +- arch/arm/mach-imx/hardware.h | 2 +- arch/arm/mach-integrator/impd1.c | 2 +- arch/arm/mach-iop13xx/pci.c | 2 +- arch/arm/mach-iop13xx/tpmi.c | 2 +- arch/arm/mach-ixp4xx/common-pci.c | 2 +- arch/arm/mach-ks8695/include/mach/hardware.h | 2 +- arch/arm/mach-omap1/include/mach/hardware.h | 2 +- arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c | 2 +- arch/arm/mach-prima2/common.c | 2 +- arch/arm/mach-pxa/balloon3.c | 2 +- arch/arm/mach-pxa/colibri-pxa270.c | 2 +- arch/arm/mach-pxa/colibri-pxa300.c | 2 +- arch/arm/mach-pxa/colibri-pxa320.c | 2 +- arch/arm/mach-pxa/colibri-pxa3xx.c | 2 +- arch/arm/mach-pxa/gumstix.c | 2 +- arch/arm/mach-pxa/lpd270.c | 2 +- arch/arm/mach-pxa/lubbock.c | 2 +- arch/arm/mach-pxa/mainstone.c | 2 +- arch/arm/mach-pxa/trizeps4.c | 2 +- arch/arm/mach-pxa/viper.c | 2 +- arch/arm/mach-s3c24xx/include/mach/hardware.h | 2 +- arch/arm/mach-sa1100/include/mach/memory.h | 2 +- arch/arm/mach-sa1100/neponset.c | 2 +- arch/arm/mach-tegra/iomap.h | 2 +- arch/arm/mach-tegra/irammap.h | 2 +- arch/arm/mach-w90x900/include/mach/hardware.h | 2 +- arch/arm64/include/asm/boot.h | 2 +- arch/arm64/include/asm/memory.h | 2 +- arch/arm64/mm/init.c | 2 +- arch/arm64/mm/mmu.c | 2 +- arch/nds32/include/asm/pgtable.h | 2 +- arch/nds32/kernel/head.S | 2 +- arch/sh/boards/board-apsh4a3a.c | 2 +- arch/sh/boards/board-apsh4ad0a.c | 2 +- arch/sh/boards/board-edosk7705.c | 2 +- arch/sh/boards/board-edosk7760.c | 2 +- arch/sh/boards/board-espt.c | 2 +- arch/sh/boards/board-urquell.c | 2 +- arch/sh/boards/mach-microdev/setup.c | 2 +- arch/sh/boards/mach-sdk7786/fpga.c | 2 +- arch/sh/boards/mach-sdk7786/setup.c | 2 +- arch/sh/boards/mach-sdk7786/sram.c | 2 +- arch/sh/boards/mach-se/7343/irq.c | 2 +- arch/sh/boards/mach-se/7722/irq.c | 2 +- arch/sh/drivers/pci/pci-sh7751.c | 2 +- arch/sh/drivers/pci/pci-sh7780.c | 2 +- arch/sh/drivers/pci/pcie-sh7786.c | 2 +- arch/sh/mm/init.c | 2 +- arch/sh/mm/pmb.c | 2 +- arch/sh/mm/uncached.c | 2 +- arch/unicore32/include/asm/memory.h | 2 +- arch/unicore32/mm/init.c | 2 +- arch/unicore32/mm/ioremap.c | 2 +- arch/unicore32/mm/mmu.c | 2 +- arch/x86/events/intel/bts.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 2 +- drivers/iommu/msm_iommu.c | 2 +- drivers/mmc/host/mvsdio.c | 2 +- drivers/mmc/host/pxamci.c | 2 +- drivers/mtd/maps/sa1100-flash.c | 2 +- drivers/pcmcia/omap_cf.c | 2 +- drivers/sh/intc/userimask.c | 2 +- drivers/video/fbdev/fb-puv3.c | 2 +- 68 files changed, 68 insertions(+), 68 deletions(-) (limited to 'arch') diff --git a/arch/arm/common/sa1111.c b/arch/arm/common/sa1111.c index 45412d21aa6b..179ca8757a74 100644 --- a/arch/arm/common/sa1111.c +++ b/arch/arm/common/sa1111.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include diff --git a/arch/arm/mach-imx/devices/platform-fec.c b/arch/arm/mach-imx/devices/platform-fec.c index b403a4fe2892..605c0af5851d 100644 --- a/arch/arm/mach-imx/devices/platform-fec.c +++ b/arch/arm/mach-imx/devices/platform-fec.c @@ -7,7 +7,7 @@ * Free Software Foundation. 
*/ #include -#include +#include #include "../hardware.h" #include "devices-common.h" diff --git a/arch/arm/mach-imx/devices/platform-gpio_keys.c b/arch/arm/mach-imx/devices/platform-gpio_keys.c index 486282539c76..9f0a132ea1bc 100644 --- a/arch/arm/mach-imx/devices/platform-gpio_keys.c +++ b/arch/arm/mach-imx/devices/platform-gpio_keys.c @@ -15,7 +15,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. */ -#include +#include #include "../hardware.h" #include "devices-common.h" diff --git a/arch/arm/mach-imx/devices/platform-imx2-wdt.c b/arch/arm/mach-imx/devices/platform-imx2-wdt.c index 8c134c8d7500..0c6d3c05fd6d 100644 --- a/arch/arm/mach-imx/devices/platform-imx2-wdt.c +++ b/arch/arm/mach-imx/devices/platform-imx2-wdt.c @@ -6,7 +6,7 @@ * the terms of the GNU General Public License version 2 as published by the * Free Software Foundation. */ -#include +#include #include "../hardware.h" #include "devices-common.h" diff --git a/arch/arm/mach-imx/devices/platform-mxc_nand.c b/arch/arm/mach-imx/devices/platform-mxc_nand.c index 676df4920c7b..046e0cc826c1 100644 --- a/arch/arm/mach-imx/devices/platform-mxc_nand.c +++ b/arch/arm/mach-imx/devices/platform-mxc_nand.c @@ -6,7 +6,7 @@ * the terms of the GNU General Public License version 2 as published by the * Free Software Foundation. */ -#include +#include #include "../hardware.h" #include "devices-common.h" diff --git a/arch/arm/mach-imx/hardware.h b/arch/arm/mach-imx/hardware.h index 90e10cbd8fd1..b5ca8cebe1d6 100644 --- a/arch/arm/mach-imx/hardware.h +++ b/arch/arm/mach-imx/hardware.h @@ -24,7 +24,7 @@ #include #include #endif -#include +#include #define addr_in_module(addr, mod) \ ((unsigned long)(addr) - mod ## _BASE_ADDR < mod ## _SIZE) diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c index 8dfad012dfae..6ddbe153910a 100644 --- a/arch/arm/mach-integrator/impd1.c +++ b/arch/arm/mach-integrator/impd1.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include "lm.h" #include "impd1.h" diff --git a/arch/arm/mach-iop13xx/pci.c b/arch/arm/mach-iop13xx/pci.c index 070d92ae1b6f..8426ab9e2f5a 100644 --- a/arch/arm/mach-iop13xx/pci.c +++ b/arch/arm/mach-iop13xx/pci.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include "pci.h" diff --git a/arch/arm/mach-iop13xx/tpmi.c b/arch/arm/mach-iop13xx/tpmi.c index 116feb6b261e..d3d8c78e7d10 100644 --- a/arch/arm/mach-iop13xx/tpmi.c +++ b/arch/arm/mach-iop13xx/tpmi.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include /* assumes CONTROLLER_ONLY# is never asserted in the ESSR register */ diff --git a/arch/arm/mach-ixp4xx/common-pci.c b/arch/arm/mach-ixp4xx/common-pci.c index 6835b17113e5..a53104bb28f5 100644 --- a/arch/arm/mach-ixp4xx/common-pci.c +++ b/arch/arm/mach-ixp4xx/common-pci.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-ks8695/include/mach/hardware.h b/arch/arm/mach-ks8695/include/mach/hardware.h index 959c748ee8bb..877629b3d944 100644 --- a/arch/arm/mach-ks8695/include/mach/hardware.h +++ b/arch/arm/mach-ks8695/include/mach/hardware.h @@ -14,7 +14,7 @@ #ifndef __ASM_ARCH_HARDWARE_H #define __ASM_ARCH_HARDWARE_H -#include +#include /* * Clocks are derived from MCLK, which is 25MHz diff --git a/arch/arm/mach-omap1/include/mach/hardware.h b/arch/arm/mach-omap1/include/mach/hardware.h index 5875a5098d35..e7c8ac7d83e3 100644 --- a/arch/arm/mach-omap1/include/mach/hardware.h +++ 
b/arch/arm/mach-omap1/include/mach/hardware.h @@ -36,7 +36,7 @@ #ifndef __ASM_ARCH_OMAP_HARDWARE_H #define __ASM_ARCH_OMAP_HARDWARE_H -#include +#include #ifndef __ASSEMBLER__ #include #include diff --git a/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c b/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c index 9b30b6b471ae..e19f620c4074 100644 --- a/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c +++ b/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c @@ -11,7 +11,7 @@ * XXX handle crossbar/shared link difference for L3? * XXX these should be marked initdata for multi-OMAP kernels */ -#include +#include #include "omap_hwmod.h" #include "l3_2xxx.h" diff --git a/arch/arm/mach-prima2/common.c b/arch/arm/mach-prima2/common.c index ffe05c27087e..1607deab5290 100644 --- a/arch/arm/mach-prima2/common.c +++ b/arch/arm/mach-prima2/common.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-pxa/balloon3.c b/arch/arm/mach-pxa/balloon3.c index 4bcbd3d55b36..1f24e0259f99 100644 --- a/arch/arm/mach-pxa/balloon3.c +++ b/arch/arm/mach-pxa/balloon3.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/colibri-pxa270.c b/arch/arm/mach-pxa/colibri-pxa270.c index e68acdd0cdbb..510625dde3cb 100644 --- a/arch/arm/mach-pxa/colibri-pxa270.c +++ b/arch/arm/mach-pxa/colibri-pxa270.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include "colibri.h" diff --git a/arch/arm/mach-pxa/colibri-pxa300.c b/arch/arm/mach-pxa/colibri-pxa300.c index 6a5558d95d4e..2f635bdc797f 100644 --- a/arch/arm/mach-pxa/colibri-pxa300.c +++ b/arch/arm/mach-pxa/colibri-pxa300.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/colibri-pxa320.c b/arch/arm/mach-pxa/colibri-pxa320.c index 17067a3039a8..ffcefe6dbc82 100644 --- a/arch/arm/mach-pxa/colibri-pxa320.c +++ b/arch/arm/mach-pxa/colibri-pxa320.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/colibri-pxa3xx.c b/arch/arm/mach-pxa/colibri-pxa3xx.c index e31a591e949f..0c88e4e417b4 100644 --- a/arch/arm/mach-pxa/colibri-pxa3xx.c +++ b/arch/arm/mach-pxa/colibri-pxa3xx.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-pxa/gumstix.c b/arch/arm/mach-pxa/gumstix.c index 4764acca5480..eb03283ccdee 100644 --- a/arch/arm/mach-pxa/gumstix.c +++ b/arch/arm/mach-pxa/gumstix.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/lpd270.c b/arch/arm/mach-pxa/lpd270.c index e9f401b0a432..5c03c4f7b82e 100644 --- a/arch/arm/mach-pxa/lpd270.c +++ b/arch/arm/mach-pxa/lpd270.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c index c1bd0d544981..825939877839 100644 --- a/arch/arm/mach-pxa/lubbock.c +++ b/arch/arm/mach-pxa/lubbock.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index d6e17d407ac0..b3f8592eebe6 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/trizeps4.c b/arch/arm/mach-pxa/trizeps4.c index c76f1daecfc9..99a2ee433f1f 100644 --- 
a/arch/arm/mach-pxa/trizeps4.c +++ b/arch/arm/mach-pxa/trizeps4.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/viper.c b/arch/arm/mach-pxa/viper.c index ab2f89266bbd..c4c25a2f24f6 100644 --- a/arch/arm/mach-pxa/viper.c +++ b/arch/arm/mach-pxa/viper.c @@ -58,7 +58,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/include/mach/hardware.h b/arch/arm/mach-s3c24xx/include/mach/hardware.h index 1b2975708e3f..f28ac6c78d82 100644 --- a/arch/arm/mach-s3c24xx/include/mach/hardware.h +++ b/arch/arm/mach-s3c24xx/include/mach/hardware.h @@ -15,7 +15,7 @@ extern unsigned int s3c2410_modify_misccr(unsigned int clr, unsigned int chg); #endif /* __ASSEMBLY__ */ -#include +#include #include #endif /* __ASM_ARCH_HARDWARE_H */ diff --git a/arch/arm/mach-sa1100/include/mach/memory.h b/arch/arm/mach-sa1100/include/mach/memory.h index fa5cf4744992..3b19296f5062 100644 --- a/arch/arm/mach-sa1100/include/mach/memory.h +++ b/arch/arm/mach-sa1100/include/mach/memory.h @@ -8,7 +8,7 @@ #ifndef __ASM_ARCH_MEMORY_H #define __ASM_ARCH_MEMORY_H -#include +#include /* * Because of the wide memory address space between physical RAM banks on the diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c index eb60a71cf125..a671e4c994cf 100644 --- a/arch/arm/mach-sa1100/neponset.c +++ b/arch/arm/mach-sa1100/neponset.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-tegra/iomap.h b/arch/arm/mach-tegra/iomap.h index 9bc291e76887..4af9e92a216f 100644 --- a/arch/arm/mach-tegra/iomap.h +++ b/arch/arm/mach-tegra/iomap.h @@ -20,7 +20,7 @@ #define __MACH_TEGRA_IOMAP_H #include -#include +#include #define TEGRA_IRAM_BASE 0x40000000 #define TEGRA_IRAM_SIZE SZ_256K diff --git a/arch/arm/mach-tegra/irammap.h b/arch/arm/mach-tegra/irammap.h index e32e1742c9a1..6a7bb887585e 100644 --- a/arch/arm/mach-tegra/irammap.h +++ b/arch/arm/mach-tegra/irammap.h @@ -17,7 +17,7 @@ #ifndef __MACH_TEGRA_IRAMMAP_H #define __MACH_TEGRA_IRAMMAP_H -#include +#include /* The first 1K of IRAM is permanently reserved for the CPU reset handler */ #define TEGRA_IRAM_RESET_HANDLER_OFFSET 0 diff --git a/arch/arm/mach-w90x900/include/mach/hardware.h b/arch/arm/mach-w90x900/include/mach/hardware.h index fe3c6265a466..2e6555df538e 100644 --- a/arch/arm/mach-w90x900/include/mach/hardware.h +++ b/arch/arm/mach-w90x900/include/mach/hardware.h @@ -18,7 +18,7 @@ #ifndef __ASM_ARCH_HARDWARE_H #define __ASM_ARCH_HARDWARE_H -#include +#include #include #endif /* __ASM_ARCH_HARDWARE_H */ diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h index 355e552a9175..c7f67da13cd9 100644 --- a/arch/arm64/include/asm/boot.h +++ b/arch/arm64/include/asm/boot.h @@ -3,7 +3,7 @@ #ifndef __ASM_BOOT_H #define __ASM_BOOT_H -#include +#include /* * arm64 requires the DTB to be 8 byte aligned and diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 2cb8248fa2c8..8ffcf5a512bb 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include /* * Size of the PCI I/O space. 
This must remain a power of two so that diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 007c05a4cce0..d2adffb81b5d 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ef32d4839c3f..a170c6369a68 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h index 9f52db930c00..ee59c1f9e4fc 100644 --- a/arch/nds32/include/asm/pgtable.h +++ b/arch/nds32/include/asm/pgtable.h @@ -6,7 +6,7 @@ #define __PAGETABLE_PMD_FOLDED 1 #include -#include +#include #include #include diff --git a/arch/nds32/kernel/head.S b/arch/nds32/kernel/head.S index db64b78b1232..fcefb62606ca 100644 --- a/arch/nds32/kernel/head.S +++ b/arch/nds32/kernel/head.S @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_CPU_BIG_ENDIAN diff --git a/arch/sh/boards/board-apsh4a3a.c b/arch/sh/boards/board-apsh4a3a.c index 346eda7a2ef6..abf19a947df3 100644 --- a/arch/sh/boards/board-apsh4a3a.c +++ b/arch/sh/boards/board-apsh4a3a.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include static struct mtd_partition nor_flash_partitions[] = { diff --git a/arch/sh/boards/board-apsh4ad0a.c b/arch/sh/boards/board-apsh4ad0a.c index 4efa9c571f64..fa031a16c9b5 100644 --- a/arch/sh/boards/board-apsh4ad0a.c +++ b/arch/sh/boards/board-apsh4ad0a.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include /* Dummy supplies, where voltage doesn't matter */ static struct regulator_consumer_supply dummy_supplies[] = { diff --git a/arch/sh/boards/board-edosk7705.c b/arch/sh/boards/board-edosk7705.c index 67a8803eb3f9..0de7d603da2d 100644 --- a/arch/sh/boards/board-edosk7705.c +++ b/arch/sh/boards/board-edosk7705.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #define SMC_IOBASE 0xA2000000 #define SMC_IO_OFFSET 0x300 diff --git a/arch/sh/boards/board-edosk7760.c b/arch/sh/boards/board-edosk7760.c index 0fbe91cba67a..7569d85c5ff5 100644 --- a/arch/sh/boards/board-edosk7760.c +++ b/arch/sh/boards/board-edosk7760.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* Bus state controller registers for CS4 area */ #define BSC_CS4BCR 0xA4FD0010 diff --git a/arch/sh/boards/board-espt.c b/arch/sh/boards/board-espt.c index f478fee3b48a..6e784b5cf5a0 100644 --- a/arch/sh/boards/board-espt.c +++ b/arch/sh/boards/board-espt.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include /* NOR Flash */ static struct mtd_partition espt_nor_flash_partitions[] = { diff --git a/arch/sh/boards/board-urquell.c b/arch/sh/boards/board-urquell.c index 799af57c0b81..dad2b3b40735 100644 --- a/arch/sh/boards/board-urquell.c +++ b/arch/sh/boards/board-urquell.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/sh/boards/mach-microdev/setup.c b/arch/sh/boards/mach-microdev/setup.c index 706b48f797be..f4a777fe2d01 100644 --- a/arch/sh/boards/mach-microdev/setup.c +++ b/arch/sh/boards/mach-microdev/setup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include static struct resource smc91x_resources[] = { [0] = { diff --git a/arch/sh/boards/mach-sdk7786/fpga.c b/arch/sh/boards/mach-sdk7786/fpga.c index 6d2a3d381c2a..895576ff8376 100644 --- a/arch/sh/boards/mach-sdk7786/fpga.c +++ 
b/arch/sh/boards/mach-sdk7786/fpga.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #define FPGA_REGS_OFFSET 0x03fff800 #define FPGA_REGS_SIZE 0x490 diff --git a/arch/sh/boards/mach-sdk7786/setup.c b/arch/sh/boards/mach-sdk7786/setup.c index 65721c3a482c..d183026dbeb1 100644 --- a/arch/sh/boards/mach-sdk7786/setup.c +++ b/arch/sh/boards/mach-sdk7786/setup.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/boards/mach-sdk7786/sram.c b/arch/sh/boards/mach-sdk7786/sram.c index d76cdb7ede39..7c6ca976f332 100644 --- a/arch/sh/boards/mach-sdk7786/sram.c +++ b/arch/sh/boards/mach-sdk7786/sram.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include static int __init fpga_sram_init(void) { diff --git a/arch/sh/boards/mach-se/7343/irq.c b/arch/sh/boards/mach-se/7343/irq.c index 39a3175e72b2..1aedbfe32654 100644 --- a/arch/sh/boards/mach-se/7343/irq.c +++ b/arch/sh/boards/mach-se/7343/irq.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #define PA_CPLD_BASE_ADDR 0x11400000 diff --git a/arch/sh/boards/mach-se/7722/irq.c b/arch/sh/boards/mach-se/7722/irq.c index f6e3009edd4e..6d34592767f8 100644 --- a/arch/sh/boards/mach-se/7722/irq.c +++ b/arch/sh/boards/mach-se/7722/irq.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #define IRQ01_BASE_ADDR 0x11800000 diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 1b9e5caac389..11ed21c2e9bb 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -14,7 +14,7 @@ #include #include "pci-sh4.h" #include -#include +#include static int __init __area_sdram_check(struct pci_channel *chan, unsigned int area) diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 3fd0f392a0ee..287b3a68570c 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -16,7 +16,7 @@ #include #include "pci-sh4.h" #include -#include +#include #if defined(CONFIG_CPU_BIG_ENDIAN) # define PCICR_ENDIANNESS SH4_PCICR_BSWP diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c index a58b77cea295..e0b568aaa701 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.c +++ b/arch/sh/drivers/pci/pcie-sh7786.c @@ -18,7 +18,7 @@ #include #include #include "pcie-sh7786.h" -#include +#include struct sh7786_pcie_port { struct pci_channel *hose; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index b95e343e3c9d..5aeb4d7099a1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 7b2cc490ebb7..a53a040d0054 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/mm/uncached.c b/arch/sh/mm/uncached.c index 010010bf205a..bd1585e8efed 100644 --- a/arch/sh/mm/uncached.c +++ b/arch/sh/mm/uncached.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include +#include #include #include diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h index 66bb9f6525c0..46cf27efbb7e 100644 --- a/arch/unicore32/include/asm/memory.h +++ b/arch/unicore32/include/asm/memory.h @@ -16,7 +16,7 @@ #include #include -#include +#include #include /* diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 
b4442f3060ce..c994cdf14119 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -23,7 +23,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
 #include
diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c
index bf012b2b71a9..b69cb18ce8b1 100644
--- a/arch/unicore32/mm/ioremap.c
+++ b/arch/unicore32/mm/ioremap.c
@@ -34,7 +34,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include "mm.h"
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
index aa2060beb408..f0ae623b305f 100644
--- a/arch/unicore32/mm/mmu.c
+++ b/arch/unicore32/mm/mmu.c
@@ -22,7 +22,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 7cdd7b13bbda..890a3fb5706f 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -23,7 +23,7 @@
 #include
 #include
-#include
+#include
 #include
 #include "../perf_event.h"
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index eb33d2d00d77..e20e6b429804 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -33,7 +33,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 9fb0eb7a4d02..b4b87d6ae67f 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -34,7 +34,7 @@
 #include
 #include
-#include
+#include
 #include "msm_iommu_hw-8xxx.h"
 #include "msm_iommu.h"
diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c
index e22bbff89c8d..9cb93e15b197 100644
--- a/drivers/mmc/host/mvsdio.c
+++ b/drivers/mmc/host/mvsdio.c
@@ -24,7 +24,7 @@
 #include
 #include
-#include
+#include
 #include
 #include "mvsdio.h"
diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c
index c1d3f0e38921..e7d80c83da2c 100644
--- a/drivers/mmc/host/pxamci.c
+++ b/drivers/mmc/host/pxamci.c
@@ -35,7 +35,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c
index fd5fe12d7461..893239629d6b 100644
--- a/drivers/mtd/maps/sa1100-flash.c
+++ b/drivers/mtd/maps/sa1100-flash.c
@@ -20,7 +20,7 @@
 #include
 #include
-#include
+#include
 #include

 struct sa_subdev_info {
diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index c2a17a79f0b2..267fb875e40f 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -22,7 +22,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/drivers/sh/intc/userimask.c b/drivers/sh/intc/userimask.c
index e649ceaaa410..87d69e7471f9 100644
--- a/drivers/sh/intc/userimask.c
+++ b/drivers/sh/intc/userimask.c
@@ -14,7 +14,7 @@
 #include
 #include
 #include
-#include
+#include
 #include "internals.h"

 static void __iomem *uimask;
diff --git a/drivers/video/fbdev/fb-puv3.c b/drivers/video/fbdev/fb-puv3.c
index d9e816d53531..1bddcc20b2c0 100644
--- a/drivers/video/fbdev/fb-puv3.c
+++ b/drivers/video/fbdev/fb-puv3.c
@@ -20,7 +20,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
--
cgit v1.2.3-59-g8ed1b


From b09e89366e1730ac13088706cec0f1335299eefb Mon Sep 17 00:00:00 2001
From: Masahiro Yamada
Date: Tue, 14 May 2019 15:46:54 -0700
Subject: arch: remove <asm/sizes.h> and <asm-generic/sizes.h>

Now that all instances of #include <asm/sizes.h> have been replaced
with #include <linux/sizes.h>, we can remove these.
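For reference, the indirection being removed amounted to two levels of one-line forwarding headers; a reconstructed sketch follows (the asm-generic file's exact contents appear in the deletion hunk below, and the generated per-arch wrapper is how "generic-y" entries are materialized by Kbuild):

/* arch/<arch>/include/generated/asm/sizes.h, generated from the
 * "generic-y += sizes.h" entries removed below (sketch):
 */
#include <asm-generic/sizes.h>

/* include/asm-generic/sizes.h, the placeholder being deleted: */
#include <linux/sizes.h>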
Link: http://lkml.kernel.org/r/1553267665-27228-2-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/arm/include/asm/Kbuild       | 1 -
 arch/arm64/include/asm/Kbuild     | 1 -
 arch/h8300/include/asm/Kbuild     | 1 -
 arch/hexagon/include/asm/Kbuild   | 1 -
 arch/nds32/include/asm/Kbuild     | 1 -
 arch/sh/include/asm/Kbuild        | 1 -
 arch/unicore32/include/asm/Kbuild | 1 -
 include/asm-generic/sizes.h       | 2 --
 8 files changed, 9 deletions(-)
 delete mode 100644 include/asm-generic/sizes.h

(limited to 'arch')

diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 41deac2451af..0b2ecc98e086 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -17,7 +17,6 @@ generic-y += seccomp.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += simd.h
-generic-y += sizes.h
 generic-y += trace_clock.h

 generated-y += mach-types.h
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index eb0df239a759..9e977dedf193 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -20,7 +20,6 @@ generic-y += qspinlock.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += set_memory.h
-generic-y += sizes.h
 generic-y += switch_to.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 123d8f54be4a..f2e22058e488 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -42,7 +42,6 @@ generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += serial.h
 generic-y += shmparam.h
-generic-y += sizes.h
 generic-y += spinlock.h
 generic-y += timex.h
 generic-y += tlbflush.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 6234a303d2a3..4a3d72f76ea2 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -32,7 +32,6 @@ generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += shmparam.h
-generic-y += sizes.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index 688b6ed26227..f67a327777b5 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -39,7 +39,6 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
-generic-y += sizes.h
 generic-y += switch_to.h
 generic-y += timex.h
 generic-y += topology.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 73fff39a0122..51a54df22c11 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -18,6 +18,5 @@ generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += serial.h
-generic-y += sizes.h
 generic-y += trace_clock.h
 generic-y += xor.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index b301a0b3c0b2..c93dc6478cb2 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -31,7 +31,6 @@ generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += shmparam.h
-generic-y += sizes.h
 generic-y += syscalls.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/include/asm-generic/sizes.h b/include/asm-generic/sizes.h
deleted file mode 100644
index 1dcfad9629ef..000000000000
--- a/include/asm-generic/sizes.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* This is a placeholder, to be removed over time */
-#include <linux/sizes.h>
--
cgit v1.2.3-59-g8ed1b
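Coming back to the first patch above: the compiler_types.h toggle can be demonstrated outside the kernel with a simplified userspace sketch (the real kernel macro also adds __gnu_inline, __maybe_unused and notrace; CONFIG_OPTIMIZE_INLINING is spelled here as a plain preprocessor symbol):

#include <stdio.h>

/* Without OPTIMIZE_INLINING, "inline" is redefined so the compiler must
 * inline; with it, "inline" remains only a hint and the compiler may keep
 * an out-of-line copy of the function.
 */
#ifndef CONFIG_OPTIMIZE_INLINING
#define inline inline __attribute__((__always_inline__))
#endif

static inline int add_one(int x)	/* toy function; behavior is identical either way */
{
	return x + 1;
}

int main(void)
{
	printf("%d\n", add_one(41));	/* prints 42 in both configurations */
	return 0;
}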