From c771320e9357c9b85634002daedfe5c8988f27a6 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Thu, 5 Oct 2017 08:44:26 +0200
Subject: s390/mm,kvm: improve detection of KVM guest faults

The identification of guest faults currently relies on the PF_VCPU flag.
This is set in guest_entry_irqoff and cleared in guest_exit_irqoff.
Both functions are called by __vcpu_run, so the PF_VCPU flag is set for
quite a lot of kernel code outside of the guest execution.

Replace the PF_VCPU scheme with the PIF_GUEST_FAULT bit in the pt_regs
and make the program check handler code in entry.S set the bit only for
exceptions that occurred between the .Lsie_gmap and .Lsie_done labels.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/ptrace.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/s390/include/asm')

diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 2f84e77f1f1b..a3788dafc0e1 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -13,10 +13,12 @@
 #define PIF_SYSCALL		0	/* inside a system call */
 #define PIF_PER_TRAP		1	/* deliver sigtrap on return to user */
 #define PIF_SYSCALL_RESTART	2	/* restart the current system call */
+#define PIF_GUEST_FAULT		3	/* indicates program check in sie64a */
 
 #define _PIF_SYSCALL		_BITUL(PIF_SYSCALL)
 #define _PIF_PER_TRAP		_BITUL(PIF_PER_TRAP)
 #define _PIF_SYSCALL_RESTART	_BITUL(PIF_SYSCALL_RESTART)
+#define _PIF_GUEST_FAULT	_BITUL(PIF_GUEST_FAULT)
 
 #ifndef __ASSEMBLY__
 
-- 
cgit v1.2.3-59-g8ed1b


From 0aaba41b58bc5f3074c0c0a6136b9500b5e29e19 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 22 Aug 2017 12:08:22 +0200
Subject: s390: remove all code using the access register mode

The vdso code for the getcpu() and the clock_gettime() calls currently
uses the access register mode to access the per-CPU vdso data page.

An alternative to the complicated AR mode is to use the secondary space
mode. This makes the vdso faster and quite a bit simpler. The downside is
that the uaccess code has to be changed quite a bit.

Which instructions are used depends on the machine and what kind of uaccess
operation is requested. The instruction dictates which ASCE value needs
to be loaded into %cr1 and %cr7.

The different cases:

* User copy with MVCOS for z10 and newer machines
  The MVCOS instruction can copy between the primary space (aka user) and
  the home space (aka kernel) directly. For set_fs(KERNEL_DS) the kernel
  ASCE is loaded into %cr1. For set_fs(USER_DS) the user space is already
  loaded in %cr1.

* User copy with MVCP/MVCS for older machines
  To be able to execute the MVCP/MVCS instructions the kernel needs to
  switch to primary mode. The control register %cr1 has to be set to the
  kernel ASCE and %cr7 to either the kernel ASCE or the user ASCE dependent
  on set_fs(KERNEL_DS) vs set_fs(USER_DS).

* Data access in the user address space for strnlen / futex
  To use "normal" instructions with data from the user address space the
  secondary space mode is used. The kernel needs to switch to primary mode,
  %cr1 has to contain the kernel ASCE and %cr7 either the user ASCE or the
  kernel ASCE, dependent on set_fs.

Loading a new value into %cr1 or %cr7 is an expensive operation, so the
kernel tries to be lazy about it. E.g. for multiple user copies in a row
with MVCP/MVCS the replacement of the vdso ASCE in %cr7 with the user ASCE
is done only once. On return to user space a CPU flag is checked and, if it
is set, the vdso ASCE is loaded again.

To enable and disable the data access via the secondary space two new
functions are added, enable_sacf_uaccess and disable_sacf_uaccess. The fact
that a context is in secondary space uaccess mode is stored in the
mm_segment_t value for the task. The code of an interrupt may use set_fs
as long as it returns to the previous state it got with get_fs with another
call to set_fs. The code in finish_arch_post_lock_switch simply has to do a
set_fs with the current mm_segment_t value for the task.

For CPUs with MVCOS:

CPU running in                        | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space                            |  user     |  vdso     |
kernel, USER_DS, normal-mode          |  user     |  vdso     |
kernel, USER_DS, normal-mode, lazy    |  user     |  user     |
kernel, USER_DS, sacf-mode            |  kernel   |  user     |
kernel, KERNEL_DS, normal-mode        |  kernel   |  vdso     |
kernel, KERNEL_DS, normal-mode, lazy  |  kernel   |  kernel   |
kernel, KERNEL_DS, sacf-mode          |  kernel   |  kernel   |

For CPUs without MVCOS:

CPU running in                        | %cr1 ASCE | %cr7 ASCE |
--------------------------------------|-----------|-----------|
user space                            |  user     |  vdso     |
kernel, USER_DS, normal-mode          |  user     |  vdso     |
kernel, USER_DS, normal-mode, lazy    |  kernel   |  user     |
kernel, USER_DS, sacf-mode            |  kernel   |  user     |
kernel, KERNEL_DS, normal-mode        |  kernel   |  vdso     |
kernel, KERNEL_DS, normal-mode, lazy  |  kernel   |  kernel   |
kernel, KERNEL_DS, sacf-mode          |  kernel   |  kernel   |

The lines with "lazy" refer to the state after a copy via the secondary
space with a delayed reload of %cr1 and %cr7.

There are three hardware address spaces that can cause a DAT exception:
primary, secondary and home space. The exception can be related to
four different fault types: user space faults, vdso faults, kernel faults,
and gmap faults.

Dependent on the set_fs state and normal vs. sacf mode there are a number
of fault combinations:

1) user address space fault via the primary ASCE
2) gmap address space fault via the primary ASCE
3) kernel address space fault via the primary ASCE for machines with
   MVCOS and set_fs(KERNEL_DS)
4) vdso address space faults via the secondary ASCE with an invalid
   address while running in secondary space in problem state
5) user address space fault via the secondary ASCE for user-copy
   based on the secondary space mode, e.g. futex_ops or strnlen_user
6) kernel address space fault via the secondary ASCE for user-copy
   with secondary space mode with set_fs(KERNEL_DS)
7) kernel address space fault via the primary ASCE for user-copy
   with secondary space mode with set_fs(USER_DS) on machines without
   MVCOS.
8) kernel address space fault via the home space ASCE

Replace user_space_fault() with a new function get_fault_type() that
can distinguish all four different fault types.

With these changes the futex atomic ops from the kernel and the
strnlen_user will get a little bit slower, as well as the old style
uaccess with MVCP/MVCS. All user accesses based on MVCOS will be as
fast as before. On the positive side, the user space vdso code is a
lot faster and Linux ceases to use the complicated AR mode.

Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/futex.h           |   9 ++-
 arch/s390/include/asm/lowcore.h         |  33 +++++-----
 arch/s390/include/asm/mmu_context.h     |  36 +++++------
 arch/s390/include/asm/processor.h       |   4 +-
 arch/s390/include/asm/uaccess.h         |  29 +++------
 arch/s390/kernel/asm-offsets.c          |   2 +-
 arch/s390/kernel/entry.S                |  26 ++++++--
 arch/s390/kernel/head64.S               |   2 +-
 arch/s390/kernel/vdso.c                 |  44 ++-----------
 arch/s390/kernel/vdso32/getcpu.S        |  16 +----
 arch/s390/kernel/vdso64/clock_gettime.S |  19 ++----
 arch/s390/kernel/vdso64/getcpu.S        |  15 +----
 arch/s390/lib/uaccess.c                 |  90 +++++++++++++++++++++++---
 arch/s390/mm/fault.c                    | 108 +++++++++++++++++++-------------
 arch/s390/mm/init.c                     |   1 +
 arch/s390/mm/pgalloc.c                  |   4 +-
 16 files changed, 228 insertions(+), 210 deletions(-)

(limited to 'arch/s390/include/asm')

diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index 9b5a3469fed9..5e97a4353147 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -26,9 +26,9 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
 		u32 __user *uaddr)
 {
 	int oldval = 0, newval, ret;
+	mm_segment_t old_fs;
 
-	load_kernel_asce();
-
+	old_fs = enable_sacf_uaccess();
 	pagefault_disable();
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -55,6 +55,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
 		ret = -ENOSYS;
 	}
 	pagefault_enable();
+	disable_sacf_uaccess(old_fs);
 
 	if (!ret)
 		*oval = oldval;
@@ -65,9 +66,10 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
 static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 						u32 oldval, u32 newval)
 {
+	mm_segment_t old_fs;
 	int ret;
 
-	load_kernel_asce();
+	old_fs = enable_sacf_uaccess();
 	asm volatile(
 		"   sacf 256\n"
 		"0: cs   %1,%4,0(%5)\n"
@@ -77,6 +79,7 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 		: "=d" (ret), "+d" (oldval), "=m" (*uaddr)
 		: "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr)
 		: "cc", "memory");
+	disable_sacf_uaccess(old_fs);
 	*uval = oldval;
 	return ret;
 }
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 9eb36a1592c7..2306fa17f6cd 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -115,33 +115,28 @@ struct lowcore {
 	/* Address space pointer. */
 	__u64	kernel_asce;			/* 0x0378 */
 	__u64	user_asce;			/* 0x0380 */
+	__u64	vdso_asce;			/* 0x0388 */
 
 	/*
 	 * The lpp and current_pid fields form a
 	 * 64-bit value that is set as program
 	 * parameter with the LPP instruction.
 	 */
-	__u32	lpp;				/* 0x0388 */
-	__u32	current_pid;			/* 0x038c */
+	__u32	lpp;				/* 0x0390 */
+	__u32	current_pid;			/* 0x0394 */
 
 	/* SMP info area */
-	__u32	cpu_nr;				/* 0x0390 */
-	__u32	softirq_pending;		/* 0x0394 */
-	__u64	percpu_offset;			/* 0x0398 */
-	__u64	vdso_per_cpu_data;		/* 0x03a0 */
-	__u64	machine_flags;			/* 0x03a8 */
-	__u32	preempt_count;			/* 0x03b0 */
-	__u8	pad_0x03b4[0x03b8-0x03b4];	/* 0x03b4 */
-	__u64	gmap;				/* 0x03b8 */
-	__u32	spinlock_lockval;		/* 0x03c0 */
-	__u32	spinlock_index;			/* 0x03c4 */
-	__u32	fpu_flags;			/* 0x03c8 */
-	__u8	pad_0x03cc[0x0400-0x03cc];	/* 0x03cc */
-
-	/* Per cpu primary space access list */
-	__u32	paste[16];			/* 0x0400 */
-
-	__u8	pad_0x04c0[0x0e00-0x0440];	/* 0x0440 */
+	__u32	cpu_nr;				/* 0x0398 */
+	__u32	softirq_pending;		/* 0x039c */
+	__u32	preempt_count;			/* 0x03a0 */
+	__u32	spinlock_lockval;		/* 0x03a4 */
+	__u32	spinlock_index;			/* 0x03a8 */
+	__u32	fpu_flags;			/* 0x03ac */
+	__u64	percpu_offset;			/* 0x03b0 */
+	__u64	vdso_per_cpu_data;		/* 0x03b8 */
+	__u64	machine_flags;			/* 0x03c0 */
+	__u64	gmap;				/* 0x03c8 */
+	__u8	pad_0x03d0[0x0e00-0x03d0];	/* 0x03d0 */
 
 	/*
 	 * 0xe00 contains the address of the IPL Parameter Information
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 43607bb12cc2..6133aa376b7c 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -71,41 +71,38 @@ static inline int init_new_context(struct task_struct *tsk,
 static inline void set_user_asce(struct mm_struct *mm)
 {
 	S390_lowcore.user_asce = mm->context.asce;
-	if (current->thread.mm_segment.ar4)
-		__ctl_load(S390_lowcore.user_asce, 7, 7);
-	set_cpu_flag(CIF_ASCE_PRIMARY);
+	__ctl_load(S390_lowcore.user_asce, 1, 1);
+	clear_cpu_flag(CIF_ASCE_PRIMARY);
 }
 
 static inline void clear_user_asce(void)
 {
 	S390_lowcore.user_asce = S390_lowcore.kernel_asce;
-
-	__ctl_load(S390_lowcore.user_asce, 1, 1);
-	__ctl_load(S390_lowcore.user_asce, 7, 7);
-}
-
-static inline void load_kernel_asce(void)
-{
-	unsigned long asce;
-
-	__ctl_store(asce, 1, 1);
-	if (asce != S390_lowcore.kernel_asce)
-		__ctl_load(S390_lowcore.kernel_asce, 1, 1);
+	__ctl_load(S390_lowcore.kernel_asce, 1, 1);
 	set_cpu_flag(CIF_ASCE_PRIMARY);
 }
 
+mm_segment_t enable_sacf_uaccess(void);
+void disable_sacf_uaccess(mm_segment_t old_fs);
+
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
 {
 	int cpu = smp_processor_id();
 
-	S390_lowcore.user_asce = next->context.asce;
 	if (prev == next)
 		return;
+	S390_lowcore.user_asce = next->context.asce;
 	cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
-	/* Clear old ASCE by loading the kernel ASCE. */
-	__ctl_load(S390_lowcore.kernel_asce, 1, 1);
-	__ctl_load(S390_lowcore.kernel_asce, 7, 7);
+	/* Clear previous user-ASCE from CR1 and CR7 */
+	if (!test_cpu_flag(CIF_ASCE_PRIMARY)) {
+		__ctl_load(S390_lowcore.kernel_asce, 1, 1);
+		set_cpu_flag(CIF_ASCE_PRIMARY);
+	}
+	if (test_cpu_flag(CIF_ASCE_SECONDARY)) {
+		__ctl_load(S390_lowcore.vdso_asce, 7, 7);
+		clear_cpu_flag(CIF_ASCE_SECONDARY);
+	}
 	cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
 }
 
@@ -115,7 +112,6 @@ static inline void finish_arch_post_lock_switch(void)
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
 
-	load_kernel_asce();
 	if (mm) {
 		preempt_disable();
 		while (atomic_read(&mm->context.flush_count))
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index f25bfe888933..709351bce80e 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -109,9 +109,7 @@ extern void execve_tail(void);
 
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
 
-typedef struct {
-        __u32 ar4;
-} mm_segment_t;
+typedef unsigned int mm_segment_t;
 
 /*
  * Thread structure
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index cdd0f0d999e2..ad6b91013a05 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -16,7 +16,7 @@
 #include <asm/processor.h>
 #include <asm/ctl_reg.h>
 #include <asm/extable.h>
-
+#include <asm/facility.h>
 
 /*
  * The fs value determines whether argument validity checking should be
@@ -26,27 +26,16 @@
  * For historical reasons, these macros are grossly misnamed.
  */
 
-#define MAKE_MM_SEG(a)  ((mm_segment_t) { (a) })
-
-
-#define KERNEL_DS       MAKE_MM_SEG(0)
-#define USER_DS         MAKE_MM_SEG(1)
+#define KERNEL_DS	(0)
+#define KERNEL_DS_SACF	(1)
+#define USER_DS		(2)
+#define USER_DS_SACF	(3)
 
 #define get_ds()        (KERNEL_DS)
 #define get_fs()        (current->thread.mm_segment)
-#define segment_eq(a,b) ((a).ar4 == (b).ar4)
+#define segment_eq(a,b) (((a) & 2) == ((b) & 2))
 
-static inline void set_fs(mm_segment_t fs)
-{
-	current->thread.mm_segment = fs;
-	if (uaccess_kernel()) {
-		set_cpu_flag(CIF_ASCE_SECONDARY);
-		__ctl_load(S390_lowcore.kernel_asce, 7, 7);
-	} else {
-		clear_cpu_flag(CIF_ASCE_SECONDARY);
-		__ctl_load(S390_lowcore.user_asce, 7, 7);
-	}
-}
+void set_fs(mm_segment_t fs);
 
 static inline int __range_ok(unsigned long addr, unsigned long size)
 {
@@ -95,7 +84,7 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n);
 
 static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
 {
-	unsigned long spec = 0x810000UL;
+	unsigned long spec = 0x010000UL;
 	int rc;
 
 	switch (size) {
@@ -125,7 +114,7 @@ static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
 
 static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
 {
-	unsigned long spec = 0x81UL;
+	unsigned long spec = 0x01UL;
 	int rc;
 
 	switch (size) {
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 33ec80df7ed4..587b195b588d 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -171,6 +171,7 @@ int main(void)
 	OFFSET(__LC_RESTART_DATA, lowcore, restart_data);
 	OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source);
 	OFFSET(__LC_USER_ASCE, lowcore, user_asce);
+	OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce);
 	OFFSET(__LC_LPP, lowcore, lpp);
 	OFFSET(__LC_CURRENT_PID, lowcore, current_pid);
 	OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset);
@@ -178,7 +179,6 @@ int main(void)
 	OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags);
 	OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count);
 	OFFSET(__LC_GMAP, lowcore, gmap);
-	OFFSET(__LC_PASTE, lowcore, paste);
 	/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
 	OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
 	/* hardware defined lowcore locations 0x1000 - 0x18ff */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index ee53ac7b1ab8..a316cd6999ad 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -379,13 +379,21 @@ ENTRY(system_call)
 	jg	s390_handle_mcck	# TIF bit will be cleared by handler
 
 #
-# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce
+# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce
 #
 .Lsysc_asce:
+	ni	__LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY
+	lctlg	%c7,%c7,__LC_VDSO_ASCE		# load secondary asce
+	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE_PRIMARY
+	jz	.Lsysc_return
+#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
+	tm	__LC_STFLE_FAC_LIST+3,0x10	# has MVCOS ?
+	jnz	.Lsysc_set_fs_fixup
 	ni	__LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
-	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE_SECONDARY
-	jz	.Lsysc_return
+	j	.Lsysc_return
+.Lsysc_set_fs_fixup:
+#endif
 	larl	%r14,.Lsysc_return
 	jg	set_fs_fixup
 
@@ -741,10 +749,18 @@ ENTRY(io_int_handler)
 # _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce
 #
 .Lio_asce:
+	ni	__LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY
+	lctlg	%c7,%c7,__LC_VDSO_ASCE		# load secondary asce
+	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE_PRIMARY
+	jz	.Lio_return
+#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
+	tm	__LC_STFLE_FAC_LIST+3,0x10	# has MVCOS ?
+	jnz	.Lio_set_fs_fixup
 	ni	__LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
-	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE_SECONDARY
-	jz	.Lio_return
+	j	.Lio_return
+.Lio_set_fs_fixup:
+#endif
 	larl	%r14,.Lio_return
 	jg	set_fs_fixup
 
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 172002da7075..38a973ccf501 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -28,7 +28,7 @@ ENTRY(startup_continue)
 	lctlg	%c0,%c15,.Lctl-.LPG1(%r13)	# load control registers
 	lg	%r12,.Lparmaddr-.LPG1(%r13)	# pointer to parameter area
 					# move IPL device to lowcore
-	lghi	%r0,__LC_PASTE
+	larl	%r0,boot_vdso_data
 	stg	%r0,__LC_VDSO_PER_CPU
 #
 # Setup stack
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 0520854a4dab..39a218703c50 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -158,16 +158,9 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore)
 {
 	unsigned long segment_table, page_table, page_frame;
 	struct vdso_per_cpu_data *vd;
-	u32 *psal, *aste;
-	int i;
-
-	lowcore->vdso_per_cpu_data = __LC_PASTE;
-
-	if (!vdso_enabled)
-		return 0;
 
 	segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER);
-	page_table = get_zeroed_page(GFP_KERNEL | GFP_DMA);
+	page_table = get_zeroed_page(GFP_KERNEL);
 	page_frame = get_zeroed_page(GFP_KERNEL);
 	if (!segment_table || !page_table || !page_frame)
 		goto out;
@@ -179,25 +172,15 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore)
 	vd->cpu_nr = lowcore->cpu_nr;
 	vd->node_id = cpu_to_node(vd->cpu_nr);
 
-	/* Set up access register mode page table */
+	/* Set up page table for the vdso address space */
 	memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES);
 	memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE);
 
 	*(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table;
 	*(unsigned long *) page_table = _PAGE_PROTECT + page_frame;
 
-	psal = (u32 *) (page_table + 256*sizeof(unsigned long));
-	aste = psal + 32;
-
-	for (i = 4; i < 32; i += 4)
-		psal[i] = 0x80000000;
-
-	lowcore->paste[4] = (u32)(addr_t) psal;
-	psal[0] = 0x02000000;
-	psal[2] = (u32)(addr_t) aste;
-	*(unsigned long *) (aste + 2) = segment_table +
+	lowcore->vdso_asce = segment_table +
 		_ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT;
-	aste[4] = (u32)(addr_t) psal;
 	lowcore->vdso_per_cpu_data = page_frame;
 
 	return 0;
@@ -212,14 +195,8 @@ out:
 void vdso_free_per_cpu(struct lowcore *lowcore)
 {
 	unsigned long segment_table, page_table, page_frame;
-	u32 *psal, *aste;
-
-	if (!vdso_enabled)
-		return;
 
-	psal = (u32 *)(addr_t) lowcore->paste[4];
-	aste = (u32 *)(addr_t) psal[2];
-	segment_table = *(unsigned long *)(aste + 2) & PAGE_MASK;
+	segment_table = lowcore->vdso_asce & PAGE_MASK;
 	page_table = *(unsigned long *) segment_table;
 	page_frame = *(unsigned long *) page_table;
 
@@ -228,16 +205,6 @@ void vdso_free_per_cpu(struct lowcore *lowcore)
 	free_pages(segment_table, SEGMENT_ORDER);
 }
 
-static void vdso_init_cr5(void)
-{
-	unsigned long cr5;
-
-	if (!vdso_enabled)
-		return;
-	cr5 = offsetof(struct lowcore, paste);
-	__ctl_load(cr5, 5, 5);
-}
-
 /*
  * This is called from binfmt_elf, we create the special vma for the
  * vDSO and insert it into the mm struct tree
@@ -314,8 +281,6 @@ static int __init vdso_init(void)
 {
 	int i;
 
-	if (!vdso_enabled)
-		return 0;
 	vdso_init_data(vdso_data);
 #ifdef CONFIG_COMPAT
 	/* Calculate the size of the 32 bit vDSO */
@@ -354,7 +319,6 @@ static int __init vdso_init(void)
 	vdso64_pagelist[vdso64_pages] = NULL;
 	if (vdso_alloc_per_cpu(&S390_lowcore))
 		BUG();
-	vdso_init_cr5();
 
 	get_page(virt_to_page(vdso_data));
 
diff --git a/arch/s390/kernel/vdso32/getcpu.S b/arch/s390/kernel/vdso32/getcpu.S
index 6e30769dd017..5477a2c112fb 100644
--- a/arch/s390/kernel/vdso32/getcpu.S
+++ b/arch/s390/kernel/vdso32/getcpu.S
@@ -15,23 +15,11 @@
 	.type  __kernel_getcpu,@function
 __kernel_getcpu:
 	.cfi_startproc
-	ear	%r1,%a4
-	lhi	%r4,1
-	sll	%r4,24
-	sar	%a4,%r4
 	la	%r4,0
-	epsw	%r0,0
-	sacf	512
+	sacf	256
 	l	%r5,__VDSO_CPU_NR(%r4)
 	l	%r4,__VDSO_NODE_ID(%r4)
-	tml	%r0,0x4000
-	jo	1f
-	tml	%r0,0x8000
-	jno	0f
-	sacf	256
-	j	1f
-0:	sacf	0
-1:	sar	%a4,%r1
+	sacf	0
 	ltr	%r2,%r2
 	jz	2f
 	st	%r5,0(%r2)
diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S
index 9c3b12626dba..5d7b56b49458 100644
--- a/arch/s390/kernel/vdso64/clock_gettime.S
+++ b/arch/s390/kernel/vdso64/clock_gettime.S
@@ -114,23 +114,12 @@ __kernel_clock_gettime:
 	br	%r14
 
 	/* CPUCLOCK_VIRT for this thread */
-9:	icm	%r0,15,__VDSO_ECTG_OK(%r5)
+9:	lghi	%r4,0
+	icm	%r0,15,__VDSO_ECTG_OK(%r5)
 	jz	12f
-	ear	%r2,%a4
-	llilh	%r4,0x0100
-	sar	%a4,%r4
-	lghi	%r4,0
-	epsw	%r5,0
-	sacf	512				/* Magic ectg instruction */
+	sacf	256				/* Magic ectg instruction */
 	.insn	ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4
-	tml	%r5,0x4000
-	jo	11f
-	tml	%r5,0x8000
-	jno	10f
-	sacf	256
-	j	11f
-10:	sacf	0
-11:	sar	%a4,%r2
+	sacf	0
 	algr	%r1,%r0				/* r1 = cputime as TOD value */
 	mghi	%r1,1000			/* convert to nanoseconds */
 	srlg	%r1,%r1,12			/* r1 = cputime in nanosec */
diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S
index 43983764b959..e9c34364d97b 100644
--- a/arch/s390/kernel/vdso64/getcpu.S
+++ b/arch/s390/kernel/vdso64/getcpu.S
@@ -15,22 +15,11 @@
 	.type  __kernel_getcpu,@function
 __kernel_getcpu:
 	.cfi_startproc
-	ear	%r1,%a4
-	llilh	%r4,0x0100
-	sar	%a4,%r4
 	la	%r4,0
-	epsw	%r0,0
-	sacf	512
+	sacf	256
 	l	%r5,__VDSO_CPU_NR(%r4)
 	l	%r4,__VDSO_NODE_ID(%r4)
-	tml	%r0,0x4000
-	jo	1f
-	tml	%r0,0x8000
-	jno	0f
-	sacf	256
-	j	1f
-0:	sacf	0
-1:	sar	%a4,%r1
+	sacf	0
 	ltgr	%r2,%r2
 	jz	2f
 	st	%r5,0(%r2)
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index 802903c50de1..cae5a1e16cbd 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -40,10 +40,67 @@ static inline int copy_with_mvcos(void)
 }
 #endif
 
+void set_fs(mm_segment_t fs)
+{
+	current->thread.mm_segment = fs;
+	if (fs == USER_DS) {
+		__ctl_load(S390_lowcore.user_asce, 1, 1);
+		clear_cpu_flag(CIF_ASCE_PRIMARY);
+	} else {
+		__ctl_load(S390_lowcore.kernel_asce, 1, 1);
+		set_cpu_flag(CIF_ASCE_PRIMARY);
+	}
+	if (fs & 1) {
+		if (fs == USER_DS_SACF)
+			__ctl_load(S390_lowcore.user_asce, 7, 7);
+		else
+			__ctl_load(S390_lowcore.kernel_asce, 7, 7);
+		set_cpu_flag(CIF_ASCE_SECONDARY);
+	}
+}
+EXPORT_SYMBOL(set_fs);
+
+mm_segment_t enable_sacf_uaccess(void)
+{
+	mm_segment_t old_fs;
+	unsigned long asce, cr;
+
+	old_fs = current->thread.mm_segment;
+	if (old_fs & 1)
+		return old_fs;
+	current->thread.mm_segment |= 1;
+	asce = S390_lowcore.kernel_asce;
+	if (likely(old_fs == USER_DS)) {
+		__ctl_store(cr, 1, 1);
+		if (cr != S390_lowcore.kernel_asce) {
+			__ctl_load(S390_lowcore.kernel_asce, 1, 1);
+			set_cpu_flag(CIF_ASCE_PRIMARY);
+		}
+		asce = S390_lowcore.user_asce;
+	}
+	__ctl_store(cr, 7, 7);
+	if (cr != asce) {
+		__ctl_load(asce, 7, 7);
+		set_cpu_flag(CIF_ASCE_SECONDARY);
+	}
+	return old_fs;
+}
+EXPORT_SYMBOL(enable_sacf_uaccess);
+
+void disable_sacf_uaccess(mm_segment_t old_fs)
+{
+	if (old_fs == USER_DS && test_facility(27)) {
+		__ctl_load(S390_lowcore.user_asce, 1, 1);
+		clear_cpu_flag(CIF_ASCE_PRIMARY);
+	}
+	current->thread.mm_segment = old_fs;
+}
+EXPORT_SYMBOL(disable_sacf_uaccess);
+
 static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr,
 						 unsigned long size)
 {
-	register unsigned long reg0 asm("0") = 0x81UL;
+	register unsigned long reg0 asm("0") = 0x01UL;
 	unsigned long tmp1, tmp2;
 
 	tmp1 = -4096UL;
@@ -74,8 +131,9 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
 						unsigned long size)
 {
 	unsigned long tmp1, tmp2;
+	mm_segment_t old_fs;
 
-	load_kernel_asce();
+	old_fs = enable_sacf_uaccess();
 	tmp1 = -256UL;
 	asm volatile(
 		"   sacf  0\n"
@@ -102,6 +160,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
 		EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b)
 		: "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
 		: : "cc", "memory");
+	disable_sacf_uaccess(old_fs);
 	return size;
 }
 
@@ -116,7 +175,7 @@ EXPORT_SYMBOL(raw_copy_from_user);
 static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x,
 					       unsigned long size)
 {
-	register unsigned long reg0 asm("0") = 0x810000UL;
+	register unsigned long reg0 asm("0") = 0x010000UL;
 	unsigned long tmp1, tmp2;
 
 	tmp1 = -4096UL;
@@ -147,8 +206,9 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
 					      unsigned long size)
 {
 	unsigned long tmp1, tmp2;
+	mm_segment_t old_fs;
 
-	load_kernel_asce();
+	old_fs = enable_sacf_uaccess();
 	tmp1 = -256UL;
 	asm volatile(
 		"   sacf  0\n"
@@ -175,6 +235,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
 		EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b)
 		: "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
 		: : "cc", "memory");
+	disable_sacf_uaccess(old_fs);
 	return size;
 }
 
@@ -189,7 +250,7 @@ EXPORT_SYMBOL(raw_copy_to_user);
 static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from,
 					       unsigned long size)
 {
-	register unsigned long reg0 asm("0") = 0x810081UL;
+	register unsigned long reg0 asm("0") = 0x010001UL;
 	unsigned long tmp1, tmp2;
 
 	tmp1 = -4096UL;
@@ -212,9 +273,10 @@ static inline unsigned long copy_in_user_mvcos(void __user *to, const void __use
 static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from,
 					     unsigned long size)
 {
+	mm_segment_t old_fs;
 	unsigned long tmp1;
 
-	load_kernel_asce();
+	old_fs = enable_sacf_uaccess();
 	asm volatile(
 		"   sacf  256\n"
 		"   aghi  %0,-1\n"
@@ -238,6 +300,7 @@ static inline unsigned long copy_in_user_mvc(void __user *to, const void __user
 		EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b)
 		: "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1)
 		: : "cc", "memory");
+	disable_sacf_uaccess(old_fs);
 	return size;
 }
 
@@ -251,7 +314,7 @@ EXPORT_SYMBOL(raw_copy_in_user);
 
 static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size)
 {
-	register unsigned long reg0 asm("0") = 0x810000UL;
+	register unsigned long reg0 asm("0") = 0x010000UL;
 	unsigned long tmp1, tmp2;
 
 	tmp1 = -4096UL;
@@ -279,9 +342,10 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size
 
 static inline unsigned long clear_user_xc(void __user *to, unsigned long size)
 {
+	mm_segment_t old_fs;
 	unsigned long tmp1, tmp2;
 
-	load_kernel_asce();
+	old_fs = enable_sacf_uaccess();
 	asm volatile(
 		"   sacf  256\n"
 		"   aghi  %0,-1\n"
@@ -310,6 +374,7 @@ static inline unsigned long clear_user_xc(void __user *to, unsigned long size)
 		EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b)
 		: "+a" (size), "+a" (to), "=a" (tmp1), "=a" (tmp2)
 		: : "cc", "memory");
+	disable_sacf_uaccess(old_fs);
 	return size;
 }
 
@@ -345,10 +410,15 @@ static inline unsigned long strnlen_user_srst(const char __user *src,
 
 unsigned long __strnlen_user(const char __user *src, unsigned long size)
 {
+	mm_segment_t old_fs;
+	unsigned long len;
+
 	if (unlikely(!size))
 		return 0;
-	load_kernel_asce();
-	return strnlen_user_srst(src, size);
+	old_fs = enable_sacf_uaccess();
+	len = strnlen_user_srst(src, size);
+	disable_sacf_uaccess(old_fs);
+	return len;
 }
 EXPORT_SYMBOL(__strnlen_user);
 
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index be974b3eb7e4..14654007dce4 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -50,6 +50,13 @@
 #define VM_FAULT_SIGNAL		0x080000
 #define VM_FAULT_PFAULT		0x100000
 
+enum fault_type {
+	KERNEL_FAULT,
+	USER_FAULT,
+	VDSO_FAULT,
+	GMAP_FAULT,
+};
+
 static unsigned long store_indication __read_mostly;
 
 static int __init fault_init(void)
@@ -99,27 +106,34 @@ void bust_spinlocks(int yes)
 }
 
 /*
- * Returns the address space associated with the fault.
- * Returns 0 for kernel space and 1 for user space.
+ * Find out which address space caused the exception.
+ * Access register mode is impossible, ignore space == 1.
  */
-static inline int user_space_fault(struct pt_regs *regs)
+static inline enum fault_type get_fault_type(struct pt_regs *regs)
 {
 	unsigned long trans_exc_code;
 
-	/*
-	 * The lowest two bits of the translation exception
-	 * identification indicate which paging table was used.
-	 */
 	trans_exc_code = regs->int_parm_long & 3;
-	if (trans_exc_code == 3) /* home space -> kernel */
-		return 0;
-	if (user_mode(regs))
-		return 1;
-	if (trans_exc_code == 2) /* secondary space -> set_fs */
-		return current->thread.mm_segment.ar4;
-	if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
-		return 1;
-	return 0;
+	if (likely(trans_exc_code == 0)) {
+		/* primary space exception */
+		if (IS_ENABLED(CONFIG_PGSTE) &&
+		    test_pt_regs_flag(regs, PIF_GUEST_FAULT))
+			return GMAP_FAULT;
+		if (current->thread.mm_segment == USER_DS)
+			return USER_FAULT;
+		return KERNEL_FAULT;
+	}
+	if (trans_exc_code == 2) {
+		/* secondary space exception */
+		if (current->thread.mm_segment & 1) {
+			if (current->thread.mm_segment == USER_DS_SACF)
+				return USER_FAULT;
+			return KERNEL_FAULT;
+		}
+		return VDSO_FAULT;
+	}
+	/* home space exception -> access via kernel ASCE */
+	return KERNEL_FAULT;
 }
 
 static int bad_address(void *p)
@@ -204,20 +218,23 @@ static void dump_fault_info(struct pt_regs *regs)
 		break;
 	}
 	pr_cont("mode while using ");
-	if (!user_space_fault(regs)) {
-		asce = S390_lowcore.kernel_asce;
-		pr_cont("kernel ");
-	}
-#ifdef CONFIG_PGSTE
-	else if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) {
-		struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
-		asce = gmap->asce;
-		pr_cont("gmap ");
-	}
-#endif
-	else {
+	switch (get_fault_type(regs)) {
+	case USER_FAULT:
 		asce = S390_lowcore.user_asce;
 		pr_cont("user ");
+		break;
+	case VDSO_FAULT:
+		asce = S390_lowcore.vdso_asce;
+		pr_cont("vdso ");
+		break;
+	case GMAP_FAULT:
+		asce = ((struct gmap *) S390_lowcore.gmap)->asce;
+		pr_cont("gmap ");
+		break;
+	case KERNEL_FAULT:
+		asce = S390_lowcore.kernel_asce;
+		pr_cont("kernel ");
+		break;
 	}
 	pr_cont("ASCE.\n");
 	dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
@@ -273,7 +290,7 @@ static noinline void do_no_context(struct pt_regs *regs)
 	 * Oops. The kernel tried to access some bad page. We'll have to
 	 * terminate things with extreme prejudice.
 	 */
-	if (!user_space_fault(regs))
+	if (get_fault_type(regs) == KERNEL_FAULT)
 		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
 		       " in virtual kernel address space\n");
 	else
@@ -395,12 +412,11 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, int fault)
  */
 static inline int do_exception(struct pt_regs *regs, int access)
 {
-#ifdef CONFIG_PGSTE
 	struct gmap *gmap;
-#endif
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
+	enum fault_type type;
 	unsigned long trans_exc_code;
 	unsigned long address;
 	unsigned int flags;
@@ -425,8 +441,19 @@ static inline int do_exception(struct pt_regs *regs, int access)
 	 * user context.
 	 */
 	fault = VM_FAULT_BADCONTEXT;
-	if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
+	type = get_fault_type(regs);
+	switch (type) {
+	case KERNEL_FAULT:
+		goto out;
+	case VDSO_FAULT:
+		fault = VM_FAULT_BADMAP;
 		goto out;
+	case USER_FAULT:
+	case GMAP_FAULT:
+		if (faulthandler_disabled() || !mm)
+			goto out;
+		break;
+	}
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
@@ -437,10 +464,9 @@ static inline int do_exception(struct pt_regs *regs, int access)
 		flags |= FAULT_FLAG_WRITE;
 	down_read(&mm->mmap_sem);
 
-#ifdef CONFIG_PGSTE
-	gmap = test_pt_regs_flag(regs, PIF_GUEST_FAULT) ?
-		(struct gmap *) S390_lowcore.gmap : NULL;
-	if (gmap) {
+	gmap = NULL;
+	if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
+		gmap = (struct gmap *) S390_lowcore.gmap;
 		current->thread.gmap_addr = address;
 		current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
 		current->thread.gmap_int_code = regs->int_code & 0xffff;
@@ -452,7 +478,6 @@ static inline int do_exception(struct pt_regs *regs, int access)
 		if (gmap->pfault_enabled)
 			flags |= FAULT_FLAG_RETRY_NOWAIT;
 	}
-#endif
 
 retry:
 	fault = VM_FAULT_BADMAP;
@@ -507,15 +532,14 @@ retry:
 				      regs, address);
 		}
 		if (fault & VM_FAULT_RETRY) {
-#ifdef CONFIG_PGSTE
-			if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
+			    (flags & FAULT_FLAG_RETRY_NOWAIT)) {
 				/* FAULT_FLAG_RETRY_NOWAIT has been set,
 				 * mmap_sem has not been released */
 				current->thread.gmap_pfault = 1;
 				fault = VM_FAULT_PFAULT;
 				goto out_up;
 			}
-#endif
 			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
 			 * of starvation. */
 			flags &= ~(FAULT_FLAG_ALLOW_RETRY |
@@ -525,8 +549,7 @@ retry:
 			goto retry;
 		}
 	}
-#ifdef CONFIG_PGSTE
-	if (gmap) {
+	if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
 		address =  __gmap_link(gmap, current->thread.gmap_addr,
 				       address);
 		if (address == -EFAULT) {
@@ -538,7 +561,6 @@ retry:
 			goto out_up;
 		}
 	}
-#endif
 	fault = 0;
 out_up:
 	up_read(&mm->mmap_sem);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 817c9e16e83e..671535e64aba 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -95,6 +95,7 @@ void __init paging_init(void)
 	}
 	init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
 	S390_lowcore.kernel_asce = init_mm.context.asce;
+	S390_lowcore.user_asce = S390_lowcore.kernel_asce;
 	crst_table_init((unsigned long *) init_mm.pgd, pgd_type);
 	vmem_map_init();
 
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 4ad4c4f77b4d..434a9564917b 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -71,10 +71,8 @@ static void __crst_table_upgrade(void *arg)
 {
 	struct mm_struct *mm = arg;
 
-	if (current->active_mm == mm) {
-		clear_user_asce();
+	if (current->active_mm == mm)
 		set_user_asce(mm);
-	}
 	__tlb_flush_local();
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 11776eaa6568f5357542bf41b0c7bb90854137cc Mon Sep 17 00:00:00 2001
From: Vasily Gorbik <gor@linux.vnet.ibm.com>
Date: Mon, 13 Nov 2017 16:37:33 +0100
Subject: s390: correct some inline assembly constraints

Inline assembly code changed in this patch should really use the "Q"
constraint ("Memory reference without index register and with short
displacement"). The kernel does not compile with kasan support enabled
otherwise (due to stack instrumentation).

Signed-off-by: Vasily Gorbik <gor@linux.vnet.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/cpu_mf.h    | 2 +-
 arch/s390/include/asm/lowcore.h   | 4 ++--
 arch/s390/include/asm/processor.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/s390/include/asm')

diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 05480e4cc5ca..7364130a29c8 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -167,7 +167,7 @@ static inline int lcctl(u64 ctl)
 		"	.insn	s,0xb2840000,%1\n"
 		"	ipm	%0\n"
 		"	srl	%0,28\n"
-		: "=d" (cc) : "m" (ctl) : "cc");
+		: "=d" (cc) : "Q" (ctl) : "cc");
 	return cc;
 }
 
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 2306fa17f6cd..ec6592e8ba36 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -188,14 +188,14 @@ extern struct lowcore *lowcore_ptr[];
 
 static inline void set_prefix(__u32 address)
 {
-	asm volatile("spx %0" : : "m" (address) : "memory");
+	asm volatile("spx %0" : : "Q" (address) : "memory");
 }
 
 static inline __u32 store_prefix(void)
 {
 	__u32 address;
 
-	asm volatile("stpx %0" : "=m" (address));
+	asm volatile("stpx %0" : "=Q" (address));
 	return address;
 }
 
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 709351bce80e..bfbfad482289 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -245,7 +245,7 @@ static inline unsigned short stap(void)
 {
 	unsigned short cpu_address;
 
-	asm volatile("stap %0" : "=m" (cpu_address));
+	asm volatile("stap %0" : "=Q" (cpu_address));
 	return cpu_address;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 3c6153e8145f74870bad11fa4344fd20f1ad3aaf Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 14 Nov 2017 13:35:14 +0100
Subject: s390/vdso: add missing boot_vdso_data declaration

sparse says:
arch/s390/kernel/vdso.c:150:18:
 warning: symbol 'boot_vdso_data' was not declared. Should it be static?

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/vdso.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/s390/include/asm')

diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index ae6261ef97d5..169d7604eb80 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -46,6 +46,7 @@ struct vdso_per_cpu_data {
 };
 
 extern struct vdso_data *vdso_data;
+extern struct vdso_data boot_vdso_data;
 
 void vdso_alloc_boot_cpu(struct lowcore *lowcore);
 int vdso_alloc_per_cpu(struct lowcore *lowcore);
-- 
cgit v1.2.3-59-g8ed1b


From 049a2c2d486e8cc82c5cd79fa479c5b105b109e9 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 14 Nov 2017 15:20:24 +0100
Subject: s390: enable CPU alternatives unconditionally

Remove the CPU_ALTERNATIVES config option and enable the code
unconditionally. The config option was only added to avoid a conflict
with the named saved segment support. Since that code is gone there is
no reason to keep the CPU_ALTERNATIVES config option.

Just enable it unconditionally to also reduce the number of config
options and make it less likely that something breaks.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/Kconfig                   | 16 ----------------
 arch/s390/include/asm/alternative.h | 20 +++-----------------
 arch/s390/kernel/Makefile           |  3 +--
 arch/s390/kernel/module.c           | 15 ++++++---------
 4 files changed, 10 insertions(+), 44 deletions(-)

(limited to 'arch/s390/include/asm')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f5beccbe74d8..84767046daff 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -539,22 +539,6 @@ config ARCH_RANDOM
 
 	  If unsure, say Y.
 
-config ALTERNATIVES
-	def_bool y
-	prompt "Patch optimized instructions for running CPU type"
-	help
-	  When enabled the kernel code is compiled with additional
-	  alternative instructions blocks optimized for newer CPU types.
-	  These alternative instructions blocks are patched at kernel boot
-	  time when running CPU supports them. This mechanism is used to
-	  optimize some critical code paths (i.e. spinlocks) for newer CPUs
-	  even if kernel is build to support older machine generations.
-
-	  This mechanism could be disabled by appending "noaltinstr"
-	  option to the kernel command line.
-
-	  If unsure, say Y.
-
 endmenu
 
 menu "Memory setup"
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 6c268f6a51d3..a72002056b54 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -15,14 +15,9 @@ struct alt_instr {
 	u8  replacementlen;	/* length of new instruction */
 } __packed;
 
-#ifdef CONFIG_ALTERNATIVES
-extern void apply_alternative_instructions(void);
-extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
-#else
-static inline void apply_alternative_instructions(void) {};
-static inline void apply_alternatives(struct alt_instr *start,
-				      struct alt_instr *end) {};
-#endif
+void apply_alternative_instructions(void);
+void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+
 /*
  * |661:       |662:	  |6620      |663:
  * +-----------+---------------------+
@@ -109,7 +104,6 @@ static inline void apply_alternatives(struct alt_instr *start,
 	b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n"	\
 	INSTR_LEN_SANITY_CHECK(altinstr_len(num))
 
-#ifdef CONFIG_ALTERNATIVES
 /* alternative assembly primitive: */
 #define ALTERNATIVE(oldinstr, altinstr, facility) \
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
@@ -130,14 +124,6 @@ static inline void apply_alternatives(struct alt_instr *start,
 	ALTINSTR_ENTRY(facility1, 1)					\
 	ALTINSTR_ENTRY(facility2, 2)					\
 	".popsection\n"
-#else
-/* Alternative instructions are disabled, let's put just oldinstr in */
-#define ALTERNATIVE(oldinstr, altinstr, facility) \
-	oldinstr "\n"
-
-#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \
-	oldinstr "\n"
-#endif
 
 /*
  * Alternative instructions for different CPU types or capabilities.
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 83bc82001c06..0319f4e81ea4 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -59,7 +59,7 @@ obj-y	+= processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o als.o
 obj-y	+= sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
-obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o
+obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 
 extra-y				+= head.o head64.o vmlinux.lds
 
@@ -77,7 +77,6 @@ obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_FUNCTION_TRACER)	+= mcount.o ftrace.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
-obj-$(CONFIG_ALTERNATIVES)	+= alternative.o
 
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_cpum_cf.o perf_cpum_sf.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 6d9f73bb4142..7b87991416fd 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -433,16 +433,13 @@ int module_finalize(const Elf_Ehdr *hdr,
 	const Elf_Shdr *s;
 	char *secstrings;
 
-	if (IS_ENABLED(CONFIG_ALTERNATIVES)) {
-		secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-		for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
-			if (!strcmp(".altinstructions",
-				    secstrings + s->sh_name)) {
-				/* patch .altinstructions */
-				void *aseg = (void *)s->sh_addr;
+	secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+		if (!strcmp(".altinstructions", secstrings + s->sh_name)) {
+			/* patch .altinstructions */
+			void *aseg = (void *)s->sh_addr;
 
-				apply_alternatives(aseg, aseg + s->sh_size);
-			}
+			apply_alternatives(aseg, aseg + s->sh_size);
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 3d43b981eb841a9493717e6d509f59553dbe8c7a Mon Sep 17 00:00:00 2001
From: Pu Hou <bjhoupu@linux.vnet.ibm.com>
Date: Fri, 19 May 2017 11:16:55 +0200
Subject: s390/cpumf: remove raw event support in basic-only sampling mode

Raw sample support was implemented to export the diagnostic samples.
Since this is now achieved with AUX buffers, there is no requirement
for basic samples to export raw data.  In particular, most basic
sampling information is consumed for creating the perf event sample.

Signed-off-by: Pu Hou <bjhoupu@linux.vnet.ibm.com>
Reviewed-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/perf_event.h |  17 ----
 arch/s390/kernel/perf_cpum_sf.c    | 185 ++++++-------------------------------
 2 files changed, 27 insertions(+), 175 deletions(-)

(limited to 'arch/s390/include/asm')

diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 79aa6421fedb..d6c9d1e0dc2d 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -64,27 +64,10 @@ struct perf_sf_sde_regs {
 #define REG_OVERFLOW		1
 #define OVERFLOW_REG(hwc)	((hwc)->extra_reg.config)
 #define SFB_ALLOC_REG(hwc)	((hwc)->extra_reg.alloc)
-#define RAWSAMPLE_REG(hwc)	((hwc)->config)
 #define TEAR_REG(hwc)		((hwc)->last_tag)
 #define SAMPL_RATE(hwc)		((hwc)->event_base)
 #define SAMPL_FLAGS(hwc)	((hwc)->config_base)
 #define SAMPL_DIAG_MODE(hwc)	(SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE)
 #define SDB_FULL_BLOCKS(hwc)	(SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS)
 
-/* Structure for sampling data entries to be passed as perf raw sample data
- * to user space.  Note that raw sample data must be aligned and, thus, might
- * be padded with zeros.
- */
-struct sf_raw_sample {
-#define SF_RAW_SAMPLE_BASIC	PERF_CPUM_SF_BASIC_MODE
-#define SF_RAW_SAMPLE_DIAG	PERF_CPUM_SF_DIAG_MODE
-	u64			format;
-	u32			 size;	  /* Size of sf_raw_sample */
-	u16			bsdes;	  /* Basic-sampling data entry size */
-	u16			dsdes;	  /* Diagnostic-sampling data entry size */
-	struct hws_basic_entry	basic;	  /* Basic-sampling data entry */
-	struct hws_diag_entry	 diag;	  /* Diagnostic-sampling data entry */
-	u8		    padding[];	  /* Padding to next multiple of 8 */
-} __packed;
-
 #endif /* _ASM_S390_PERF_EVENT_H */
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index b9248a70b232..4d8ddd8bd9be 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -351,22 +351,6 @@ static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc)
 	sfb_account_allocs(num, hwc);
 }
 
-static size_t event_sample_size(struct hw_perf_event *hwc)
-{
-	struct sf_raw_sample *sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(hwc);
-	size_t sample_size;
-
-	/* The sample size depends on the sampling function: The basic-sampling
-	 * function must be always enabled, diagnostic-sampling function is
-	 * optional.
-	 */
-	sample_size = sfr->bsdes;
-	if (SAMPL_DIAG_MODE(hwc))
-		sample_size += sfr->dsdes;
-
-	return sample_size;
-}
-
 static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
 {
 	if (cpuhw->sfb.sdbt)
@@ -376,35 +360,7 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
 static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
 {
 	unsigned long n_sdb, freq, factor;
-	size_t sfr_size, sample_size;
-	struct sf_raw_sample *sfr;
-
-	/* Allocate raw sample buffer
-	 *
-	 *    The raw sample buffer is used to temporarily store sampling data
-	 *    entries for perf raw sample processing.  The buffer size mainly
-	 *    depends on the size of diagnostic-sampling data entries which is
-	 *    machine-specific.  The exact size calculation includes:
-	 *	1. The first 4 bytes of diagnostic-sampling data entries are
-	 *	   already reflected in the sf_raw_sample structure.  Subtract
-	 *	   these bytes.
-	 *	2. The perf raw sample data must be 8-byte aligned (u64) and
-	 *	   perf's internal data size must be considered too.  So add
-	 *	   an additional u32 for correct alignment and subtract before
-	 *	   allocating the buffer.
-	 *	3. Store the raw sample buffer pointer in the perf event
-	 *	   hardware structure.
-	 */
-	sfr_size = ALIGN((sizeof(*sfr) - sizeof(sfr->diag) + cpuhw->qsi.dsdes) +
-			 sizeof(u32), sizeof(u64));
-	sfr_size -= sizeof(u32);
-	sfr = kzalloc(sfr_size, GFP_KERNEL);
-	if (!sfr)
-		return -ENOMEM;
-	sfr->size = sfr_size;
-	sfr->bsdes = cpuhw->qsi.bsdes;
-	sfr->dsdes = cpuhw->qsi.dsdes;
-	RAWSAMPLE_REG(hwc) = (unsigned long) sfr;
+	size_t sample_size;
 
 	/* Calculate sampling buffers using 4K pages
 	 *
@@ -430,7 +386,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
 	 *	 ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up
 	 *	 to 511 SDBs).
 	 */
-	sample_size = event_sample_size(hwc);
+	sample_size = sizeof(struct hws_basic_entry);
 	freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc));
 	factor = 1;
 	n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size));
@@ -629,10 +585,6 @@ static int reserve_pmc_hardware(void)
 
 static void hw_perf_event_destroy(struct perf_event *event)
 {
-	/* Free raw sample buffer */
-	if (RAWSAMPLE_REG(&event->hw))
-		kfree((void *) RAWSAMPLE_REG(&event->hw));
-
 	/* Release PMC if this is the last perf event */
 	if (!atomic_add_unless(&num_events, -1, 1)) {
 		mutex_lock(&pmc_reserve_mutex);
@@ -652,15 +604,8 @@ static void hw_init_period(struct hw_perf_event *hwc, u64 period)
 static void hw_reset_registers(struct hw_perf_event *hwc,
 			       unsigned long *sdbt_origin)
 {
-	struct sf_raw_sample *sfr;
-
 	/* (Re)set to first sample-data-block-table */
 	TEAR_REG(hwc) = (unsigned long) sdbt_origin;
-
-	/* (Re)set raw sampling buffer register */
-	sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(hwc);
-	memset(&sfr->basic, 0, sizeof(sfr->basic));
-	memset(&sfr->diag, 0, sfr->dsdes);
 }
 
 static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si,
@@ -986,22 +931,16 @@ static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
  *
  * Return non-zero if an event overflow occurred.
  */
-static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
+static int perf_push_sample(struct perf_event *event,
+			    struct hws_basic_entry *basic)
 {
 	int overflow;
 	struct pt_regs regs;
 	struct perf_sf_sde_regs *sde_regs;
 	struct perf_sample_data data;
-	struct perf_raw_record raw = {
-		.frag = {
-			.size = sfr->size,
-			.data = sfr,
-		},
-	};
 
 	/* Setup perf sample */
 	perf_sample_data_init(&data, 0, event->hw.last_period);
-	data.raw = &raw;
 
 	/* Setup pt_regs to look like an CPU-measurement external interrupt
 	 * using the Program Request Alert code.  The regs.int_parm_long
@@ -1013,11 +952,11 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 	regs.int_parm = CPU_MF_INT_SF_PRA;
 	sde_regs = (struct perf_sf_sde_regs *) &regs.int_parm_long;
 
-	psw_bits(regs.psw).ia	= sfr->basic.ia;
-	psw_bits(regs.psw).dat	= sfr->basic.T;
-	psw_bits(regs.psw).wait = sfr->basic.W;
-	psw_bits(regs.psw).pstate = sfr->basic.P;
-	psw_bits(regs.psw).as	= sfr->basic.AS;
+	psw_bits(regs.psw).ia	= basic->ia;
+	psw_bits(regs.psw).dat	= basic->T;
+	psw_bits(regs.psw).wait = basic->W;
+	psw_bits(regs.psw).pstate = basic->P;
+	psw_bits(regs.psw).as	= basic->AS;
 
 	/*
 	 * Use the hardware provided configuration level to decide if the
@@ -1030,7 +969,7 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 	 * If the value differs from 0xffff (the host value), we assume to
 	 * be a KVM guest.
 	 */
-	switch (sfr->basic.CL) {
+	switch (basic->CL) {
 	case 1: /* logical partition */
 		sde_regs->in_guest = 0;
 		break;
@@ -1038,7 +977,7 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 		sde_regs->in_guest = 1;
 		break;
 	default: /* old machine, use heuristics */
-		if (sfr->basic.gpp || sfr->basic.prim_asn != 0xffff)
+		if (basic->gpp || basic->prim_asn != 0xffff)
 			sde_regs->in_guest = 1;
 		break;
 	}
@@ -1060,75 +999,12 @@ static void perf_event_count_update(struct perf_event *event, u64 count)
 	local64_add(count, &event->count);
 }
 
-static int sample_format_is_valid(struct hws_combined_entry *sample,
-				   unsigned int flags)
-{
-	if (likely(flags & PERF_CPUM_SF_BASIC_MODE))
-		/* Only basic-sampling data entries with data-entry-format
-		 * version of 0x0001 can be processed.
-		 */
-		if (sample->basic.def != 0x0001)
-			return 0;
-	if (flags & PERF_CPUM_SF_DIAG_MODE)
-		/* The data-entry-format number of diagnostic-sampling data
-		 * entries can vary.  Because diagnostic data is just passed
-		 * through, do only a sanity check on the DEF.
-		 */
-		if (sample->diag.def < 0x8001)
-			return 0;
-	return 1;
-}
-
-static int sample_is_consistent(struct hws_combined_entry *sample,
-				unsigned long flags)
-{
-	/* This check applies only to basic-sampling data entries of potentially
-	 * combined-sampling data entries.  Invalid entries cannot be processed
-	 * by the PMU and, thus, do not deliver an associated
-	 * diagnostic-sampling data entry.
-	 */
-	if (unlikely(!(flags & PERF_CPUM_SF_BASIC_MODE)))
-		return 0;
-	/*
-	 * Samples are skipped, if they are invalid or for which the
-	 * instruction address is not predictable, i.e., the wait-state bit is
-	 * set.
-	 */
-	if (sample->basic.I || sample->basic.W)
-		return 0;
-	return 1;
-}
-
-static void reset_sample_slot(struct hws_combined_entry *sample,
-			      unsigned long flags)
-{
-	if (likely(flags & PERF_CPUM_SF_BASIC_MODE))
-		sample->basic.def = 0;
-	if (flags & PERF_CPUM_SF_DIAG_MODE)
-		sample->diag.def = 0;
-}
-
-static void sfr_store_sample(struct sf_raw_sample *sfr,
-			     struct hws_combined_entry *sample)
-{
-	if (likely(sfr->format & PERF_CPUM_SF_BASIC_MODE))
-		sfr->basic = sample->basic;
-	if (sfr->format & PERF_CPUM_SF_DIAG_MODE)
-		memcpy(&sfr->diag, &sample->diag, sfr->dsdes);
-}
-
-static void debug_sample_entry(struct hws_combined_entry *sample,
-			       struct hws_trailer_entry *te,
-			       unsigned long flags)
+static void debug_sample_entry(struct hws_basic_entry *sample,
+			       struct hws_trailer_entry *te)
 {
 	debug_sprintf_event(sfdbg, 4, "hw_collect_samples: Found unknown "
-			    "sampling data entry: te->f=%i basic.def=%04x (%p)"
-			    " diag.def=%04x (%p)\n", te->f,
-			    sample->basic.def, &sample->basic,
-			    (flags & PERF_CPUM_SF_DIAG_MODE)
-					? sample->diag.def : 0xFFFF,
-			    (flags & PERF_CPUM_SF_DIAG_MODE)
-					?  &sample->diag : NULL);
+			    "sampling data entry: te->f=%i basic.def=%04x (%p)\n",
+			    te->f, sample->def, sample);
 }
 
 /* hw_collect_samples() - Walk through a sample-data-block and collect samples
@@ -1154,44 +1030,37 @@ static void debug_sample_entry(struct hws_combined_entry *sample,
 static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
 			       unsigned long long *overflow)
 {
-	unsigned long flags = SAMPL_FLAGS(&event->hw);
-	struct hws_combined_entry *sample;
 	struct hws_trailer_entry *te;
-	struct sf_raw_sample *sfr;
-	size_t sample_size;
-
-	/* Prepare and initialize raw sample data */
-	sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(&event->hw);
-	sfr->format = flags & PERF_CPUM_SF_MODE_MASK;
+	struct hws_basic_entry *sample;
 
-	sample_size = event_sample_size(&event->hw);
 	te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
-	sample = (struct hws_combined_entry *) *sdbt;
+	sample = (struct hws_basic_entry *) *sdbt;
 	while ((unsigned long *) sample < (unsigned long *) te) {
 		/* Check for an empty sample */
-		if (!sample->basic.def)
+		if (!sample->def)
 			break;
 
 		/* Update perf event period */
 		perf_event_count_update(event, SAMPL_RATE(&event->hw));
 
-		/* Check sampling data entry */
-		if (sample_format_is_valid(sample, flags)) {
+		/* Check whether sample is valid */
+		if (sample->def == 0x0001) {
 			/* If an event overflow occurred, the PMU is stopped to
 			 * throttle event delivery.  Remaining sample data is
 			 * discarded.
 			 */
 			if (!*overflow) {
-				if (sample_is_consistent(sample, flags)) {
+				/* Check whether sample is consistent */
+				if (sample->I == 0 && sample->W == 0) {
 					/* Deliver sample data to perf */
-					sfr_store_sample(sfr, sample);
-					*overflow = perf_push_sample(event, sfr);
+					*overflow = perf_push_sample(event,
+								     sample);
 				}
 			} else
 				/* Count discarded samples */
 				*overflow += 1;
 		} else {
-			debug_sample_entry(sample, te, flags);
+			debug_sample_entry(sample, te);
 			/* Sample slot is not yet written or other record.
 			 *
 			 * This condition can occur if the buffer was reused
@@ -1207,8 +1076,8 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
 		}
 
 		/* Reset sample slot and advance to next sample */
-		reset_sample_slot(sample, flags);
-		sample += sample_size;
+		sample->def = 0;
+		sample++;
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d4c7e649d7bf17792629dbeaf25945e26a32894f Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Fri, 27 Oct 2017 15:45:19 +0200
Subject: s390/cpum_sf: load program parameter at sampler enablement

The lpp instruction is used to place the PID of the current
task in the program-parameter (PP) register.  The register
contents are then included in the sampling data entries.

The lpp instruction loads the PP register only when at least
one sampling function is enabled.  Otherwise it is executed
as a no-op.

Linux calls lpp at context switch.  If the context switch
happens before the sampler is enabled, the PP register is
empty.  That means, the PID of the task that is sampled is
not stored in sampling data until the next context switch.

Hence, always call lpp when enabling the sampler.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/cpu_mf.h  | 6 ++++++
 arch/s390/kernel/perf_cpum_sf.c | 3 +++
 2 files changed, 9 insertions(+)

(limited to 'arch/s390/include/asm')

diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 7364130a29c8..792cda339af1 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -144,6 +144,12 @@ struct hws_trailer_entry {
 	unsigned long long progusage2;	 /*				      */
 } __packed;
 
+/* Load program parameter */
+static inline void lpp(void *pp)
+{
+	asm volatile(".insn s,0xb2800000,0(%0)\n":: "a" (pp) : "memory");
+}
+
 /* Query counter information */
 static inline int qctri(struct cpumf_ctr_info *info)
 {
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 7e9b9e6ee821..dbb62c05805d 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -848,6 +848,9 @@ static void cpumsf_pmu_enable(struct pmu *pmu)
 		return;
 	}
 
+	/* Load current program parameter */
+	lpp(&S390_lowcore.lpp);
+
 	debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i "
 			    "tear=%p dear=%p\n", cpuhw->lsctl.es, cpuhw->lsctl.cs,
 			    cpuhw->lsctl.ed, cpuhw->lsctl.cd,
-- 
cgit v1.2.3-59-g8ed1b


From 544e8dd7a8e49d22b4315fc232479bc02b417b46 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2016 14:00:23 +0100
Subject: s390/cpum_sf: correctly set the PID and TID in perf samples

The hardware sampler creates samples that are processed at a later
point in time.  The PID and TID values of the perf samples that are
created for hardware samples are initialized with values from the
current task.  Hence, the PID and TID values are not correct and
perf samples are associated with wrong processes.

The PID and TID values are obtained from the Host Program Parameter
(HPP) field in the basic-sampling data entries.  These PIDs are
valid in the init PID namespace.  Ensure that the PIDs in the perf
samples are resolved considering the PID namespace in which the
perf event was created.

To correct the PID and TID values in the created perf samples,
a special overflow handler is installed.  It replaces the default
overflow handler and does not become effective if any other
overflow handler is used.  With the special overflow handler most
of the perf samples are associated with the right processes.
For processes that no longer exist, the association might
still be wrong.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/setup.h   |  2 +-
 arch/s390/kernel/perf_cpum_sf.c | 76 +++++++++++++++++++++++++++++++++++++++++
 arch/s390/mm/fault.c            |  2 +-
 3 files changed, 78 insertions(+), 2 deletions(-)

(limited to 'arch/s390/include/asm')

diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 8bc87dcb10eb..2eb0c8a7b664 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -36,7 +36,7 @@
 #define MACHINE_FLAG_SCC	_BITUL(17)
 
 #define LPP_MAGIC		_BITUL(31)
-#define LPP_PFAULT_PID_MASK	_AC(0xffffffff, UL)
+#define LPP_PID_MASK		_AC(0xffffffff, UL)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index dbb62c05805d..227b38bd82c9 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -15,6 +15,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/percpu.h>
+#include <linux/pid.h>
 #include <linux/notifier.h>
 #include <linux/export.h>
 #include <linux/slab.h>
@@ -615,6 +616,67 @@ static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si,
 		       si->min_sampl_rate, si->max_sampl_rate);
 }
 
+static u32 cpumsf_pid_type(struct perf_event *event,
+			   u32 pid, enum pid_type type)
+{
+	struct task_struct *tsk;
+
+	/* Idle process */
+	if (!pid)
+		goto out;
+
+	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+	pid = -1;
+	if (tsk) {
+		/*
+		 * Only top level events contain the pid namespace in which
+		 * they are created.
+		 */
+		if (event->parent)
+			event = event->parent;
+		pid = __task_pid_nr_ns(tsk, type, event->ns);
+		/*
+		 * See also 1d953111b648
+		 * "perf/core: Don't report zero PIDs for exiting tasks".
+		 */
+		if (!pid && !pid_alive(tsk))
+			pid = -1;
+	}
+out:
+	return pid;
+}
+
+static void cpumsf_output_event_pid(struct perf_event *event,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
+{
+	u32 pid;
+	struct perf_event_header header;
+	struct perf_output_handle handle;
+
+	/*
+	 * Obtain the PID from the basic-sampling data entry and
+	 * correct the data->tid_entry.pid value.
+	 */
+	pid = data->tid_entry.pid;
+
+	/* Protect callchain buffers, tasks */
+	rcu_read_lock();
+
+	perf_prepare_sample(&header, data, event, regs);
+	if (perf_output_begin(&handle, event, header.size))
+		goto out;
+
+	/* Update the process ID (see also kernel/events/core.c) */
+	data->tid_entry.pid = cpumsf_pid_type(event, pid, __PIDTYPE_TGID);
+	data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID);
+
+	perf_output_sample(&handle, &header, data, event);
+	perf_output_end(&handle);
+out:
+	rcu_read_unlock();
+}
+
 static int __hw_perf_event_init(struct perf_event *event)
 {
 	struct cpu_hw_sf *cpuhw;
@@ -748,6 +810,14 @@ static int __hw_perf_event_init(struct perf_event *event)
 				break;
 		}
 	}
+
+	/* If PID/TID sampling is active, replace the default overflow
+	 * handler to extract and resolve the PIDs from the basic-sampling
+	 * data entries.
+	 */
+	if (event->attr.sample_type & PERF_SAMPLE_TID)
+		if (is_default_overflow_handler(event))
+			event->overflow_handler = cpumsf_output_event_pid;
 out:
 	return err;
 }
@@ -985,6 +1055,12 @@ static int perf_push_sample(struct perf_event *event,
 		break;
 	}
 
+	/*
+	 * Store the PID value from the sample-data-entry to be
+	 * processed and resolved by cpumsf_output_event_pid().
+	 */
+	data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
+
 	overflow = 0;
 	if (perf_exclude_event(event, &regs, sde_regs))
 		goto out;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 14654007dce4..93faeca52284 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -728,7 +728,7 @@ static void pfault_interrupt(struct ext_code ext_code,
 		return;
 	inc_irq_stat(IRQEXT_PFL);
 	/* Get the token (= pid of the affected task). */
-	pid = param64 & LPP_PFAULT_PID_MASK;
+	pid = param64 & LPP_PID_MASK;
 	rcu_read_lock();
 	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
 	if (tsk)
-- 
cgit v1.2.3-59-g8ed1b