diff options
Diffstat (limited to 'arch/s390/kernel')
62 files changed, 2674 insertions, 924 deletions
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 80f500ffb55c..5e6a23299790 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -33,16 +33,16 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o +obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o -obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o +obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o text_amode31.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o -extra-y += head64.o vmlinux.lds +extra-y += vmlinux.lds obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) @@ -55,10 +55,11 @@ compat-obj-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KPROBES) += kprobes_insn_page.o -obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o +obj-$(CONFIG_KPROBES) += mcount.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o @@ -71,6 +72,7 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o obj-$(CONFIG_TRACEPOINTS) += trace.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c new file mode 100644 index 000000000000..fb92e8ed0525 --- /dev/null +++ b/arch/s390/kernel/abs_lowcore.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/pgtable.h> +#include <asm/abs_lowcore.h> + +#define ABS_LOWCORE_UNMAPPED 1 +#define ABS_LOWCORE_LAP_ON 2 +#define ABS_LOWCORE_IRQS_ON 4 + +unsigned long __bootdata_preserved(__abs_lowcore); +bool __ro_after_init abs_lowcore_mapped; + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + unsigned long phys = __pa(lc); + int rc, i; + + for (i = 0; i < LC_PAGES; i++) { + rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc); + if (rc) { + /* + * Do not unmap allocated page tables in case the + * allocation was not requested. In such a case the + * request is expected coming from an atomic context, + * while the unmap attempt might sleep. + */ + if (alloc) { + for (--i; i >= 0; i--) { + addr -= PAGE_SIZE; + vmem_unmap_4k_page(addr); + } + } + return rc; + } + addr += PAGE_SIZE; + phys += PAGE_SIZE; + } + return 0; +} + +void abs_lowcore_unmap(int cpu) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + int i; + + for (i = 0; i < LC_PAGES; i++) { + vmem_unmap_4k_page(addr); + addr += PAGE_SIZE; + } +} + +struct lowcore *get_abs_lowcore(unsigned long *flags) +{ + unsigned long irq_flags; + union ctlreg0 cr0; + int cpu; + + *flags = 0; + cpu = get_cpu(); + if (abs_lowcore_mapped) { + return ((struct lowcore *)__abs_lowcore) + cpu; + } else { + if (cpu != 0) + panic("Invalid unmapped absolute lowcore access\n"); + local_irq_save(irq_flags); + if (!irqs_disabled_flags(irq_flags)) + *flags |= ABS_LOWCORE_IRQS_ON; + __ctl_store(cr0.val, 0, 0); + if (cr0.lap) { + *flags |= ABS_LOWCORE_LAP_ON; + __ctl_clear_bit(0, 28); + } + *flags |= ABS_LOWCORE_UNMAPPED; + return lowcore_ptr[0]; + } +} + +void put_abs_lowcore(struct lowcore *lc, unsigned long flags) +{ + if (abs_lowcore_mapped) { + if (flags) + panic("Invalid mapped absolute lowcore release\n"); + } else { + if (smp_processor_id() != 0) + panic("Invalid mapped absolute lowcore access\n"); + if (!(flags & ABS_LOWCORE_UNMAPPED)) + panic("Invalid unmapped absolute lowcore release\n"); + if (flags & ABS_LOWCORE_LAP_ON) + __ctl_set_bit(0, 28); + if (flags & ABS_LOWCORE_IRQS_ON) + local_irq_enable(); + } + put_cpu(); +} diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index cce0ddee2d02..e7bca29f9c34 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -7,8 +7,6 @@ #include <asm/facility.h> #include <asm/nospec-branch.h> -#define MAX_PATCH_LEN (255 - 1) - static int __initdata_or_module alt_instr_disabled; static int __init disable_alternative_instructions(char *str) @@ -19,85 +17,30 @@ static int __init disable_alternative_instructions(char *str) early_param("noaltinstr", disable_alternative_instructions); -struct brcl_insn { - u16 opc; - s32 disp; -} __packed; - -static u16 __initdata_or_module nop16 = 0x0700; -static u32 __initdata_or_module nop32 = 0x47000000; -static struct brcl_insn __initdata_or_module nop48 = { - 0xc004, 0 -}; - -static const void *nops[] __initdata_or_module = { - &nop16, - &nop32, - &nop48 -}; - -static void __init_or_module add_jump_padding(void *insns, unsigned int len) -{ - struct brcl_insn brcl = { - 0xc0f4, - len / 2 - }; - - memcpy(insns, &brcl, sizeof(brcl)); - insns += sizeof(brcl); - len -= sizeof(brcl); - - while (len > 0) { - memcpy(insns, &nop16, 2); - insns += 2; - len -= 2; - } -} - -static void __init_or_module add_padding(void *insns, unsigned int len) -{ - if (len > 6) - add_jump_padding(insns, len); - else if (len >= 2) - memcpy(insns, nops[len / 2 - 1], len); -} - static void __init_or_module __apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; u8 *instr, *replacement; - u8 insnbuf[MAX_PATCH_LEN]; /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. */ for (a = start; a < end; a++) { - int insnbuf_sz = 0; - instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; if (!__test_facility(a->facility, alt_stfle_fac_list)) continue; - if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) { + if (unlikely(a->instrlen % 2)) { WARN_ONCE(1, "cpu alternatives instructions length is " "odd, skipping patching\n"); continue; } - memcpy(insnbuf, replacement, a->replacementlen); - insnbuf_sz = a->replacementlen; - - if (a->instrlen > a->replacementlen) { - add_padding(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); - insnbuf_sz += a->instrlen - a->replacementlen; - } - - s390_kernel_write(instr, insnbuf, insnbuf_sz); + s390_kernel_write(instr, replacement, a->instrlen); } } diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 8e00bb228662..d8ce965c0a97 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -32,6 +32,22 @@ int main(void) /* pt_regs offsets */ OFFSET(__PT_PSW, pt_regs, psw); OFFSET(__PT_GPRS, pt_regs, gprs); + OFFSET(__PT_R0, pt_regs, gprs[0]); + OFFSET(__PT_R1, pt_regs, gprs[1]); + OFFSET(__PT_R2, pt_regs, gprs[2]); + OFFSET(__PT_R3, pt_regs, gprs[3]); + OFFSET(__PT_R4, pt_regs, gprs[4]); + OFFSET(__PT_R5, pt_regs, gprs[5]); + OFFSET(__PT_R6, pt_regs, gprs[6]); + OFFSET(__PT_R7, pt_regs, gprs[7]); + OFFSET(__PT_R8, pt_regs, gprs[8]); + OFFSET(__PT_R9, pt_regs, gprs[9]); + OFFSET(__PT_R10, pt_regs, gprs[10]); + OFFSET(__PT_R11, pt_regs, gprs[11]); + OFFSET(__PT_R12, pt_regs, gprs[12]); + OFFSET(__PT_R13, pt_regs, gprs[13]); + OFFSET(__PT_R14, pt_regs, gprs[14]); + OFFSET(__PT_R15, pt_regs, gprs[15]); OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); OFFSET(__PT_FLAGS, pt_regs, flags); OFFSET(__PT_CR1, pt_regs, cr1); @@ -41,18 +57,16 @@ int main(void) /* stack_frame offsets */ OFFSET(__SF_BACKCHAIN, stack_frame, back_chain); OFFSET(__SF_GPRS, stack_frame, gprs); - OFFSET(__SF_EMPTY, stack_frame, empty1[0]); - OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[1]); - OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[2]); - OFFSET(__SF_SIE_REASON, stack_frame, empty1[3]); - OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[4]); + OFFSET(__SF_EMPTY, stack_frame, empty[0]); + OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block); + OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); + OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); + OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); /* idle data offsets */ OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); - OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit); OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter); - OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit); OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter); BLANK(); /* hardware defined lowcore locations 0x000 - 0x1ff */ @@ -123,14 +137,12 @@ int main(void) OFFSET(__LC_USER_ASCE, lowcore, user_asce); OFFSET(__LC_LPP, lowcore, lpp); OFFSET(__LC_CURRENT_PID, lowcore, current_pid); - OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); - OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); - OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); - OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline); OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); + OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info); + OFFSET(__LC_OS_INFO, lowcore, os_info); /* hardware defined lowcore locations 0x1000 - 0x18ff */ OFFSET(__LC_MCESAD, lowcore, mcesad); OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2); diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S deleted file mode 100644 index d255c69c1779..000000000000 --- a/arch/s390/kernel/base.S +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * arch/s390/kernel/base.S - * - * Copyright IBM Corp. 2006, 2007 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> - * Michael Holzheu <holzheu@de.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/nospec-insn.h> -#include <asm/ptrace.h> - - GEN_BR_THUNK %r9 - GEN_BR_THUNK %r14 - -ENTRY(s390_base_pgm_handler) - stmg %r0,%r15,__LC_SAVE_AREA_SYNC - basr %r13,0 -0: aghi %r15,-STACK_FRAME_OVERHEAD - larl %r1,s390_base_pgm_handler_fn - lg %r9,0(%r1) - ltgr %r9,%r9 - jz 1f - BASR_EX %r14,%r9 - lmg %r0,%r15,__LC_SAVE_AREA_SYNC - lpswe __LC_PGM_OLD_PSW -1: lpswe disabled_wait_psw-0b(%r13) -ENDPROC(s390_base_pgm_handler) - - .align 8 -disabled_wait_psw: - .quad 0x0002000180000000,0x0000000000000000 + s390_base_pgm_handler - - .section .bss - .align 8 - .globl s390_base_pgm_handler_fn -s390_base_pgm_handler_fn: - .quad 0 - .previous diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c index d66825e53fce..7ee3651d00ab 100644 --- a/arch/s390/kernel/cache.c +++ b/arch/s390/kernel/cache.c @@ -3,7 +3,6 @@ * Extract CPU cache information and expose them via sysfs. * * Copyright IBM Corp. 2012 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/seq_file.h> @@ -71,8 +70,6 @@ void show_cacheinfo(struct seq_file *m) struct cacheinfo *cache; int idx; - if (!test_facility(34)) - return; this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask)); for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { cache = this_cpu_ci->info_list + idx; @@ -132,8 +129,6 @@ int init_cache_level(unsigned int cpu) union cache_topology ct; enum cache_type ctype; - if (!test_facility(34)) - return -EOPNOTSUPP; if (!this_cpu_ci) return -EINVAL; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); @@ -157,8 +152,6 @@ int populate_cache_leaves(unsigned int cpu) union cache_topology ct; enum cache_type ctype; - if (!test_facility(34)) - return -EOPNOTSUPP; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); for (idx = 0, level = 0; level < this_cpu_ci->num_levels && idx < this_cpu_ci->num_leaves; idx++, level++) { diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index 64509e7dbd3b..ef23739b277c 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -5,69 +5,59 @@ #include <linux/compat.h> #include <linux/socket.h> #include <linux/syscalls.h> +#include <asm/ptrace.h> -/* Macro that masks the high order bit of an 32 bit pointer and converts it*/ -/* to a 64 bit pointer */ -#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) -#define AA(__x) \ - ((unsigned long)(__x)) +/* + * Macro that masks the high order bit of a 32 bit pointer and + * converts it to a 64 bit pointer. + */ +#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) +#define AA(__x) ((unsigned long)(__x)) /* Now 32bit compatibility types */ struct ipc_kludge_32 { - __u32 msgp; /* pointer */ - __s32 msgtyp; + __u32 msgp; /* pointer */ + __s32 msgtyp; }; /* asm/sigcontext.h */ -typedef union -{ - __u64 d; - __u32 f; +typedef union { + __u64 d; + __u32 f; } freg_t32; -typedef struct -{ +typedef struct { unsigned int fpc; unsigned int pad; - freg_t32 fprs[__NUM_FPRS]; + freg_t32 fprs[__NUM_FPRS]; } _s390_fp_regs32; -typedef struct -{ - __u32 mask; - __u32 addr; -} _psw_t32 __attribute__ ((aligned(8))); - -typedef struct -{ - _psw_t32 psw; +typedef struct { + psw_t32 psw; __u32 gprs[__NUM_GPRS]; __u32 acrs[__NUM_ACRS]; } _s390_regs_common32; -typedef struct -{ +typedef struct { _s390_regs_common32 regs; - _s390_fp_regs32 fpregs; + _s390_fp_regs32 fpregs; } _sigregs32; -typedef struct -{ - __u32 gprs_high[__NUM_GPRS]; - __u64 vxrs_low[__NUM_VXRS_LOW]; - __vector128 vxrs_high[__NUM_VXRS_HIGH]; - __u8 __reserved[128]; +typedef struct { + __u32 gprs_high[__NUM_GPRS]; + __u64 vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + __u8 __reserved[128]; } _sigregs_ext32; #define _SIGCONTEXT_NSIG32 64 #define _SIGCONTEXT_NSIG_BPW32 32 #define __SIGNAL_FRAMESIZE32 96 -#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2) +#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2) -struct sigcontext32 -{ +struct sigcontext32 { __u32 oldmask[_COMPAT_NSIG_WORDS]; - __u32 sregs; /* pointer */ + __u32 sregs; /* pointer */ }; /* asm/signal.h */ @@ -75,11 +65,11 @@ struct sigcontext32 /* asm/ucontext.h */ struct ucontext32 { __u32 uc_flags; - __u32 uc_link; /* pointer */ + __u32 uc_link; /* pointer */ compat_stack_t uc_stack; _sigregs32 uc_mcontext; compat_sigset_t uc_sigmask; - /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ unsigned char __unused[128 - sizeof(compat_sigset_t)]; _sigregs_ext32 uc_mcontext_ext; }; @@ -88,25 +78,6 @@ struct stat64_emu31; struct mmap_arg_struct_emu31; struct fadvise64_64_args; -long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group); -long compat_sys_s390_setregid16(u16 rgid, u16 egid); -long compat_sys_s390_setgid16(u16 gid); -long compat_sys_s390_setreuid16(u16 ruid, u16 euid); -long compat_sys_s390_setuid16(u16 uid); -long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid); -long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid); -long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid); -long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid); -long compat_sys_s390_setfsuid16(u16 uid); -long compat_sys_s390_setfsgid16(u16 gid); -long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_getuid16(void); -long compat_sys_s390_geteuid16(void); -long compat_sys_s390_getgid16(void); -long compat_sys_s390_getegid16(void); long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); @@ -118,8 +89,8 @@ long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbu long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count); -long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count); +long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count); +long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count); long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index cca142fbb516..eee1ad3e1b29 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -89,7 +89,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) _sigregs32 user_sregs; int i; - /* Alwys make any pending restarted system call return -EINTR */ + /* Always make any pending restarted system call return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c new file mode 100644 index 000000000000..1b2ae42a0c15 --- /dev/null +++ b/arch/s390/kernel/cpufeature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2022 + */ + +#include <linux/cpufeature.h> +#include <linux/bug.h> +#include <asm/elf.h> + +enum { + TYPE_HWCAP, + TYPE_FACILITY, +}; + +struct s390_cpu_feature { + unsigned int type : 4; + unsigned int num : 28; +}; + +static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = { + [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA}, + [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS}, + [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158}, +}; + +/* + * cpu_have_feature - Test CPU features on module initialization + */ +int cpu_have_feature(unsigned int num) +{ + struct s390_cpu_feature *feature; + + if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES)) + return 0; + feature = &s390_cpu_features[num]; + switch (feature->type) { + case TYPE_HWCAP: + return !!(elf_hwcap & BIT(feature->num)); + case TYPE_FACILITY: + return test_facility(feature->num); + default: + WARN_ON_ONCE(1); + return 0; + } +} +EXPORT_SYMBOL(cpu_have_feature); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 785d54c9350c..dd74fe664ed1 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -15,11 +15,13 @@ #include <linux/slab.h> #include <linux/memblock.h> #include <linux/elf.h> +#include <linux/uio.h> #include <asm/asm-offsets.h> #include <asm/os_info.h> #include <asm/elf.h> #include <asm/ipl.h> #include <asm/sclp.h> +#include <asm/maccess.h> #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -60,9 +62,9 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) { struct save_area *sa; - sa = (void *) memblock_phys_alloc(sizeof(*sa), 8); + sa = memblock_alloc(sizeof(*sa), 8); if (!sa) - panic("Failed to allocate save area\n"); + return NULL; if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); @@ -113,121 +115,60 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); } -/* - * Return physical address for virtual address - */ -static inline void *load_real_addr(void *addr) -{ - unsigned long real_addr; - - asm volatile( - " lra %0,0(%1)\n" - " jz 0f\n" - " la %0,0\n" - "0:" - : "=a" (real_addr) : "a" (addr) : "cc"); - return (void *)real_addr; -} - -/* - * Copy memory of the old, dumped system to a kernel space virtual address - */ -int copy_oldmem_kernel(void *dst, void *src, size_t count) +static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long from, len; - void *ra; - int rc; + size_t len, copied, res = 0; while (count) { - from = __pa(src); - if (!oldmem_data.start && from < sclp.hsa_size) { + if (!oldmem_data.start && src < sclp.hsa_size) { /* Copy from zfcp/nvme dump HSA area */ - len = min(count, sclp.hsa_size - from); - rc = memcpy_hsa_kernel(dst, from, len); - if (rc) - return rc; + len = min(count, sclp.hsa_size - src); + copied = memcpy_hsa_iter(iter, src, len); } else { /* Check for swapped kdump oldmem areas */ - if (oldmem_data.start && from - oldmem_data.start < oldmem_data.size) { - from -= oldmem_data.start; - len = min(count, oldmem_data.size - from); - } else if (oldmem_data.start && from < oldmem_data.size) { - len = min(count, oldmem_data.size - from); - from += oldmem_data.start; + if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) { + src -= oldmem_data.start; + len = min(count, oldmem_data.size - src); + } else if (oldmem_data.start && src < oldmem_data.size) { + len = min(count, oldmem_data.size - src); + src += oldmem_data.start; } else { len = count; } - if (is_vmalloc_or_module_addr(dst)) { - ra = load_real_addr(dst); - len = min(PAGE_SIZE - offset_in_page(ra), len); - } else { - ra = dst; - } - if (memcpy_real(ra, (void *) from, len)) - return -EFAULT; + copied = memcpy_real_iter(iter, src, len); } - dst += len; - src += len; - count -= len; + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - return 0; + return res; } -/* - * Copy memory of the old, dumped system to a user space virtual address - */ -static int copy_oldmem_user(void __user *dst, void *src, size_t count) +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) { - unsigned long from, len; - int rc; + struct iov_iter iter; + struct kvec kvec; - while (count) { - from = __pa(src); - if (!oldmem_data.start && from < sclp.hsa_size) { - /* Copy from zfcp/nvme dump HSA area */ - len = min(count, sclp.hsa_size - from); - rc = memcpy_hsa_user(dst, from, len); - if (rc) - return rc; - } else { - /* Check for swapped kdump oldmem areas */ - if (oldmem_data.start && from - oldmem_data.start < oldmem_data.size) { - from -= oldmem_data.start; - len = min(count, oldmem_data.size - from); - } else if (oldmem_data.start && from < oldmem_data.size) { - len = min(count, oldmem_data.size - from); - from += oldmem_data.start; - } else { - len = count; - } - rc = copy_to_user_real(dst, (void *) from, count); - if (rc) - return rc; - } - dst += len; - src += len; - count -= len; - } + kvec.iov_base = dst; + kvec.iov_len = count; + iov_iter_kvec(&iter, WRITE, &kvec, 1, count); + if (copy_oldmem_iter(&iter, src, count) < count) + return -EFAULT; return 0; } /* * Copy one page from "oldmem" */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - void *src; - int rc; + unsigned long src; - if (!csize) - return 0; - src = (void *) (pfn << PAGE_SHIFT) + offset; - if (userbuf) - rc = copy_oldmem_user((void __force __user *) buf, src, csize); - else - rc = copy_oldmem_kernel((void *) buf, src, csize); - return rc; + src = pfn_to_phys(pfn) + offset; + return copy_oldmem_iter(iter, src, csize); } /* @@ -429,10 +370,10 @@ static void *nt_prpsinfo(void *ptr) static void *get_vmcoreinfo_old(unsigned long *size) { char nt_name[11], *vmcoreinfo; + unsigned long addr; Elf64_Nhdr note; - void *addr; - if (copy_oldmem_kernel(&addr, &S390_lowcore.vmcore_info, sizeof(addr))) + if (copy_oldmem_kernel(&addr, __LC_VMCORE_INFO, sizeof(addr))) return NULL; memset(nt_name, 0, sizeof(nt_name)); if (copy_oldmem_kernel(¬e, addr, sizeof(note))) diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 4331c7e6e1c0..d7a82066a638 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -250,7 +250,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, rc->level = level; rc->buf_size = buf_size; rc->entry_size = sizeof(debug_entry_t) + buf_size; - strlcpy(rc->name, name, sizeof(rc->name)); + strscpy(rc->name, name, sizeof(rc->name)); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); refcount_set(&(rc->ref_count), 0); diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 76a656b2146f..a778714e4d8b 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -11,6 +11,7 @@ #include <linux/cpu.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/trace/diag.h> #include <asm/sections.h> diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index ec5515423f17..90bbb4ea1d08 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -278,6 +278,7 @@ static const unsigned char formats[][6] = { [INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 }, [INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 }, [INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 }, + [INSTR_SIY_RD] = { D20_20, B_16, 0, 0, 0, 0 }, [INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 }, [INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 }, [INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 }, diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 0681c55e831d..1e3233eb510a 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -224,5 +224,5 @@ void __noreturn die(struct pt_regs *regs, const char *str) if (panic_on_oops) panic("Fatal exception: panic_on_oops"); oops_exit(); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 3cdf68c53614..6030fdd6997b 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -2,7 +2,6 @@ /* * Copyright IBM Corp. 2007, 2009 * Author(s): Hongjie Yang <hongjie@us.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com> */ #define KMSG_COMPONENT "setup" @@ -18,6 +17,7 @@ #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/kernel.h> +#include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/ipl.h> @@ -149,34 +149,21 @@ static __init void setup_topology(void) topology_max_mnest = max_mnest; } -static void early_pgm_check_handler(void) +void __do_early_pgm_check(struct pt_regs *regs) { - const struct exception_table_entry *fixup; - unsigned long cr0, cr0_new; - unsigned long addr; - - addr = S390_lowcore.program_old_psw.addr; - fixup = s390_search_extables(addr); - if (!fixup) + if (!fixup_exception(regs)) disabled_wait(); - /* Disable low address protection before storing into lowcore. */ - __ctl_store(cr0, 0, 0); - cr0_new = cr0 & ~(1UL << 28); - __ctl_load(cr0_new, 0, 0); - S390_lowcore.program_old_psw.addr = extable_fixup(fixup); - __ctl_load(cr0, 0, 0); } static noinline __init void setup_lowcore_early(void) { psw_t psw; - psw.addr = (unsigned long)s390_base_pgm_handler; + psw.addr = (unsigned long)early_pgm_check_handler; psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; if (IS_ENABLED(CONFIG_KASAN)) psw.mask |= PSW_MASK_DAT; S390_lowcore.program_new_psw = psw; - s390_base_pgm_handler_fn = early_pgm_check_handler; S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; } @@ -280,7 +267,7 @@ char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { /* copy arch command line */ - strlcpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); } static void __init check_image_bootable(void) @@ -294,6 +281,11 @@ static void __init check_image_bootable(void) disabled_wait(); } +static void __init sort_amode31_extable(void) +{ + sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table); +} + void __init startup_init(void) { sclp_early_adjust_va(); @@ -302,6 +294,7 @@ void __init startup_init(void) time_early_init(); init_kernel_storage_key(); lockdep_off(); + sort_amode31_extable(); setup_lowcore_early(); setup_facility_list(); detect_machine_type(); diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S new file mode 100644 index 000000000000..f521c6da37b8 --- /dev/null +++ b/arch/s390/kernel/earlypgm.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2006, 2007 + * Author(s): Michael Holzheu <holzheu@de.ibm.com> + */ + +#include <linux/linkage.h> +#include <asm/asm-offsets.h> + +ENTRY(early_pgm_check_handler) + stmg %r8,%r15,__LC_SAVE_AREA_SYNC + aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + lgr %r2,%r11 + brasl %r14,__do_early_pgm_check + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + lpswe __LC_RETURN_PSW +ENDPROC(early_pgm_check_handler) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 01bae1d51113..d2a1f2f4f5b8 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -6,11 +6,11 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), * Hartmut Penner (hp@de.ibm.com), * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), - * Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/init.h> #include <linux/linkage.h> +#include <asm/asm-extable.h> #include <asm/alternative-asm.h> #include <asm/processor.h> #include <asm/cache.h> @@ -29,23 +29,6 @@ #include <asm/export.h> #include <asm/nospec-insn.h> -__PT_R0 = __PT_GPRS -__PT_R1 = __PT_GPRS + 8 -__PT_R2 = __PT_GPRS + 16 -__PT_R3 = __PT_GPRS + 24 -__PT_R4 = __PT_GPRS + 32 -__PT_R5 = __PT_GPRS + 40 -__PT_R6 = __PT_GPRS + 48 -__PT_R7 = __PT_GPRS + 56 -__PT_R8 = __PT_GPRS + 64 -__PT_R9 = __PT_GPRS + 72 -__PT_R10 = __PT_GPRS + 80 -__PT_R11 = __PT_GPRS + 88 -__PT_R12 = __PT_GPRS + 96 -__PT_R13 = __PT_GPRS + 104 -__PT_R14 = __PT_GPRS + 112 -__PT_R15 = __PT_GPRS + 120 - STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER STACK_SIZE = 1 << STACK_SHIFT STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE @@ -53,19 +36,19 @@ STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE _LPP_OFFSET = __LC_LPP .macro STBEAR address - ALTERNATIVE "", ".insn s,0xb2010000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193 .endm .macro LBEAR address - ALTERNATIVE "", ".insn s,0xb2000000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193 .endm .macro LPSWEY address,lpswe - ALTERNATIVE "b \lpswe", ".insn siy,0xeb0000000071,\address,0", 193 + ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193 .endm .macro MBEAR reg - ALTERNATIVE "", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 .endm .macro CHECK_STACK savearea @@ -98,11 +81,6 @@ _LPP_OFFSET = __LC_LPP #endif .endm - .macro STCK savearea - ALTERNATIVE ".insn s,0xb2050000,\savearea", \ - ".insn s,0xb27c0000,\savearea", 25 - .endm - /* * The TSTMSK macro generates a test-under-mask instruction by * calculating the memory offset for the specified mask value. @@ -126,22 +104,22 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "", ".long 0xb2e8c000", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82 .endm .macro BPON - ALTERNATIVE "", ".long 0xb2e8d000", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82 .endm .macro BPENTER tif_ptr,tif_mask - ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .long 0xb2e8d000", \ - "", 82 + ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ + "j .+12; nop; nop", 82 .endm .macro BPEXIT tif_ptr,tif_mask TSTMSK \tif_ptr,\tif_mask - ALTERNATIVE "jz .+8; .long 0xb2e8c000", \ - "jnz .+8; .long 0xb2e8d000", 82 + ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82 .endm /* @@ -177,9 +155,19 @@ _LPP_OFFSET = __LC_LPP lgr %r14,\reg larl %r13,\start slgr %r14,%r13 - lghi %r13,\end - \start - clgr %r14,%r13 +#ifdef CONFIG_AS_IS_LLVM + clgfrl %r14,.Lrange_size\@ +#else + clgfi %r14,\end - \start +#endif jhe \outside_label +#ifdef CONFIG_AS_IS_LLVM + .section .rodata, "a" + .align 4 +.Lrange_size\@: + .long \end - \start + .previous +#endif .endm .macro SIEEXIT @@ -191,7 +179,6 @@ _LPP_OFFSET = __LC_LPP #endif GEN_BR_THUNK %r14 - GEN_BR_THUNK %r14,%r13 .section .kprobes.text, "ax" .Ldummy: @@ -232,7 +219,7 @@ ENTRY(__switch_to) aghi %r3,__TASK_pid mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 BR_EX %r14 ENDPROC(__switch_to) @@ -264,6 +251,10 @@ ENTRY(sie64a) BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) .Lsie_entry: sie 0(%r14) +# Let the next instruction be NOP to avoid triggering a machine check +# and handling it in a guest as result of the instruction execution. + nopr 7 +.Lsie_leave: BPOFF BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) .Lsie_skip: @@ -443,7 +434,7 @@ ENDPROC(pgm_check_handler) */ .macro INT_HANDLER name,lc_old_psw,handler ENTRY(\name) - STCK __LC_INT_CLOCK + stckf __LC_INT_CLOCK stpt __LC_SYS_ENTER_TIMER STBEAR __LC_LAST_BREAK BPOFF @@ -479,10 +470,7 @@ ENTRY(\name) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC MBEAR %r11 stmg %r8,%r9,__PT_PSW(%r11) - tm %r8,0x0001 # coming from user space? - jno 1f - lctlg %c1,%c1,__LC_KERNEL_ASCE -1: lgr %r2,%r11 # pass pointer to pt_regs + lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,\handler mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) tmhh %r8,0x0001 # returning to user ? @@ -515,7 +503,7 @@ ENTRY(psw_idle) .Lpsw_idle_stcctm: oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT BPON - STCK __CLOCK_IDLE_ENTER(%r2) + stckf __CLOCK_IDLE_ENTER(%r2) stpt __TIMER_IDLE_ENTER(%r2) lpswe __SF_EMPTY(%r15) .globl psw_idle_exit @@ -527,7 +515,7 @@ ENDPROC(psw_idle) * Machine check handler routines */ ENTRY(mcck_int_handler) - STCK __LC_MCCK_CLOCK + stckf __LC_MCCK_CLOCK BPOFF la %r1,4095 # validate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer @@ -563,7 +551,7 @@ ENTRY(mcck_int_handler) jno .Lmcck_panic #if IS_ENABLED(CONFIG_KVM) OUTSIDE %r9,.Lsie_gmap,.Lsie_done,6f - OUTSIDE %r9,.Lsie_entry,.Lsie_skip,4f + OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST j 5f 4: CHKSTG .Lmcck_panic @@ -608,6 +596,7 @@ ENTRY(mcck_int_handler) mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) la %r11,STACK_FRAME_OVERHEAD(%r1) + lgr %r2,%r11 lgr %r15,%r1 brasl %r14,s390_handle_mcck .Lmcck_return: @@ -618,7 +607,7 @@ ENTRY(mcck_int_handler) jno 0f BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER -0: ALTERNATIVE "", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 +0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 LBEAR 0(%r12) lmg %r11,%r15,__PT_R11(%r11) LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE @@ -654,7 +643,7 @@ ENTRY(mcck_int_handler) ENDPROC(mcck_int_handler) ENTRY(restart_int_handler) - ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 stg %r15,__LC_SAVE_AREA_RESTART TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 jz 0f diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 6083090be1f4..995ec7449feb 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -5,6 +5,7 @@ #include <linux/percpu.h> #include <linux/types.h> #include <linux/signal.h> +#include <asm/extable.h> #include <asm/ptrace.h> #include <asm/idle.h> @@ -16,10 +17,12 @@ void ext_int_handler(void); void io_int_handler(void); void mcck_int_handler(void); void restart_int_handler(void); +void early_pgm_check_handler(void); void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs); void __do_pgm_check(struct pt_regs *regs); void __do_syscall(struct pt_regs *regs, int per_trap); +void __do_early_pgm_check(struct pt_regs *regs); void do_protection_exception(struct pt_regs *regs); void do_dat_exception(struct pt_regs *regs); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 5510c7d10ddc..416b5a94353d 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -4,8 +4,7 @@ * * Copyright IBM Corp. 2009,2014 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * Martin Schwidefsky <schwidefsky@de.ibm.com> + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ #include <linux/moduleloader.h> @@ -61,18 +60,9 @@ asm( #ifdef CONFIG_EXPOLINE asm( " .align 16\n" - "ftrace_shared_hotpatch_trampoline_ex:\n" - " lmg %r0,%r1,2(%r1)\n" - " ex %r0," __stringify(__LC_BR_R1) "(%r0)\n" - " j .\n" - "ftrace_shared_hotpatch_trampoline_ex_end:\n" -); - -asm( - " .align 16\n" "ftrace_shared_hotpatch_trampoline_exrl:\n" " lmg %r0,%r1,2(%r1)\n" - " .insn ril,0xc60000000000,%r0,0f\n" /* exrl */ + " exrl %r0,0f\n" " j .\n" "0: br %r1\n" "ftrace_shared_hotpatch_trampoline_exrl_end:\n" @@ -91,12 +81,8 @@ static const char *ftrace_shared_hotpatch_trampoline(const char **end) tend = ftrace_shared_hotpatch_trampoline_br_end; #ifdef CONFIG_EXPOLINE if (!nospec_disable) { - tstart = ftrace_shared_hotpatch_trampoline_ex; - tend = ftrace_shared_hotpatch_trampoline_ex_end; - if (test_facility(35)) { /* exrl */ - tstart = ftrace_shared_hotpatch_trampoline_exrl; - tend = ftrace_shared_hotpatch_trampoline_exrl_end; - } + tstart = ftrace_shared_hotpatch_trampoline_exrl; + tend = ftrace_shared_hotpatch_trampoline_exrl_end; } #endif /* CONFIG_EXPOLINE */ if (end) @@ -159,37 +145,73 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return 0; } +static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec) +{ + struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_insn insn; + s64 disp; + u16 opc; + + if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn))) + return ERR_PTR(-EFAULT); + disp = (s64)insn.disp * 2; + trampoline = (void *)(rec->ip + disp); + if (get_kernel_nofault(opc, &trampoline->brasl_opc)) + return ERR_PTR(-EFAULT); + if (opc != 0xc015) + return ERR_PTR(-EINVAL); + return trampoline; +} + int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { + struct ftrace_hotpatch_trampoline *trampoline; + u64 old; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + if (get_kernel_nofault(old, &trampoline->interceptor)) + return -EFAULT; + if (old != old_addr) + return -EINVAL; + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); return 0; } -static void brcl_disable(void *brcl) +static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) { - u8 op = 0x04; /* set mask field to zero */ + u16 old; + u8 op; - s390_kernel_write((char *)brcl + 1, &op, sizeof(op)); + if (get_kernel_nofault(old, addr)) + return -EFAULT; + if (old != expected) + return -EINVAL; + /* set mask field to all ones or zeroes */ + op = enable ? 0xf4 : 0x04; + s390_kernel_write((char *)addr + 1, &op, sizeof(op)); + return 0; } int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - brcl_disable((void *)rec->ip); - return 0; -} - -static void brcl_enable(void *brcl) -{ - u8 op = 0xf4; /* set mask field to all ones */ - - s390_kernel_write((char *)brcl + 1, &op, sizeof(op)); + /* Expect brcl 0xf,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); } int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - brcl_enable((void *)rec->ip); - return 0; + struct ftrace_hotpatch_trampoline *trampoline; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); + /* Expect brcl 0x0,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true); } int ftrace_update_ftrace_func(ftrace_func_t func) @@ -203,14 +225,13 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) { /* * Flush any pre-fetched instructions on all * CPUs to make the new code visible. */ text_poke_sync_lock(); - return 0; } #ifdef CONFIG_MODULES @@ -262,14 +283,24 @@ NOKPROBE_SYMBOL(prepare_ftrace_return); */ int ftrace_enable_ftrace_graph_caller(void) { - brcl_disable(ftrace_graph_caller); + int rc; + + /* Expect brc 0xf,... */ + rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false); + if (rc) + return rc; text_poke_sync_lock(); return 0; } int ftrace_disable_ftrace_graph_caller(void) { - brcl_enable(ftrace_graph_caller); + int rc; + + /* Expect brc 0x0,... */ + rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true); + if (rc) + return rc; text_poke_sync_lock(); return 0; } @@ -290,9 +321,8 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, return; regs = ftrace_get_regs(fregs); - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) + if (!regs || unlikely(!p) || kprobe_disabled(p)) goto out; if (kprobe_running()) { @@ -318,7 +348,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, } __this_cpu_write(current_kprobe, NULL); out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h index 69e416f4c6b0..7f75a9616406 100644 --- a/arch/s390/kernel/ftrace.h +++ b/arch/s390/kernel/ftrace.h @@ -16,8 +16,6 @@ extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_start[]; extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_end[]; extern const char ftrace_shared_hotpatch_trampoline_br[]; extern const char ftrace_shared_hotpatch_trampoline_br_end[]; -extern const char ftrace_shared_hotpatch_trampoline_ex[]; -extern const char ftrace_shared_hotpatch_trampoline_ex_end[]; extern const char ftrace_shared_hotpatch_trampoline_exrl[]; extern const char ftrace_shared_hotpatch_trampoline_exrl_end[]; extern const char ftrace_plt_template[]; diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 42f9a325a257..d7b8b6ad574d 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -5,7 +5,6 @@ * Author(s): Hartmut Penner <hp@de.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * Rob van der Heij <rvdhei@iae.nl> - * Heiko Carstens <heiko.carstens@de.ibm.com> * */ diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 5ad1dde23dc5..325cbf69ebbd 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -4,7 +4,6 @@ * * Copyright IBM Corp. 2005, 2012 * Author(s): Michael Holzheu <holzheu@de.ibm.com> - * Heiko Carstens <heiko.carstens@de.ibm.com> * Volker Sameske <sameske@de.ibm.com> */ @@ -20,6 +19,7 @@ #include <linux/gfp.h> #include <linux/crash_dump.h> #include <linux/debug_locks.h> +#include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/ipl.h> #include <asm/smp.h> @@ -29,6 +29,7 @@ #include <asm/sclp.h> #include <asm/checksum.h> #include <asm/debug.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/sections.h> #include <asm/boot_data.h> @@ -1642,12 +1643,16 @@ static struct shutdown_action __refdata dump_action = { static void dump_reipl_run(struct shutdown_trigger *trigger) { unsigned long ipib = (unsigned long) reipl_block_actual; + struct lowcore *abs_lc; + unsigned long flags; unsigned int csum; csum = (__force unsigned int) csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); - mem_assign_absolute(S390_lowcore.ipib, ipib); - mem_assign_absolute(S390_lowcore.ipib_checksum, csum); + abs_lc = get_abs_lowcore(&flags); + abs_lc->ipib = ipib; + abs_lc->ipib_checksum = csum; + put_abs_lowcore(abs_lc, flags); dump_run(trigger); } diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 0df83ecaa2e0..45393919fe61 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -138,7 +138,7 @@ void noinstr do_io_irq(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); int from_idle; - irq_enter(); + irq_enter_rcu(); if (user_mode(regs)) { update_timer_sys(); @@ -158,7 +158,8 @@ void noinstr do_io_irq(struct pt_regs *regs) do_irq_async(regs, IO_INTERRUPT); } while (MACHINE_IS_LPAR && irq_pending(regs)); - irq_exit(); + irq_exit_rcu(); + set_irq_regs(old_regs); irqentry_exit(regs, state); @@ -172,7 +173,7 @@ void noinstr do_ext_irq(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); int from_idle; - irq_enter(); + irq_enter_rcu(); if (user_mode(regs)) { update_timer_sys(); @@ -190,7 +191,7 @@ void noinstr do_ext_irq(struct pt_regs *regs) do_irq_async(regs, EXT_INTERRUPT); - irq_exit(); + irq_exit_rcu(); set_irq_regs(old_regs); irqentry_exit(regs, state); @@ -204,7 +205,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) unsigned long flags; int cpu; - irq_lock_sparse(); + rcu_read_lock(); desc = irq_to_desc(irq); if (!desc) goto out; @@ -223,7 +224,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) seq_putc(p, '\n'); raw_spin_unlock_irqrestore(&desc->lock, flags); out: - irq_unlock_sparse(); + rcu_read_unlock(); } /* @@ -341,7 +342,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) struct ext_int_info *p; int index; - ext_code = *(struct ext_code *) ®s->int_code; + ext_code.int_code = regs->int_code; if (ext_code.code != EXT_IRQ_CLK_COMP) set_cpu_flag(CIF_NOHZ_DELAY); diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 6bec000c6c1c..e808bb8bc0da 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -44,14 +44,8 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected, panic("Corrupted kernel text"); } -static struct insn orignop = { - .opcode = 0xc004, - .offset = JUMP_LABEL_NOP_OFFSET >> 1, -}; - static void jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) + enum jump_label_type type) { void *code = (void *)jump_entry_code(entry); struct insn old, new; @@ -63,27 +57,22 @@ static void jump_label_transform(struct jump_entry *entry, jump_label_make_branch(entry, &old); jump_label_make_nop(entry, &new); } - if (init) { - if (memcmp(code, &orignop, sizeof(orignop))) - jump_label_bug(entry, &orignop, &new); - } else { - if (memcmp(code, &old, sizeof(old))) - jump_label_bug(entry, &old, &new); - } + if (memcmp(code, &old, sizeof(old))) + jump_label_bug(entry, &old, &new); s390_kernel_write(code, &new, sizeof(new)); } void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - jump_label_transform(entry, type, 0); + jump_label_transform(entry, type); text_poke_sync(); } bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { - jump_label_transform(entry, type, 0); + jump_label_transform(entry, type); return true; } @@ -91,10 +80,3 @@ void arch_jump_label_transform_apply(void) { text_poke_sync(); } - -void __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) -{ - jump_label_transform(entry, type, 1); - text_poke_sync(); -} diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index e27a7d3b0364..0032bdbe8e3f 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -284,11 +284,11 @@ NOKPROBE_SYMBOL(pop_kprobe); void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { - ri->ret_addr = (kprobe_opcode_t *) regs->gprs[14]; - ri->fp = NULL; + ri->ret_addr = (kprobe_opcode_t *)regs->gprs[14]; + ri->fp = (void *)regs->gprs[15]; /* Replace the return addr with trampoline addr */ - regs->gprs[14] = (unsigned long) &__kretprobe_trampoline; + regs->gprs[14] = (unsigned long)&__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); @@ -372,33 +372,26 @@ static int kprobe_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_handler); -/* - * Function return probe trampoline: - * - init_kprobes() establishes a probepoint here - * - When the probed function returns, this probe - * causes the handlers to fire - */ -static void __used kretprobe_trampoline_holder(void) +void arch_kretprobe_fixup_return(struct pt_regs *regs, + kprobe_opcode_t *correct_ret_addr) { - asm volatile(".global __kretprobe_trampoline\n" - "__kretprobe_trampoline: bcr 0,0\n"); + /* Replace fake return address with real one. */ + regs->gprs[14] = (unsigned long)correct_ret_addr; } +NOKPROBE_SYMBOL(arch_kretprobe_fixup_return); /* - * Called when the probe at kretprobe trampoline is hit + * Called from __kretprobe_trampoline */ -static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +void trampoline_probe_handler(struct pt_regs *regs) { - regs->psw.addr = __kretprobe_trampoline_handler(regs, NULL); - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; + kretprobe_trampoline_handler(regs, (void *)regs->gprs[15]); } NOKPROBE_SYMBOL(trampoline_probe_handler); +/* assembler function that handles the kretprobes must not be probed itself */ +NOKPROBE_SYMBOL(__kretprobe_trampoline); + /* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" @@ -465,7 +458,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); struct kprobe *p = kprobe_running(); - const struct exception_table_entry *entry; switch(kcb->kprobe_status) { case KPROBE_HIT_SS: @@ -487,10 +479,8 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. */ - entry = s390_search_extables(regs->psw.addr); - if (entry && ex_handle(entry, regs)) + if (fixup_exception(regs)) return 1; - /* * fixup_exception() could not handle it, * Let do_page_fault() fix it. @@ -554,18 +544,13 @@ int kprobe_exceptions_notify(struct notifier_block *self, } NOKPROBE_SYMBOL(kprobe_exceptions_notify); -static struct kprobe trampoline = { - .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - int __init arch_init_kprobes(void) { - return register_kprobe(&trampoline); + return 0; } int arch_trampoline_kprobe(struct kprobe *p) { - return p->addr == (kprobe_opcode_t *) &__kretprobe_trampoline; + return 0; } NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 3b895971c3d0..6652e54cf3db 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -88,8 +88,7 @@ static void lgr_stsi_2_2_2(struct lgr_info *lgr_info) if (stsi(si, 2, 2, 2)) return; cpascii(lgr_info->name, si->name, sizeof(si->name)); - memcpy(&lgr_info->lpar_number, &si->lpar_number, - sizeof(lgr_info->lpar_number)); + lgr_info->lpar_number = si->lpar_number; } /* diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 0505e55a6297..4579b42286d5 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -3,7 +3,6 @@ * Copyright IBM Corp. 2005, 2011 * * Author(s): Rolf Adelsberger, - * Heiko Carstens <heiko.carstens@de.ibm.com> * Michael Holzheu <holzheu@linux.vnet.ibm.com> */ @@ -22,13 +21,16 @@ #include <asm/elf.h> #include <asm/asm-offsets.h> #include <asm/cacheflush.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/set_memory.h> #include <asm/stacktrace.h> #include <asm/switch_to.h> #include <asm/nmi.h> +#include <asm/sclp.h> -typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long); +typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long, + unsigned long); extern const unsigned char relocate_kernel[]; extern const unsigned long long relocate_kernel_len; @@ -55,7 +57,7 @@ static void __do_machine_kdump(void *image) * This need to be done *after* s390_reset_system set the * prefix register of this CPU to zero */ - memcpy((void *) __LC_FPREGS_SAVE_AREA, + memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA), (void *)(prefix + __LC_FPREGS_SAVE_AREA), 512); __load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA); @@ -86,7 +88,7 @@ static noinline void __machine_kdump(void *image) continue; } /* Store status of the boot CPU */ - mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); if (MACHINE_HAS_VX) save_vx_regs((__vector128 *) mcesa->vector_save_area); if (MACHINE_HAS_GS) { @@ -221,13 +223,18 @@ void machine_kexec_cleanup(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { + struct lowcore *abs_lc; + unsigned long flags; + VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); vmcoreinfo_append_str("SAMODE31=%lx\n", __samode31); vmcoreinfo_append_str("EAMODE31=%lx\n", __eamode31); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); + abs_lc = get_abs_lowcore(&flags); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc, flags); } void machine_shutdown(void) @@ -244,6 +251,7 @@ void machine_crash_shutdown(struct pt_regs *regs) */ static void __do_machine_kexec(void *data) { + unsigned long diag308_subcode; relocate_kernel_t data_mover; struct kimage *image = data; @@ -252,7 +260,10 @@ static void __do_machine_kexec(void *data) __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */ /* Call the moving routine */ - (*data_mover)(&image->head, image->start); + diag308_subcode = DIAG308_CLEAR_RESET; + if (sclp.has_iplcc) + diag308_subcode |= DIAG308_FLAG_EI; + (*data_mover)(&image->head, image->start, diag308_subcode); /* Die if kexec returns */ disabled_wait(); diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 9975ad200d74..fc6d5f58debe 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -7,6 +7,8 @@ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com> */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/elf.h> #include <linux/errno.h> #include <linux/kexec.h> @@ -29,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; struct module_signature *ms; unsigned long sig_len; + int ret; /* Skip signature verification when not secure IPLed. */ if (!ipl_secure_flag) @@ -63,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) return -EBADMSG; } - return verify_pkcs7_signature(kernel, kernel_len, - kernel + kernel_len, sig_len, - VERIFY_USE_PLATFORM_KEYRING, - VERIFYING_MODULE_SIGNATURE, - NULL, NULL); + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + return ret; } #endif /* CONFIG_KEXEC_SIG */ @@ -290,8 +300,16 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab) { + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; Elf_Rela *relas; int i, r_type; + int ret; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; relas = (void *)pi->ehdr + relsec->sh_offset; @@ -304,15 +322,27 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, sym = (void *)pi->ehdr + symtab->sh_offset; sym += ELF64_R_SYM(relas[i].r_info); - if (sym->st_shndx == SHN_UNDEF) + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); return -ENOEXEC; + } - if (sym->st_shndx == SHN_COMMON) + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); return -ENOEXEC; + } if (sym->st_shndx >= pi->ehdr->e_shnum && - sym->st_shndx != SHN_ABS) + sym->st_shndx != SHN_ABS) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); return -ENOEXEC; + } loc = pi->purgatory_buf; loc += section->sh_offset; @@ -326,7 +356,15 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, addr = section->sh_addr + relas[i].r_offset; r_type = ELF64_R_TYPE(relas[i].r_info); - arch_kexec_do_relocs(r_type, loc, val, addr); + + if (r_type == R_390_PLT32DBL) + r_type = R_390_PC32DBL; + + ret = arch_kexec_do_relocs(r_type, loc, val, addr); + if (ret) { + pr_err("Unknown rela relocation: %d\n", r_type); + return -ENOEXEC; + } } return 0; } diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 39bcc0e39a10..4786bfe02144 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -2,8 +2,6 @@ /* * Copyright IBM Corp. 2008, 2009 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * */ #include <linux/linkage.h> @@ -13,6 +11,18 @@ #include <asm/ptrace.h> #include <asm/export.h> + +#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) +#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2) +#define STACK_PTREGS_FLAGS (STACK_PTREGS + __PT_FLAGS) +/* packed stack: allocate just enough for r14, r15 and backchain */ +#define TRACED_FUNC_FRAME_SIZE 24 + +#ifdef CONFIG_FUNCTION_TRACER + GEN_BR_THUNK %r1 GEN_BR_THUNK %r14 @@ -22,25 +32,14 @@ ENTRY(ftrace_stub) BR_EX %r14 ENDPROC(ftrace_stub) -#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) -#define STACK_PTREGS (STACK_FRAME_OVERHEAD) -#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) -#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) -#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2) -#ifdef __PACK_STACK -/* allocate just enough for r14, r15 and backchain */ -#define TRACED_FUNC_FRAME_SIZE 24 -#else -#define TRACED_FUNC_FRAME_SIZE STACK_FRAME_OVERHEAD -#endif - .macro ftrace_regs_entry, allregs=0 stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller .if \allregs == 1 - lghi %r14,0 # save condition code - ipm %r14 # don't put any instructions - sllg %r14,%r14,16 # clobbering CC before this point + # save psw mask + # don't put any instructions clobbering CC before this point + epsw %r1,%r14 + risbg %r14,%r1,0,31,32 .endif lgr %r1,%r15 @@ -56,7 +55,9 @@ ENDPROC(ftrace_stub) .if \allregs == 1 stg %r14,(STACK_PTREGS_PSW)(%r15) - stosm (STACK_PTREGS_PSW)(%r15),0 + mvghi STACK_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS + .else + xc STACK_PTREGS_FLAGS(8,%r15),STACK_PTREGS_FLAGS(%r15) .endif lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address @@ -132,3 +133,35 @@ SYM_FUNC_START(return_to_handler) SYM_FUNC_END(return_to_handler) #endif +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_KPROBES + +SYM_FUNC_START(__kretprobe_trampoline) + + stg %r14,(__SF_GPRS+8*8)(%r15) + lay %r15,-STACK_FRAME_SIZE(%r15) + stmg %r0,%r14,STACK_PTREGS_GPRS(%r15) + + # store original stack pointer in backchain and pt_regs + lay %r7,STACK_FRAME_SIZE(%r15) + stg %r7,__SF_BACKCHAIN(%r15) + stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15) + + # store full psw + epsw %r2,%r3 + risbg %r3,%r2,0,31,32 + stg %r3,STACK_PTREGS_PSW(%r15) + larl %r1,__kretprobe_trampoline + stg %r1,STACK_PTREGS_PSW+8(%r15) + + lay %r2,STACK_PTREGS(%r15) + brasl %r14,trampoline_probe_handler + + mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15) + lmg %r0,%r15,STACK_PTREGS_GPRS(%r15) + lpswe __SF_EMPTY(%r15) + +SYM_FUNC_END(__kretprobe_trampoline) + +#endif /* CONFIG_KPROBES */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b01ba460b7ca..2d159b32885b 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -33,18 +33,19 @@ #define DEBUGP(fmt , ...) #endif -#define PLT_ENTRY_SIZE 20 +#define PLT_ENTRY_SIZE 22 void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } @@ -340,27 +341,26 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ if (info->plt_initialized == 0) { - unsigned int insn[5]; - unsigned int *ip = me->core_layout.base + - me->arch.plt_offset + - info->plt_offset; - - insn[0] = 0x0d10e310; /* basr 1,0 */ - insn[1] = 0x100a0004; /* lg 1,10(1) */ + unsigned char insn[PLT_ENTRY_SIZE]; + char *plt_base; + char *ip; + + plt_base = me->core_layout.base + me->arch.plt_offset; + ip = plt_base + info->plt_offset; + *(int *)insn = 0x0d10e310; /* basr 1,0 */ + *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */ if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) { - unsigned int *ij; - ij = me->core_layout.base + - me->arch.plt_offset + - me->arch.plt_size - PLT_ENTRY_SIZE; - insn[2] = 0xa7f40000 + /* j __jump_r1 */ - (unsigned int)(u16) - (((unsigned long) ij - 8 - - (unsigned long) ip) / 2); + char *jump_r1; + + jump_r1 = plt_base + me->arch.plt_size - + PLT_ENTRY_SIZE; + /* brcl 0xf,__jump_r1 */ + *(short *)&insn[8] = 0xc0f4; + *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2; } else { - insn[2] = 0x07f10000; /* br %r1 */ + *(int *)&insn[8] = 0x07f10000; /* br %r1 */ } - insn[3] = (unsigned int) (val >> 32); - insn[4] = (unsigned int) val; + *(long *)&insn[14] = val; write(ip, insn, sizeof(insn)); info->plt_initialized = 1; @@ -517,15 +517,9 @@ int module_finalize(const Elf_Ehdr *hdr, ij = me->core_layout.base + me->arch.plt_offset + me->arch.plt_size - PLT_ENTRY_SIZE; - if (test_facility(35)) { - ij[0] = 0xc6000000; /* exrl %r0,.+10 */ - ij[1] = 0x0005a7f4; /* j . */ - ij[2] = 0x000007f1; /* br %r1 */ - } else { - ij[0] = 0x44000000 | (unsigned int) - offsetof(struct lowcore, br_r1_trampoline); - ij[1] = 0xa7f40000; /* j . */ - } + ij[0] = 0xc6000000; /* exrl %r0,.+10 */ + ij[1] = 0x0005a7f4; /* j . */ + ij[2] = 0x000007f1; /* br %r1 */ } secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; @@ -554,6 +548,5 @@ int module_finalize(const Elf_Ehdr *hdr, #endif /* CONFIG_FUNCTION_TRACER */ } - jump_label_apply_nops(me); return 0; } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 20f8e1868853..31cb9b00a36b 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -6,12 +6,12 @@ * Author(s): Ingo Adlung <adlung@de.ibm.com>, * Martin Schwidefsky <schwidefsky@de.ibm.com>, * Cornelia Huck <cornelia.huck@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #include <linux/kernel_stat.h> #include <linux/init.h> #include <linux/errno.h> +#include <linux/entry-common.h> #include <linux/hardirq.h> #include <linux/log2.h> #include <linux/kprobes.h> @@ -30,6 +30,8 @@ #include <asm/switch_to.h> #include <asm/ctl_reg.h> #include <asm/asm-offsets.h> +#include <asm/pai.h> + #include <linux/kvm_host.h> struct mcck_struct { @@ -58,27 +60,27 @@ static inline unsigned long nmi_get_mcesa_size(void) /* * The initial machine check extended save area for the boot CPU. - * It will be replaced by nmi_init() with an allocated structure. - * The structure is required for machine check happening early in - * the boot process. + * It will be replaced on the boot CPU reinit with an allocated + * structure. The structure is required for machine check happening + * early in the boot process. */ -static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE); +static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE); -void __init nmi_alloc_boot_cpu(struct lowcore *lc) +void __init nmi_alloc_mcesa_early(u64 *mcesad) { if (!nmi_needs_mcesa()) return; - lc->mcesad = (unsigned long) &boot_mcesa; + *mcesad = __pa(&boot_mcesa); if (MACHINE_HAS_GS) - lc->mcesad |= ilog2(MCESA_MAX_SIZE); + *mcesad |= ilog2(MCESA_MAX_SIZE); } -static int __init nmi_init(void) +static void __init nmi_alloc_cache(void) { - unsigned long origin, cr0, size; + unsigned long size; if (!nmi_needs_mcesa()) - return 0; + return; size = nmi_get_mcesa_size(); if (size > MCESA_MIN_SIZE) mcesa_origin_lc = ilog2(size); @@ -86,40 +88,31 @@ static int __init nmi_init(void) mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL); if (!mcesa_cache) panic("Couldn't create nmi save area cache"); - origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); - if (!origin) - panic("Couldn't allocate nmi save area"); - /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) origin); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - /* Replace boot_mcesa on the boot CPU */ - S390_lowcore.mcesad = origin | mcesa_origin_lc; - __ctl_load(cr0, 0, 0); - return 0; } -early_initcall(nmi_init); -int nmi_alloc_per_cpu(struct lowcore *lc) +int __ref nmi_alloc_mcesa(u64 *mcesad) { unsigned long origin; + *mcesad = 0; if (!nmi_needs_mcesa()) return 0; + if (!mcesa_cache) + nmi_alloc_cache(); origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); if (!origin) return -ENOMEM; /* The pointer is stored with mcesa_bits ORed in */ kmemleak_not_leak((void *) origin); - lc->mcesad = origin | mcesa_origin_lc; + *mcesad = __pa(origin) | mcesa_origin_lc; return 0; } -void nmi_free_per_cpu(struct lowcore *lc) +void nmi_free_mcesa(u64 *mcesad) { if (!nmi_needs_mcesa()) return; - kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK)); + kmem_cache_free(mcesa_cache, __va(*mcesad & MCESA_ORIGIN_MASK)); } static notrace void s390_handle_damage(void) @@ -175,14 +168,16 @@ void __s390_handle_mcck(void) "malfunction (code 0x%016lx).\n", mcck.mcck_code); printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", current->comm, current->pid); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } } -void noinstr s390_handle_mcck(void) +void noinstr s390_handle_mcck(struct pt_regs *regs) { trace_hardirqs_off(); + pai_kernel_enter(regs); __s390_handle_mcck(); + pai_kernel_exit(regs); trace_hardirqs_on(); } /* @@ -246,7 +241,7 @@ static int notrace s390_validate_registers(union mci mci, int umode) : "Q" (S390_lowcore.fpt_creg_save_area)); } - mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); if (!MACHINE_HAS_VX) { /* Validate floating point registers */ asm volatile( @@ -273,7 +268,14 @@ static int notrace s390_validate_registers(union mci mci, int umode) /* Validate vector registers */ union ctlreg0 cr0; - if (!mci.vr) { + /* + * The vector validity must only be checked if not running a + * KVM guest. For KVM guests the machine check is forwarded by + * KVM and it is the responsibility of the guest to take + * appropriate actions. The host vector or FPU values have been + * saved by KVM and will be restored by KVM. + */ + if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) { /* * Vector registers can't be restored. If the kernel * currently uses vector registers the system is @@ -316,11 +318,21 @@ static int notrace s390_validate_registers(union mci mci, int umode) if (cr2.gse) { if (!mci.gs) { /* - * Guarded storage register can't be restored and - * the current processes uses guarded storage. - * It has to be terminated. + * 2 cases: + * - machine check in kernel or userspace + * - machine check while running SIE (KVM guest) + * For kernel or userspace the userspace values of + * guarded storage control can not be recreated, the + * process must be terminated. + * For SIE the guest values of guarded storage can not + * be recreated. This is either due to a bug or due to + * GS being disabled in the guest. The guest will be + * notified by KVM code and the guests machine check + * handling must take care of this. The host values + * are saved by KVM and are not affected. */ - kill_task = 1; + if (!test_cpu_flag(CIF_MCCK_GUEST)) + kill_task = 1; } else { load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); } @@ -386,11 +398,12 @@ int notrace s390_do_machine_check(struct pt_regs *regs) static unsigned long long last_ipd; struct mcck_struct *mcck; unsigned long long tmp; + irqentry_state_t irq_state; union mci mci; unsigned long mcck_dam_code; int mcck_pending = 0; - nmi_enter(); + irq_state = irqentry_nmi_enter(regs); if (user_mode(regs)) update_timer_mcck(); @@ -493,14 +506,14 @@ int notrace s390_do_machine_check(struct pt_regs *regs) clear_cpu_flag(CIF_MCCK_GUEST); if (user_mode(regs) && mcck_pending) { - nmi_exit(); + irqentry_nmi_exit(regs, irq_state); return 1; } if (mcck_pending) schedule_mcck_handler(); - nmi_exit(); + irqentry_nmi_exit(regs, irq_state); return 0; } NOKPROBE_SYMBOL(s390_do_machine_check); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 60e6fec27bba..717bbcc056e5 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -105,6 +105,7 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) s32 *epo; /* Second part of the instruction replace is always a nop */ + memcpy(insnbuf + 2, branch, sizeof(branch)); for (epo = start; epo < end; epo++) { instr = (u8 *) epo + *epo; if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04) @@ -117,42 +118,20 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) if (thunk[0] == 0xc6 && thunk[1] == 0x00) /* exrl %r0,<target-br> */ br = thunk + (*(int *)(thunk + 2)) * 2; - else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 && - thunk[6] == 0x44 && thunk[7] == 0x00 && - (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 && - (thunk[1] & 0xf0) == (thunk[8] & 0xf0)) - /* larl %rx,<target br> + ex %r0,0(%rx) */ - br = thunk + (*(int *)(thunk + 2)) * 2; else continue; - /* Check for unconditional branch 0x07f? or 0x47f???? */ - if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0) + if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) continue; - - memcpy(insnbuf + 2, branch, sizeof(branch)); switch (type) { case BRCL_EXPOLINE: + /* brcl to thunk, replace with br + nop */ insnbuf[0] = br[0]; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); - if (br[0] == 0x47) { - /* brcl to b, replace with bc + nopr */ - insnbuf[2] = br[2]; - insnbuf[3] = br[3]; - } else { - /* brcl to br, replace with bcr + nop */ - } break; case BRASL_EXPOLINE: + /* brasl to thunk, replace with basr + nop */ + insnbuf[0] = 0x0d; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); - if (br[0] == 0x47) { - /* brasl to b, replace with bas + nopr */ - insnbuf[0] = 0x4d; - insnbuf[2] = br[2]; - insnbuf[3] = br[3]; - } else { - /* brasl to br, replace with basr + nop */ - insnbuf[0] = 0x0d; - } break; } diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 51c5a9f6e525..23ab9f02f278 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -33,10 +33,3 @@ void __init numa_setup(void) NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } - -static int __init numa_init_late(void) -{ - register_one_node(0); - return 0; -} -arch_initcall(numa_init_late); diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 4bef35b79b93..ec0bd9457e90 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -13,8 +13,10 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <asm/checksum.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> +#include <asm/maccess.h> +#include <asm/asm-offsets.h> /* * OS info structure has to be page aligned @@ -45,7 +47,7 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size) */ void os_info_entry_add(int nr, void *ptr, u64 size) { - os_info.entry[nr].addr = (u64)(unsigned long)ptr; + os_info.entry[nr].addr = __pa(ptr); os_info.entry[nr].size = size; os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0); os_info.csum = os_info_csum(&os_info); @@ -56,13 +58,16 @@ void os_info_entry_add(int nr, void *ptr, u64 size) */ void __init os_info_init(void) { - void *ptr = &os_info; + struct lowcore *abs_lc; + unsigned long flags; os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; os_info.csum = os_info_csum(&os_info); - mem_assign_absolute(S390_lowcore.os_info, (unsigned long) ptr); + abs_lc = get_abs_lowcore(&flags); + abs_lc->os_info = __pa(&os_info); + put_abs_lowcore(abs_lc, flags); } #ifdef CONFIG_CRASH_DUMP @@ -90,7 +95,7 @@ static void os_info_old_alloc(int nr, int align) goto fail; } buf_align = PTR_ALIGN(buf, align); - if (copy_oldmem_kernel(buf_align, (void *) addr, size)) { + if (copy_oldmem_kernel(buf_align, addr, size)) { msg = "copy failed"; goto fail_free; } @@ -123,15 +128,14 @@ static void os_info_old_init(void) return; if (!oldmem_data.start) goto fail; - if (copy_oldmem_kernel(&addr, &S390_lowcore.os_info, sizeof(addr))) + if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr))) goto fail; if (addr == 0 || addr % PAGE_SIZE) goto fail; os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL); if (!os_info_old) goto fail; - if (copy_oldmem_kernel(os_info_old, (void *) addr, - sizeof(*os_info_old))) + if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old))) goto fail_free; if (os_info_old->magic != OS_INFO_MAGIC) goto fail_free; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index ee8707abdb6a..f043a7ff220b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -516,6 +516,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) return err; } +/* Events CPU_CYLCES and INSTRUCTIONS can be submitted with two different + * attribute::type values: + * - PERF_TYPE_HARDWARE: + * - pmu->type: + * Handle both type of invocations identical. They address the same hardware. + * The result is different when event modifiers exclude_kernel and/or + * exclude_user are also set. + */ +static int cpumf_pmu_event_type(struct perf_event *event) +{ + u64 ev = event->attr.config; + + if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev) + return PERF_TYPE_HARDWARE; + return PERF_TYPE_RAW; +} + static int cpumf_pmu_event_init(struct perf_event *event) { unsigned int type = event->attr.type; @@ -525,7 +545,7 @@ static int cpumf_pmu_event_init(struct perf_event *event) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) /* Registered as unknown PMU */ - err = __hw_perf_event_init(event, PERF_TYPE_RAW); + err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); else return -ENOENT; @@ -644,6 +664,7 @@ static int cfdiag_push_sample(struct perf_event *event, raw.frag.data = cpuhw->stop; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); @@ -1451,6 +1472,8 @@ static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) /* Get the CPU speed, try sampling facility first and CPU attributes second. */ static void cfdiag_get_cpu_speed(void) { + unsigned long mhz; + if (cpum_sf_avail()) { /* Sampling facility first */ struct hws_qsi_info_block si; @@ -1464,12 +1487,9 @@ static void cfdiag_get_cpu_speed(void) /* Fallback: CPU speed extract static part. Used in case * CPU Measurement Sampling Facility is turned off. */ - if (test_facility(34)) { - unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); - - if (mhz != -1UL) - cfdiag_cpu_speed = mhz & 0xffffffff; - } + mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + if (mhz != -1UL) + cfdiag_cpu_speed = mhz & 0xffffffff; } static int cfset_init(void) diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 30f0242de4a5..8ee48672233f 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -178,7 +178,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, case CPUMF_CTR_SET_CRYPTO: if (info->csvn >= 1 && info->csvn <= 5) ctrset_size = 16; - else if (info->csvn == 6) + else if (info->csvn == 6 || info->csvn == 7) ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: @@ -188,7 +188,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, ctrset_size = 48; else if (info->csvn >= 3 && info->csvn <= 5) ctrset_size = 128; - else if (info->csvn == 6) + else if (info->csvn == 6 || info->csvn == 7) ctrset_size = 160; break; case CPUMF_CTR_SET_MT_DIAG: diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 37265f551a11..0d64aafd158f 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -295,6 +295,76 @@ CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 0x00ab); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7); +CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), @@ -344,7 +414,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -635,6 +705,80 @@ static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z16, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z16, SORTL), + CPUMF_EVENT_PTR(cf_z16, DFLT_CC), + CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -715,8 +859,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 1 ... 5: csvn = cpumcf_svn_12345_pmu_event_attr; break; - case 6: - csvn = cpumcf_svn_6_pmu_event_attr; + case 6 ... 7: + csvn = cpumcf_svn_67_pmu_event_attr; break; default: csvn = none; @@ -749,6 +893,10 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 0x8562: model = cpumcf_z15_pmu_event_attr; break; + case 0x3931: + case 0x3932: + model = cpumcf_z16_pmu_event_attr; + break; default: model = none; break; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index db62def4ef28..332a49965130 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1179,7 +1179,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, sample = (struct hws_basic_entry *) *sdbt; while ((unsigned long *) sample < (unsigned long *) te) { /* Check for an empty sample */ - if (!sample->def) + if (!sample->def || sample->LS) break; /* Update perf event period */ diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index ea7729bebaa0..c27321cb0969 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -30,7 +30,7 @@ static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) if (!stack) return NULL; - return (struct kvm_s390_sie_block *) stack->empty1[0]; + return (struct kvm_s390_sie_block *)stack->sie_control_block; } static bool is_in_guest(struct pt_regs *regs) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c new file mode 100644 index 000000000000..6826e2a69a21 --- /dev/null +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -0,0 +1,699 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define KMSG_COMPONENT "pai_crypto" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/io.h> +#include <linux/perf_event.h> + +#include <asm/ctl_reg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +static debug_info_t *cfm_dbg; +static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */ + /* extracted with QPACI instruction */ + +DEFINE_STATIC_KEY_FALSE(pai_key); + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +struct paicrypt_map { + unsigned long *page; /* Page for CPU to store counters */ + struct pai_userdata *save; /* Page to store no-zero counters */ + unsigned int users; /* # of PAI crypto users */ + unsigned int sampler; /* # of PAI crypto samplers */ + unsigned int counter; /* # of PAI crypto counters */ + struct perf_event *event; /* Perf event for sampling */ +}; + +static DEFINE_PER_CPU(struct paicrypt_map, paicrypt_map); + +/* Release the PMU if event is the last perf event */ +static DEFINE_MUTEX(pai_reserve_mutex); + +/* Adjust usage counters and remove allocated memory when all users are + * gone. + */ +static void paicrypt_event_destroy(struct perf_event *event) +{ + struct paicrypt_map *cpump = per_cpu_ptr(&paicrypt_map, event->cpu); + + cpump->event = NULL; + static_branch_dec(&pai_key); + mutex_lock(&pai_reserve_mutex); + if (event->attr.sample_period) + cpump->sampler -= 1; + else + cpump->counter -= 1; + debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d" + " sampler %d counter %d\n", __func__, + event->attr.config, event->cpu, cpump->sampler, + cpump->counter); + if (!cpump->counter && !cpump->sampler) { + debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", + __func__, (unsigned long)cpump->page, + cpump->save); + free_page((unsigned long)cpump->page); + cpump->page = NULL; + kvfree(cpump->save); + cpump->save = NULL; + } + mutex_unlock(&pai_reserve_mutex); +} + +static u64 paicrypt_getctr(struct paicrypt_map *cpump, int nr, bool kernel) +{ + if (kernel) + nr += PAI_CRYPTO_MAXCTR; + return cpump->page[nr]; +} + +/* Read the counter values. Return value from location in CMP. For event + * CRYPTO_ALL sum up all events. + */ +static u64 paicrypt_getdata(struct perf_event *event, bool kernel) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + u64 sum = 0; + int i; + + if (event->attr.config != PAI_CRYPTO_BASE) { + return paicrypt_getctr(cpump, + event->attr.config - PAI_CRYPTO_BASE, + kernel); + } + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = paicrypt_getctr(cpump, i, kernel); + + if (!val) + continue; + sum += val; + } + return sum; +} + +static u64 paicrypt_getall(struct perf_event *event) +{ + u64 sum = 0; + + if (!event->attr.exclude_kernel) + sum += paicrypt_getdata(event, true); + if (!event->attr.exclude_user) + sum += paicrypt_getdata(event, false); + + return sum; +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for crypto events + * + * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is save to block. + * When the event initialization functions fails, no other call back will + * be invoked. + * + * Allocate the memory for the event. + */ +static int paicrypt_busy(struct perf_event_attr *a, struct paicrypt_map *cpump) +{ + unsigned int *use_ptr; + int rc = 0; + + mutex_lock(&pai_reserve_mutex); + if (a->sample_period) { /* Sampling requested */ + use_ptr = &cpump->sampler; + if (cpump->counter || cpump->sampler) + rc = -EBUSY; /* ... sampling/counting active */ + } else { /* Counting requested */ + use_ptr = &cpump->counter; + if (cpump->sampler) + rc = -EBUSY; /* ... and sampling active */ + } + if (rc) + goto unlock; + + /* Allocate memory for counter page and counter extraction. + * Only the first counting event has to allocate a page. + */ + if (cpump->page) + goto unlock; + + rc = -ENOMEM; + cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (!cpump->page) + goto unlock; + cpump->save = kvmalloc_array(paicrypt_cnt + 1, + sizeof(struct pai_userdata), GFP_KERNEL); + if (!cpump->save) { + free_page((unsigned long)cpump->page); + cpump->page = NULL; + goto unlock; + } + rc = 0; + +unlock: + /* If rc is non-zero, do not increment counter/sampler. */ + if (!rc) + *use_ptr += 1; + debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx sampler %d" + " counter %d page %#lx save %p rc %d\n", __func__, + a->sample_period, cpump->sampler, cpump->counter, + (unsigned long)cpump->page, cpump->save, rc); + mutex_unlock(&pai_reserve_mutex); + return rc; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paicrypt_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + struct paicrypt_map *cpump; + int rc; + + /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI crypto event must be in valid range */ + if (a->config < PAI_CRYPTO_BASE || + a->config > PAI_CRYPTO_BASE + paicrypt_cnt) + return -EINVAL; + /* Allow only CPU wide operation, no process context for now. */ + if (event->hw.target || event->cpu == -1) + return -ENOENT; + /* Allow only CRYPTO_ALL for sampling. */ + if (a->sample_period && a->config != PAI_CRYPTO_BASE) + return -EINVAL; + + cpump = per_cpu_ptr(&paicrypt_map, event->cpu); + rc = paicrypt_busy(a, cpump); + if (rc) + return rc; + + /* Event initialization sets last_tag to 0. When later on the events + * are deleted and re-added, do not reset the event count value to zero. + * Events are added, deleted and re-added when 2 or more events + * are active at the same time. + */ + event->hw.last_tag = 0; + cpump->event = event; + event->destroy = paicrypt_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which contain the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + static_branch_inc(&pai_key); + return 0; +} + +static void paicrypt_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paicrypt_getall(event); + local64_set(&event->hw.prev_count, new); + delta = (prev <= new) ? new - prev + : (-1ULL - prev) + new + 1; /* overflow */ + local64_add(delta, &event->count); +} + +static void paicrypt_start(struct perf_event *event, int flags) +{ + u64 sum; + + if (!event->hw.last_tag) { + event->hw.last_tag = 1; + sum = paicrypt_getall(event); /* Get current value */ + local64_set(&event->count, 0); + local64_set(&event->hw.prev_count, sum); + } +} + +static int paicrypt_add(struct perf_event *event, int flags) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + unsigned long ccd; + + if (cpump->users++ == 0) { + ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; + WRITE_ONCE(S390_lowcore.ccd, ccd); + __ctl_set_bit(0, 50); + } + cpump->event = event; + if (flags & PERF_EF_START && !event->attr.sample_period) { + /* Only counting needs initial counter value */ + paicrypt_start(event, PERF_EF_RELOAD); + } + event->hw.state = 0; + if (event->attr.sample_period) + perf_sched_cb_inc(event->pmu); + return 0; +} + +static void paicrypt_stop(struct perf_event *event, int flags) +{ + paicrypt_read(event); + event->hw.state = PERF_HES_STOPPED; +} + +static void paicrypt_del(struct perf_event *event, int flags) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + + if (event->attr.sample_period) + perf_sched_cb_dec(event->pmu); + if (!event->attr.sample_period) + /* Only counting needs to read counter */ + paicrypt_stop(event, PERF_EF_UPDATE); + if (cpump->users-- == 1) { + __ctl_clear_bit(0, 50); + WRITE_ONCE(S390_lowcore.ccd, 0); + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paicrypt_copy(struct pai_userdata *userdata, + struct paicrypt_map *cpump, + bool exclude_user, bool exclude_kernel) +{ + int i, outidx = 0; + + for (i = 1; i <= paicrypt_cnt; i++) { + u64 val = 0; + + if (!exclude_kernel) + val += paicrypt_getctr(cpump, i, true); + if (!exclude_user) + val += paicrypt_getctr(cpump, i, false); + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(struct pai_userdata); +} + +static int paicrypt_push_sample(void) +{ + struct paicrypt_map *cpump = this_cpu_ptr(&paicrypt_map); + struct perf_event *event = cpump->event; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + size_t rawsize; + int overflow; + + if (!cpump->event) /* No event active */ + return 0; + rawsize = paicrypt_copy(cpump->save, cpump, + cpump->event->attr.exclude_user, + cpump->event->attr.exclude_kernel); + if (!rawsize) /* No incremented counters */ + return 0; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) { + data.cpu_entry.cpu = smp_processor_id(); + data.cpu_entry.reserved = 0; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + raw.size = raw.frag.size; + data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Clear lowcore page after read */ + memset(cpump->page, 0, PAGE_SIZE); + return overflow; +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event CRYPTO_ALL is allowed. + */ +static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, clear values. + */ + if (!sched_in) + paicrypt_push_sample(); +} + +/* Attribute definitions for paicrypt interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instructions returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1000 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paicrypt_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paicrypt_events_group = { + .name = "events", + .attrs = NULL /* Filled in attr_event_init() */ +}; + +static struct attribute_group paicrypt_format_group = { + .name = "format", + .attrs = paicrypt_format_attr, +}; + +static const struct attribute_group *paicrypt_attr_groups[] = { + &paicrypt_events_group, + &paicrypt_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paicrypt = { + .task_ctx_nr = perf_invalid_context, + .event_init = paicrypt_event_init, + .add = paicrypt_add, + .del = paicrypt_del, + .start = paicrypt_start, + .stop = paicrypt_stop, + .read = paicrypt_read, + .sched_task = paicrypt_sched_task, + .attr_groups = paicrypt_attr_groups +}; + +/* List of symbolic PAI counter names. */ +static const char * const paicrypt_ctrnames[] = { + [0] = "CRYPTO_ALL", + [1] = "KM_DEA", + [2] = "KM_TDEA_128", + [3] = "KM_TDEA_192", + [4] = "KM_ENCRYPTED_DEA", + [5] = "KM_ENCRYPTED_TDEA_128", + [6] = "KM_ENCRYPTED_TDEA_192", + [7] = "KM_AES_128", + [8] = "KM_AES_192", + [9] = "KM_AES_256", + [10] = "KM_ENCRYPTED_AES_128", + [11] = "KM_ENCRYPTED_AES_192", + [12] = "KM_ENCRYPTED_AES_256", + [13] = "KM_XTS_AES_128", + [14] = "KM_XTS_AES_256", + [15] = "KM_XTS_ENCRYPTED_AES_128", + [16] = "KM_XTS_ENCRYPTED_AES_256", + [17] = "KMC_DEA", + [18] = "KMC_TDEA_128", + [19] = "KMC_TDEA_192", + [20] = "KMC_ENCRYPTED_DEA", + [21] = "KMC_ENCRYPTED_TDEA_128", + [22] = "KMC_ENCRYPTED_TDEA_192", + [23] = "KMC_AES_128", + [24] = "KMC_AES_192", + [25] = "KMC_AES_256", + [26] = "KMC_ENCRYPTED_AES_128", + [27] = "KMC_ENCRYPTED_AES_192", + [28] = "KMC_ENCRYPTED_AES_256", + [29] = "KMC_PRNG", + [30] = "KMA_GCM_AES_128", + [31] = "KMA_GCM_AES_192", + [32] = "KMA_GCM_AES_256", + [33] = "KMA_GCM_ENCRYPTED_AES_128", + [34] = "KMA_GCM_ENCRYPTED_AES_192", + [35] = "KMA_GCM_ENCRYPTED_AES_256", + [36] = "KMF_DEA", + [37] = "KMF_TDEA_128", + [38] = "KMF_TDEA_192", + [39] = "KMF_ENCRYPTED_DEA", + [40] = "KMF_ENCRYPTED_TDEA_128", + [41] = "KMF_ENCRYPTED_TDEA_192", + [42] = "KMF_AES_128", + [43] = "KMF_AES_192", + [44] = "KMF_AES_256", + [45] = "KMF_ENCRYPTED_AES_128", + [46] = "KMF_ENCRYPTED_AES_192", + [47] = "KMF_ENCRYPTED_AES_256", + [48] = "KMCTR_DEA", + [49] = "KMCTR_TDEA_128", + [50] = "KMCTR_TDEA_192", + [51] = "KMCTR_ENCRYPTED_DEA", + [52] = "KMCTR_ENCRYPTED_TDEA_128", + [53] = "KMCTR_ENCRYPTED_TDEA_192", + [54] = "KMCTR_AES_128", + [55] = "KMCTR_AES_192", + [56] = "KMCTR_AES_256", + [57] = "KMCTR_ENCRYPTED_AES_128", + [58] = "KMCTR_ENCRYPTED_AES_192", + [59] = "KMCTR_ENCRYPTED_AES_256", + [60] = "KMO_DEA", + [61] = "KMO_TDEA_128", + [62] = "KMO_TDEA_192", + [63] = "KMO_ENCRYPTED_DEA", + [64] = "KMO_ENCRYPTED_TDEA_128", + [65] = "KMO_ENCRYPTED_TDEA_192", + [66] = "KMO_AES_128", + [67] = "KMO_AES_192", + [68] = "KMO_AES_256", + [69] = "KMO_ENCRYPTED_AES_128", + [70] = "KMO_ENCRYPTED_AES_192", + [71] = "KMO_ENCRYPTED_AES_256", + [72] = "KIMD_SHA_1", + [73] = "KIMD_SHA_256", + [74] = "KIMD_SHA_512", + [75] = "KIMD_SHA3_224", + [76] = "KIMD_SHA3_256", + [77] = "KIMD_SHA3_384", + [78] = "KIMD_SHA3_512", + [79] = "KIMD_SHAKE_128", + [80] = "KIMD_SHAKE_256", + [81] = "KIMD_GHASH", + [82] = "KLMD_SHA_1", + [83] = "KLMD_SHA_256", + [84] = "KLMD_SHA_512", + [85] = "KLMD_SHA3_224", + [86] = "KLMD_SHA3_256", + [87] = "KLMD_SHA3_384", + [88] = "KLMD_SHA3_512", + [89] = "KLMD_SHAKE_128", + [90] = "KLMD_SHAKE_256", + [91] = "KMAC_DEA", + [92] = "KMAC_TDEA_128", + [93] = "KMAC_TDEA_192", + [94] = "KMAC_ENCRYPTED_DEA", + [95] = "KMAC_ENCRYPTED_TDEA_128", + [96] = "KMAC_ENCRYPTED_TDEA_192", + [97] = "KMAC_AES_128", + [98] = "KMAC_AES_192", + [99] = "KMAC_AES_256", + [100] = "KMAC_ENCRYPTED_AES_128", + [101] = "KMAC_ENCRYPTED_AES_192", + [102] = "KMAC_ENCRYPTED_AES_256", + [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", + [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", + [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", + [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", + [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", + [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", + [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", + [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", + [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", + [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", + [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", + [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A", + [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", + [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", + [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", + [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", + [119] = "PCC_SCALAR_MULTIPLY_P256", + [120] = "PCC_SCALAR_MULTIPLY_P384", + [121] = "PCC_SCALAR_MULTIPLY_P521", + [122] = "PCC_SCALAR_MULTIPLY_ED25519", + [123] = "PCC_SCALAR_MULTIPLY_ED448", + [124] = "PCC_SCALAR_MULTIPLY_X25519", + [125] = "PCC_SCALAR_MULTIPLY_X448", + [126] = "PRNO_SHA_512_DRNG", + [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", + [128] = "PRNO_TRNG", + [129] = "KDSA_ECDSA_VERIFY_P256", + [130] = "KDSA_ECDSA_VERIFY_P384", + [131] = "KDSA_ECDSA_VERIFY_P521", + [132] = "KDSA_ECDSA_SIGN_P256", + [133] = "KDSA_ECDSA_SIGN_P384", + [134] = "KDSA_ECDSA_SIGN_P521", + [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", + [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", + [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", + [138] = "KDSA_EDDSA_VERIFY_ED25519", + [139] = "KDSA_EDDSA_VERIFY_ED448", + [140] = "KDSA_EDDSA_SIGN_ED25519", + [141] = "KDSA_EDDSA_SIGN_ED448", + [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", + [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", + [144] = "PCKMO_ENCRYPT_DEA_KEY", + [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", + [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", + [147] = "PCKMO_ENCRYPT_AES_128_KEY", + [148] = "PCKMO_ENCRYPT_AES_192_KEY", + [149] = "PCKMO_ENCRYPT_AES_256_KEY", + [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", + [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", + [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", + [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", + [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", + [155] = "IBM_RESERVED_155", + [156] = "IBM_RESERVED_156", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + int i; + + for (i = 0; i < num; i++) { + struct device_attribute *dap; + + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_CRYPTO_BASE + num; + pa->attr.attr.name = paicrypt_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. */ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i - 1); + return ret; + } + } + attrs[i] = NULL; + paicrypt_events_group.attrs = attrs; + return 0; +} + +static int __init paicrypt_init(void) +{ + struct qpaci_info_block ib; + int rc; + + if (!test_facility(196)) + return 0; + + qpaci(&ib); + paicrypt_cnt = ib.num_cc; + if (paicrypt_cnt == 0) + return 0; + if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) + paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1; + + rc = attr_event_init(); /* Export known PAI crypto events */ + if (rc) { + pr_err("Creation of PMU pai_crypto /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!cfm_dbg) { + pr_err("Registration of s390dbf pai_crypto failed\n"); + return -ENOMEM; + } + debug_register_view(cfm_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paicrypt, "pai_crypto", -1); + if (rc) { + pr_err("Registering the pai_crypto PMU failed with rc=%i\n", + rc); + debug_unregister_view(cfm_dbg, &debug_sprintf_view); + debug_unregister(cfm_dbg); + return rc; + } + return 0; +} + +device_initcall(paicrypt_init); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c new file mode 100644 index 000000000000..74b53c531e0c --- /dev/null +++ b/arch/s390/kernel/perf_pai_ext.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Extension + * Facility + * + * Copyright IBM Corp. 2022 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define KMSG_COMPONENT "pai_ext" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/io.h> + +#include <asm/cpu_mcf.h> +#include <asm/ctl_reg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */ +#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */ + +static debug_info_t *paiext_dbg; +static unsigned int paiext_cnt; /* Extracted with QPACI instruction */ + +enum paiext_mode { + PAI_MODE_NONE, + PAI_MODE_SAMPLING, + PAI_MODE_COUNTER, +}; + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. + * The PAI extension control block 1 is pointed to by lowcore + * address 0x1508 for each CPU. This control block is 512 bytes in size + * and requires a 512 byte boundary alignment. + */ +struct paiext_cb { /* PAI extension 1 control block */ + u64 header; /* Not used */ + u64 reserved1; + u64 acc; /* Addr to analytics counter control block */ + u8 reserved2[488]; +} __packed; + +struct paiext_map { + unsigned long *area; /* Area for CPU to store counters */ + struct pai_userdata *save; /* Area to store non-zero counters */ + enum paiext_mode mode; /* Type of event */ + unsigned int active_events; /* # of PAI Extension users */ + unsigned int refcnt; + struct perf_event *event; /* Perf event for sampling */ + struct paiext_cb *paiext_cb; /* PAI extension control block area */ +}; + +struct paiext_mapptr { + struct paiext_map *mapptr; +}; + +static struct paiext_root { /* Anchor to per CPU data */ + int refcnt; /* Overall active events */ + struct paiext_mapptr __percpu *mapptr; +} paiext_root; + +/* Free per CPU data when the last event is removed. */ +static void paiext_root_free(void) +{ + if (!--paiext_root.refcnt) { + free_percpu(paiext_root.mapptr); + paiext_root.mapptr = NULL; + } +} + +/* On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int paiext_root_alloc(void) +{ + if (++paiext_root.refcnt == 1) { + /* The memory is already zeroed. */ + paiext_root.mapptr = alloc_percpu(struct paiext_mapptr); + if (!paiext_root.mapptr) { + /* Returing without refcnt adjustment is ok. The + * error code is handled by paiext_alloc() which + * decrements refcnt when an event can not be + * created. + */ + return -ENOMEM; + } + } + return 0; +} + +/* Protects against concurrent increment of sampler and counter member + * increments at the same time and prohibits concurrent execution of + * counting and sampling events. + * Ensures that analytics counter block is deallocated only when the + * sampling and counting on that cpu is zero. + * For details see paiext_alloc(). + */ +static DEFINE_MUTEX(paiext_reserve_mutex); + +/* Free all memory allocated for event counting/sampling setup */ +static void paiext_free(struct paiext_mapptr *mp) +{ + kfree(mp->mapptr->area); + kfree(mp->mapptr->paiext_cb); + kvfree(mp->mapptr->save); + kfree(mp->mapptr); + mp->mapptr = NULL; +} + +/* Release the PMU if event is the last perf event */ +static void paiext_event_destroy(struct perf_event *event) +{ + struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + struct paiext_map *cpump = mp->mapptr; + + mutex_lock(&paiext_reserve_mutex); + cpump->event = NULL; + if (!--cpump->refcnt) /* Last reference gone */ + paiext_free(mp); + paiext_root_free(); + mutex_unlock(&paiext_reserve_mutex); + debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__, + event->cpu, mp->mapptr); + +} + +/* Used to avoid races in checking concurrent access of counting and + * sampling for pai_extension events. + * + * Only one instance of event pai_ext/NNPA_ALL/ for sampling is + * allowed and when this event is running, no counting event is allowed. + * Several counting events are allowed in parallel, but no sampling event + * is allowed while one (or more) counting events are running. + * + * This function is called in process context and it is safe to block. + * When the event initialization functions fails, no other call back will + * be invoked. + * + * Allocate the memory for the event. + */ +static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event) +{ + struct paiext_mapptr *mp; + struct paiext_map *cpump; + int rc; + + mutex_lock(&paiext_reserve_mutex); + + rc = paiext_root_alloc(); + if (rc) + goto unlock; + + mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paiext_map allocated? */ + rc = -ENOMEM; + cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); + if (!cpump) + goto unlock; + + /* Allocate memory for counter area and counter extraction. + * These are + * - a 512 byte block and requires 512 byte boundary alignment. + * - a 1KB byte block and requires 1KB boundary alignment. + * Only the first counting event has to allocate the area. + * + * Note: This works with commit 59bb47985c1d by default. + * Backporting this to kernels without this commit might + * need adjustment. + */ + mp->mapptr = cpump; + cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL); + cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL); + cpump->save = kvmalloc_array(paiext_cnt + 1, + sizeof(struct pai_userdata), + GFP_KERNEL); + if (!cpump->save || !cpump->area || !cpump->paiext_cb) { + paiext_free(mp); + goto unlock; + } + cpump->mode = a->sample_period ? PAI_MODE_SAMPLING + : PAI_MODE_COUNTER; + } else { + /* Multiple invocation, check whats active. + * Supported are multiple counter events or only one sampling + * event concurrently at any one time. + */ + if (cpump->mode == PAI_MODE_SAMPLING || + (cpump->mode == PAI_MODE_COUNTER && a->sample_period)) { + rc = -EBUSY; + goto unlock; + } + } + + rc = 0; + cpump->event = event; + ++cpump->refcnt; + +unlock: + if (rc) { + /* Error in allocation of event, decrement anchor. Since + * the event in not created, its destroy() function is never + * invoked. Adjust the reference counter for the anchor. + */ + paiext_root_free(); + } + mutex_unlock(&paiext_reserve_mutex); + /* If rc is non-zero, no increment of counter/sampler was done. */ + return rc; +} + +/* The PAI extension 1 control block supports up to 128 entries. Return + * the index within PAIE1_CB given the event number. Also validate event + * number. + */ +static int paiext_event_valid(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) { + /* Offset NNPA in paiext_cb */ + event->hw.config_base = offsetof(struct paiext_cb, acc); + return 0; + } + return -EINVAL; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paiext_event_init(struct perf_event *event) +{ + struct perf_event_attr *a = &event->attr; + int rc; + + /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* PAI extension event must be valid and in supported range */ + rc = paiext_event_valid(event); + if (rc) + return rc; + /* Allow only CPU wide operation, no process context for now. */ + if (event->hw.target || event->cpu == -1) + return -ENOENT; + /* Allow only event NNPA_ALL for sampling. */ + if (a->sample_period && a->config != PAI_NNPA_BASE) + return -EINVAL; + /* Prohibit exclude_user event selection */ + if (a->exclude_user) + return -EINVAL; + + rc = paiext_alloc(a, event); + if (rc) + return rc; + event->hw.last_tag = 0; + event->destroy = paiext_event_destroy; + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which are the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } + + return 0; +} + +static u64 paiext_getctr(struct paiext_map *cpump, int nr) +{ + return cpump->area[nr]; +} + +/* Read the counter values. Return value from location in buffer. For event + * NNPA_ALL sum up all events. + */ +static u64 paiext_getdata(struct perf_event *event) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + u64 sum = 0; + int i; + + if (event->attr.config != PAI_NNPA_BASE) + return paiext_getctr(cpump, event->attr.config - PAI_NNPA_BASE); + + for (i = 1; i <= paiext_cnt; i++) + sum += paiext_getctr(cpump, i); + + return sum; +} + +static u64 paiext_getall(struct perf_event *event) +{ + return paiext_getdata(event); +} + +static void paiext_read(struct perf_event *event) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = paiext_getall(event); + local64_set(&event->hw.prev_count, new); + delta = new - prev; + local64_add(delta, &event->count); +} + +static void paiext_start(struct perf_event *event, int flags) +{ + u64 sum; + + if (event->hw.last_tag) + return; + event->hw.last_tag = 1; + sum = paiext_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + local64_set(&event->count, 0); +} + +static int paiext_add(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + if (++cpump->active_events == 1) { + S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb); + pcb->acc = virt_to_phys(cpump->area) | 0x1; + /* Enable CPU instruction lookup for PAIE1 control block */ + __ctl_set_bit(0, 49); + debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", + __func__, S390_lowcore.aicd, pcb->acc); + } + if (flags & PERF_EF_START && !event->attr.sample_period) { + /* Only counting needs initial counter value */ + paiext_start(event, PERF_EF_RELOAD); + } + event->hw.state = 0; + if (event->attr.sample_period) { + cpump->event = event; + perf_sched_cb_inc(event->pmu); + } + return 0; +} + +static void paiext_stop(struct perf_event *event, int flags) +{ + paiext_read(event); + event->hw.state = PERF_HES_STOPPED; +} + +static void paiext_del(struct perf_event *event, int flags) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + if (event->attr.sample_period) + perf_sched_cb_dec(event->pmu); + if (!event->attr.sample_period) { + /* Only counting needs to read counter */ + paiext_stop(event, PERF_EF_UPDATE); + } + if (--cpump->active_events == 0) { + /* Disable CPU instruction lookup for PAIE1 control block */ + __ctl_clear_bit(0, 49); + pcb->acc = 0; + S390_lowcore.aicd = 0; + debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", + __func__, S390_lowcore.aicd, pcb->acc); + } +} + +/* Create raw data and save it in buffer. Returns number of bytes copied. + * Saves only positive counter entries of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t paiext_copy(struct paiext_map *cpump) +{ + struct pai_userdata *userdata = cpump->save; + int i, outidx = 0; + + for (i = 1; i <= paiext_cnt; i++) { + u64 val = paiext_getctr(cpump, i); + + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(*userdata); +} + +/* Write sample when one or more counters values are nonzero. + * + * Note: The function paiext_sched_task() and paiext_push_sample() are not + * invoked after function paiext_del() has been called because of function + * perf_sched_cb_dec(). + * The function paiext_sched_task() and paiext_push_sample() are only + * called when sampling is active. Function perf_sched_cb_inc() + * has been invoked to install function paiext_sched_task() as call back + * to run at context switch time (see paiext_add()). + * + * This causes function perf_event_context_sched_out() and + * perf_event_context_sched_in() to check whether the PMU has installed an + * sched_task() callback. That callback is not active after paiext_del() + * returns and has deleted the event on that CPU. + */ +static int paiext_push_sample(void) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct perf_event *event = cpump->event; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + size_t rawsize; + int overflow; + + rawsize = paiext_copy(cpump); + if (!rawsize) /* No incremented counters */ + return 0; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = smp_processor_id(); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + raw.size = raw.frag.size; + data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Clear lowcore area after read */ + memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ); + return overflow; +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event NNPA_ALL is allowed. + */ +static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, clear values. + */ + if (!sched_in) + paiext_push_sample(); +} + +/* Attribute definitions for pai extension1 interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instructions returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1800 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography + * counters. + * Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paiext_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paiext_events_group = { + .name = "events", + .attrs = NULL, /* Filled in attr_event_init() */ +}; + +static struct attribute_group paiext_format_group = { + .name = "format", + .attrs = paiext_format_attr, +}; + +static const struct attribute_group *paiext_attr_groups[] = { + &paiext_events_group, + &paiext_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paiext = { + .task_ctx_nr = perf_invalid_context, + .event_init = paiext_event_init, + .add = paiext_add, + .del = paiext_del, + .start = paiext_start, + .stop = paiext_stop, + .read = paiext_read, + .sched_task = paiext_sched_task, + .attr_groups = paiext_attr_groups, +}; + +/* List of symbolic PAI extension 1 NNPA counter names. */ +static const char * const paiext_ctrnames[] = { + [0] = "NNPA_ALL", + [1] = "NNPA_ADD", + [2] = "NNPA_SUB", + [3] = "NNPA_MUL", + [4] = "NNPA_DIV", + [5] = "NNPA_MIN", + [6] = "NNPA_MAX", + [7] = "NNPA_LOG", + [8] = "NNPA_EXP", + [9] = "NNPA_IBM_RESERVED_9", + [10] = "NNPA_RELU", + [11] = "NNPA_TANH", + [12] = "NNPA_SIGMOID", + [13] = "NNPA_SOFTMAX", + [14] = "NNPA_BATCHNORM", + [15] = "NNPA_MAXPOOL2D", + [16] = "NNPA_AVGPOOL2D", + [17] = "NNPA_LSTMACT", + [18] = "NNPA_GRUACT", + [19] = "NNPA_CONVOLUTION", + [20] = "NNPA_MATMUL_OP", + [21] = "NNPA_MATMUL_OP_BCAST23", + [22] = "NNPA_SMALLBATCH", + [23] = "NNPA_LARGEDIM", + [24] = "NNPA_SMALLTENSOR", + [25] = "NNPA_1MFRAME", + [26] = "NNPA_2GFRAME", + [27] = "NNPA_ACCESSEXCEPT", +}; + +static void __init attr_event_free(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + struct device_attribute *dap; + int i; + + for (i = 0; i < num; i++) { + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static int __init attr_event_init_one(struct attribute **attrs, int num) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return -ENOMEM; + + sysfs_attr_init(&pa->attr.attr); + pa->id = PAI_NNPA_BASE + num; + pa->attr.attr.name = paiext_ctrnames[num]; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + attrs[num] = &pa->attr.attr; + return 0; +} + +/* Create PMU sysfs event attributes on the fly. */ +static int __init attr_event_init(void) +{ + struct attribute **attrs; + int ret, i; + + attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs), + GFP_KERNEL); + if (!attrs) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) { + ret = attr_event_init_one(attrs, i); + if (ret) { + attr_event_free(attrs, i - 1); + return ret; + } + } + attrs[i] = NULL; + paiext_events_group.attrs = attrs; + return 0; +} + +static int __init paiext_init(void) +{ + struct qpaci_info_block ib; + int rc = -ENOMEM; + + if (!test_facility(197)) + return 0; + + qpaci(&ib); + paiext_cnt = ib.num_nnpa; + if (paiext_cnt >= PAI_NNPA_MAXCTR) + paiext_cnt = PAI_NNPA_MAXCTR; + if (!paiext_cnt) + return 0; + + rc = attr_event_init(); + if (rc) { + pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n"); + return rc; + } + + /* Setup s390dbf facility */ + paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128); + if (!paiext_dbg) { + pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n"); + rc = -ENOMEM; + goto out_init; + } + debug_register_view(paiext_dbg, &debug_sprintf_view); + + rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1); + if (rc) { + pr_err("Registration of " KMSG_COMPONENT " PMU failed with " + "rc=%i\n", rc); + goto out_pmu; + } + + return 0; + +out_pmu: + debug_unregister_view(paiext_dbg, &debug_sprintf_view); + debug_unregister(paiext_dbg); +out_init: + attr_event_free(paiext_events_group.attrs, + ARRAY_SIZE(paiext_ctrnames) + 1); + return rc; +} + +device_initcall(paiext_init); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e8858b2de24b..42af4b3aa02b 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -91,12 +91,26 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) memcpy(dst, src, arch_task_struct_size); dst->thread.fpu.regs = dst->thread.fpu.fprs; + + /* + * Don't transfer over the runtime instrumentation or the guarded + * storage control block pointers. These fields are cleared here instead + * of in copy_thread() to avoid premature freeing of associated memory + * on fork() failure. Wait to clear the RI flag because ->stack still + * refers to the source thread. + */ + dst->thread.ri_cb = NULL; + dst->thread.gs_cb = NULL; + dst->thread.gs_bc_cb = NULL; + return 0; } -int copy_thread(unsigned long clone_flags, unsigned long new_stackp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long new_stackp = args->stack; + unsigned long tls = args->tls; struct fake_frame { struct stack_frame sf; @@ -130,16 +144,15 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, frame->sf.gprs[9] = (unsigned long)frame; /* Store access registers to kernel stack of new process. */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; frame->childregs.psw.addr = (unsigned long)__ret_from_fork; - frame->childregs.gprs[9] = new_stackp; /* function */ - frame->childregs.gprs[10] = arg; - frame->childregs.gprs[11] = (unsigned long)do_exit; + frame->childregs.gprs[9] = (unsigned long)args->fn; + frame->childregs.gprs[10] = (unsigned long)args->fn_arg; frame->childregs.orig_gpr2 = -1; frame->childregs.last_break = 1; return 0; @@ -149,13 +162,11 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, frame->childregs.flags = 0; if (new_stackp) frame->childregs.gprs[15] = new_stackp; - - /* Don't copy runtime instrumentation info */ - p->thread.ri_cb = NULL; + /* + * Clear the runtime instrumentation flag after the above childregs + * copy. The CB pointer was already cleared in arch_dup_task_struct(). + */ frame->childregs.psw.mask &= ~PSW_MASK_RI; - /* Don't copy guarded storage control block */ - p->thread.gs_cb = NULL; - p->thread.gs_bc_cb = NULL; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { @@ -213,13 +224,13 @@ unsigned long __get_wchan(struct task_struct *p) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= prandom_u32_max(PAGE_SIZE); return sp & ~0xf; } static inline unsigned long brk_rnd(void) { - return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; + return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT; } unsigned long arch_randomize_brk(struct mm_struct *mm) diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index d9d4a806979e..a194611ba88c 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -8,7 +8,6 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/stop_machine.h> -#include <linux/cpufeature.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/random.h> @@ -96,15 +95,6 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, current); } -/* - * cpu_have_feature - Test CPU features on module initialization - */ -int cpu_have_feature(unsigned int num) -{ - return elf_hwcap & (1UL << num); -} -EXPORT_SYMBOL(cpu_have_feature); - static void show_facilities(struct seq_file *m) { unsigned int bit; @@ -172,8 +162,7 @@ static void show_cpu_summary(struct seq_file *m, void *v) static int __init setup_hwcaps(void) { /* instructions named N3, "backported" to esa-mode */ - if (test_facility(0)) - elf_hwcap |= HWCAP_ESAN3; + elf_hwcap |= HWCAP_ESAN3; /* z/Architecture mode active */ elf_hwcap |= HWCAP_ZARCH; @@ -191,8 +180,7 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_LDISP; /* extended-immediate */ - if (test_facility(21)) - elf_hwcap |= HWCAP_EIMM; + elf_hwcap |= HWCAP_EIMM; /* extended-translation facility 3 enhancement */ if (test_facility(22) && test_facility(30)) @@ -262,21 +250,7 @@ static int __init setup_elf_platform(void) get_cpu_id(&cpu_id); add_device_randomness(&cpu_id, sizeof(cpu_id)); switch (cpu_id.machine) { - case 0x2064: - case 0x2066: - default: /* Use "z900" as default for 64 bit kernels. */ - strcpy(elf_platform, "z900"); - break; - case 0x2084: - case 0x2086: - strcpy(elf_platform, "z990"); - break; - case 0x2094: - case 0x2096: - strcpy(elf_platform, "z9-109"); - break; - case 0x2097: - case 0x2098: + default: /* Use "z10" as default. */ strcpy(elf_platform, "z10"); break; case 0x2817: @@ -299,6 +273,10 @@ static int __init setup_elf_platform(void) case 0x8562: strcpy(elf_platform, "z15"); break; + case 0x3931: + case 0x3932: + strcpy(elf_platform, "z16"); + break; } return 0; } diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 0ea3d02b378d..53e0209229f8 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -21,7 +21,6 @@ #include <linux/signal.h> #include <linux/elf.h> #include <linux/regset.h> -#include <linux/tracehook.h> #include <linux/seccomp.h> #include <linux/compat.h> #include <trace/syscall.h> @@ -147,38 +146,36 @@ void ptrace_disable(struct task_struct *task) static inline unsigned long __peek_user_per(struct task_struct *child, addr_t addr) { - struct per_struct_kernel *dummy = NULL; - - if (addr == (addr_t) &dummy->cr9) + if (addr == offsetof(struct per_struct_kernel, cr9)) /* Control bits of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == (addr_t) &dummy->cr10) + else if (addr == offsetof(struct per_struct_kernel, cr10)) /* Start address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? 0 : child->thread.per_user.start; - else if (addr == (addr_t) &dummy->cr11) + else if (addr == offsetof(struct per_struct_kernel, cr11)) /* End address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? -1UL : child->thread.per_user.end; - else if (addr == (addr_t) &dummy->bits) + else if (addr == offsetof(struct per_struct_kernel, bits)) /* Single-step bit. */ return test_thread_flag(TIF_SINGLE_STEP) ? (1UL << (BITS_PER_LONG - 1)) : 0; - else if (addr == (addr_t) &dummy->starting_addr) + else if (addr == offsetof(struct per_struct_kernel, starting_addr)) /* Start address of the user specified per set. */ return child->thread.per_user.start; - else if (addr == (addr_t) &dummy->ending_addr) + else if (addr == offsetof(struct per_struct_kernel, ending_addr)) /* End address of the user specified per set. */ return child->thread.per_user.end; - else if (addr == (addr_t) &dummy->perc_atmid) + else if (addr == offsetof(struct per_struct_kernel, perc_atmid)) /* PER code, ATMID and AI of the last PER trap */ return (unsigned long) child->thread.per_event.cause << (BITS_PER_LONG - 16); - else if (addr == (addr_t) &dummy->address) + else if (addr == offsetof(struct per_struct_kernel, address)) /* Address of the last PER trap */ return child->thread.per_event.address; - else if (addr == (addr_t) &dummy->access_id) + else if (addr == offsetof(struct per_struct_kernel, access_id)) /* Access id of the last PER trap */ return (unsigned long) child->thread.per_event.paid << (BITS_PER_LONG - 8); @@ -196,61 +193,60 @@ static inline unsigned long __peek_user_per(struct task_struct *child, */ static unsigned long __peek_user(struct task_struct *child, addr_t addr) { - struct user *dummy = NULL; addr_t offset, tmp; - if (addr < (addr_t) &dummy->regs.acrs) { + if (addr < offsetof(struct user, regs.acrs)) { /* * psw and gprs are stored on the stack */ tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr); - if (addr == (addr_t) &dummy->regs.psw.mask) { + if (addr == offsetof(struct user, regs.psw.mask)) { /* Return a clean psw mask. */ tmp &= PSW_MASK_USER | PSW_MASK_RI; tmp |= PSW_USER_BITS; } - } else if (addr < (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr < offsetof(struct user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy->regs.acrs; + offset = addr - offsetof(struct user, regs.acrs); /* * Very special case: old & broken 64 bit gdb reading * from acrs[15]. Result is a 64 bit value. Read the * 32 bit acrs[15] value and shift it by 32. Sick... */ - if (addr == (addr_t) &dummy->regs.acrs[15]) + if (addr == offsetof(struct user, regs.acrs[15])) tmp = ((unsigned long) child->thread.acrs[15]) << 32; else tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset); - } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ tmp = (addr_t) task_pt_regs(child)->orig_gpr2; - } else if (addr < (addr_t) &dummy->regs.fp_regs) { + } else if (addr < offsetof(struct user, regs.fp_regs)) { /* * prevent reads of padding hole between * orig_gpr2 and fp_regs on s390. */ tmp = 0; - } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ tmp = child->thread.fpu.fpc; tmp <<= BITS_PER_LONG - 32; - } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; + offset = addr - offsetof(struct user, regs.fp_regs.fprs); if (MACHINE_HAS_VX) tmp = *(addr_t *) ((addr_t) child->thread.fpu.vxrs + 2*offset); @@ -258,11 +254,11 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr) tmp = *(addr_t *) ((addr_t) child->thread.fpu.fprs + offset); - } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy->regs.per_info; + addr -= offsetof(struct user, regs.per_info); tmp = __peek_user_per(child, addr); } else @@ -281,8 +277,8 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data) * an alignment of 4. Programmers from hell... */ mask = __ADDR_MASK; - if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs && - addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2) + if (addr >= offsetof(struct user, regs.acrs) && + addr < offsetof(struct user, regs.orig_gpr2)) mask = 3; if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) return -EIO; @@ -294,8 +290,6 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data) static inline void __poke_user_per(struct task_struct *child, addr_t addr, addr_t data) { - struct per_struct_kernel *dummy = NULL; - /* * There are only three fields in the per_info struct that the * debugger user can write to. @@ -308,14 +302,14 @@ static inline void __poke_user_per(struct task_struct *child, * addresses are used only if single stepping is not in effect. * Writes to any other field in per_info are ignored. */ - if (addr == (addr_t) &dummy->cr9) + if (addr == offsetof(struct per_struct_kernel, cr9)) /* PER event mask of the user specified per set. */ child->thread.per_user.control = data & (PER_EVENT_MASK | PER_CONTROL_MASK); - else if (addr == (addr_t) &dummy->starting_addr) + else if (addr == offsetof(struct per_struct_kernel, starting_addr)) /* Starting address of the user specified per set. */ child->thread.per_user.start = data; - else if (addr == (addr_t) &dummy->ending_addr) + else if (addr == offsetof(struct per_struct_kernel, ending_addr)) /* Ending address of the user specified per set. */ child->thread.per_user.end = data; } @@ -328,16 +322,15 @@ static inline void __poke_user_per(struct task_struct *child, */ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) { - struct user *dummy = NULL; addr_t offset; - if (addr < (addr_t) &dummy->regs.acrs) { + if (addr < offsetof(struct user, regs.acrs)) { struct pt_regs *regs = task_pt_regs(child); /* * psw and gprs are stored on the stack */ - if (addr == (addr_t) &dummy->regs.psw.mask) { + if (addr == offsetof(struct user, regs.psw.mask)) { unsigned long mask = PSW_MASK_USER; mask |= is_ri_task(child) ? PSW_MASK_RI : 0; @@ -359,36 +352,36 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) regs->int_code = 0x20000 | (data & 0xffff); } *(addr_t *)((addr_t) ®s->psw + addr) = data; - } else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) { + } else if (addr < offsetof(struct user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy->regs.acrs; + offset = addr - offsetof(struct user, regs.acrs); /* * Very special case: old & broken 64 bit gdb writing * to acrs[15] with a 64 bit value. Ignore the lower * half of the value and write the upper 32 bit to * acrs[15]. Sick... */ - if (addr == (addr_t) &dummy->regs.acrs[15]) + if (addr == offsetof(struct user, regs.acrs[15])) child->thread.acrs[15] = (unsigned int) (data >> 32); else *(addr_t *)((addr_t) &child->thread.acrs + offset) = data; - } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ task_pt_regs(child)->orig_gpr2 = data; - } else if (addr < (addr_t) &dummy->regs.fp_regs) { + } else if (addr < offsetof(struct user, regs.fp_regs)) { /* * prevent writes of padding hole between * orig_gpr2 and fp_regs on s390. */ return 0; - } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ @@ -397,12 +390,12 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) return -EINVAL; child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32); - } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; + offset = addr - offsetof(struct user, regs.fp_regs.fprs); if (MACHINE_HAS_VX) *(addr_t *)((addr_t) child->thread.fpu.vxrs + 2*offset) = data; @@ -410,11 +403,11 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) *(addr_t *)((addr_t) child->thread.fpu.fprs + offset) = data; - } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy->regs.per_info; + addr -= offsetof(struct user, regs.per_info); __poke_user_per(child, addr, data); } @@ -431,8 +424,8 @@ static int poke_user(struct task_struct *child, addr_t addr, addr_t data) * an alignment of 4. Programmers from hell indeed... */ mask = __ADDR_MASK; - if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs && - addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2) + if (addr >= offsetof(struct user, regs.acrs) && + addr < offsetof(struct user, regs.orig_gpr2)) mask = 3; if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) return -EIO; @@ -540,37 +533,35 @@ long arch_ptrace(struct task_struct *child, long request, static inline __u32 __peek_user_per_compat(struct task_struct *child, addr_t addr) { - struct compat_per_struct_kernel *dummy32 = NULL; - - if (addr == (addr_t) &dummy32->cr9) + if (addr == offsetof(struct compat_per_struct_kernel, cr9)) /* Control bits of the active per set. */ return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == (addr_t) &dummy32->cr10) + else if (addr == offsetof(struct compat_per_struct_kernel, cr10)) /* Start address of the active per set. */ return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? 0 : child->thread.per_user.start; - else if (addr == (addr_t) &dummy32->cr11) + else if (addr == offsetof(struct compat_per_struct_kernel, cr11)) /* End address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? PSW32_ADDR_INSN : child->thread.per_user.end; - else if (addr == (addr_t) &dummy32->bits) + else if (addr == offsetof(struct compat_per_struct_kernel, bits)) /* Single-step bit. */ return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? 0x80000000 : 0; - else if (addr == (addr_t) &dummy32->starting_addr) + else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) /* Start address of the user specified per set. */ return (__u32) child->thread.per_user.start; - else if (addr == (addr_t) &dummy32->ending_addr) + else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) /* End address of the user specified per set. */ return (__u32) child->thread.per_user.end; - else if (addr == (addr_t) &dummy32->perc_atmid) + else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid)) /* PER code, ATMID and AI of the last PER trap */ return (__u32) child->thread.per_event.cause << 16; - else if (addr == (addr_t) &dummy32->address) + else if (addr == offsetof(struct compat_per_struct_kernel, address)) /* Address of the last PER trap */ return (__u32) child->thread.per_event.address; - else if (addr == (addr_t) &dummy32->access_id) + else if (addr == offsetof(struct compat_per_struct_kernel, access_id)) /* Access id of the last PER trap */ return (__u32) child->thread.per_event.paid << 24; return 0; @@ -581,21 +572,20 @@ static inline __u32 __peek_user_per_compat(struct task_struct *child, */ static u32 __peek_user_compat(struct task_struct *child, addr_t addr) { - struct compat_user *dummy32 = NULL; addr_t offset; __u32 tmp; - if (addr < (addr_t) &dummy32->regs.acrs) { + if (addr < offsetof(struct compat_user, regs.acrs)) { struct pt_regs *regs = task_pt_regs(child); /* * psw and gprs are stored on the stack */ - if (addr == (addr_t) &dummy32->regs.psw.mask) { + if (addr == offsetof(struct compat_user, regs.psw.mask)) { /* Fake a 31 bit psw mask. */ tmp = (__u32)(regs->psw.mask >> 32); tmp &= PSW32_MASK_USER | PSW32_MASK_RI; tmp |= PSW32_USER_BITS; - } else if (addr == (addr_t) &dummy32->regs.psw.addr) { + } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { /* Fake a 31 bit psw address. */ tmp = (__u32) regs->psw.addr | (__u32)(regs->psw.mask & PSW_MASK_BA); @@ -603,38 +593,38 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr) /* gpr 0-15 */ tmp = *(__u32 *)((addr_t) ®s->psw + addr*2 + 4); } - } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) { + } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy32->regs.acrs; + offset = addr - offsetof(struct compat_user, regs.acrs); tmp = *(__u32*)((addr_t) &child->thread.acrs + offset); - } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) { + } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); - } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { /* * prevent reads of padding hole between * orig_gpr2 and fp_regs on s390. */ tmp = 0; - } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ tmp = child->thread.fpu.fpc; - } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; + offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); if (MACHINE_HAS_VX) tmp = *(__u32 *) ((addr_t) child->thread.fpu.vxrs + 2*offset); @@ -642,11 +632,11 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr) tmp = *(__u32 *) ((addr_t) child->thread.fpu.fprs + offset); - } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { + } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy32->regs.per_info; + addr -= offsetof(struct compat_user, regs.per_info); tmp = __peek_user_per_compat(child, addr); } else @@ -673,16 +663,14 @@ static int peek_user_compat(struct task_struct *child, static inline void __poke_user_per_compat(struct task_struct *child, addr_t addr, __u32 data) { - struct compat_per_struct_kernel *dummy32 = NULL; - - if (addr == (addr_t) &dummy32->cr9) + if (addr == offsetof(struct compat_per_struct_kernel, cr9)) /* PER event mask of the user specified per set. */ child->thread.per_user.control = data & (PER_EVENT_MASK | PER_CONTROL_MASK); - else if (addr == (addr_t) &dummy32->starting_addr) + else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr)) /* Starting address of the user specified per set. */ child->thread.per_user.start = data; - else if (addr == (addr_t) &dummy32->ending_addr) + else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr)) /* Ending address of the user specified per set. */ child->thread.per_user.end = data; } @@ -693,16 +681,15 @@ static inline void __poke_user_per_compat(struct task_struct *child, static int __poke_user_compat(struct task_struct *child, addr_t addr, addr_t data) { - struct compat_user *dummy32 = NULL; __u32 tmp = (__u32) data; addr_t offset; - if (addr < (addr_t) &dummy32->regs.acrs) { + if (addr < offsetof(struct compat_user, regs.acrs)) { struct pt_regs *regs = task_pt_regs(child); /* * psw, gprs, acrs and orig_gpr2 are stored on the stack */ - if (addr == (addr_t) &dummy32->regs.psw.mask) { + if (addr == offsetof(struct compat_user, regs.psw.mask)) { __u32 mask = PSW32_MASK_USER; mask |= is_ri_task(child) ? PSW32_MASK_RI : 0; @@ -716,7 +703,7 @@ static int __poke_user_compat(struct task_struct *child, regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) | (regs->psw.mask & PSW_MASK_BA) | (__u64)(tmp & mask) << 32; - } else if (addr == (addr_t) &dummy32->regs.psw.addr) { + } else if (addr == offsetof(struct compat_user, regs.psw.addr)) { /* Build a 64 bit psw address from 31 bit address. */ regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN; /* Transfer 31 bit amode bit to psw mask. */ @@ -732,27 +719,27 @@ static int __poke_user_compat(struct task_struct *child, /* gpr 0-15 */ *(__u32*)((addr_t) ®s->psw + addr*2 + 4) = tmp; } - } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) { + } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy32->regs.acrs; + offset = addr - offsetof(struct compat_user, regs.acrs); *(__u32*)((addr_t) &child->thread.acrs + offset) = tmp; - } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) { + } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; - } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs)) { /* * prevent writess of padding hole between * orig_gpr2 and fp_regs on s390. */ return 0; - } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ @@ -760,12 +747,12 @@ static int __poke_user_compat(struct task_struct *child, return -EINVAL; child->thread.fpu.fpc = data; - } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* * floating point regs. are either in child->thread.fpu * or the child->thread.fpu.vxrs array */ - offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; + offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs); if (MACHINE_HAS_VX) *(__u32 *)((addr_t) child->thread.fpu.vxrs + 2*offset) = tmp; @@ -773,11 +760,11 @@ static int __poke_user_compat(struct task_struct *child, *(__u32 *)((addr_t) child->thread.fpu.fprs + offset) = tmp; - } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { + } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy32->regs.per_info; + addr -= offsetof(struct compat_user, regs.per_info); __poke_user_per_compat(child, addr, data); } diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index fe396673e8a6..a9a1a6f45375 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -2,8 +2,7 @@ /* * Copyright IBM Corp. 2005 * - * Author(s): Rolf Adelsberger, - * Heiko Carstens <heiko.carstens@de.ibm.com> + * Author(s): Rolf Adelsberger * */ @@ -15,6 +14,7 @@ * moves the new kernel to its destination... * %r2 = pointer to first kimage_entry_t * %r3 = start address - where to jump to after the job is done... + * %r4 = subcode * * %r5 will be used as temp. storage * %r6 holds the destination address @@ -57,7 +57,7 @@ ENTRY(relocate_kernel) jo 0b j .base .done: - sgr %r0,%r0 # clear register r0 + lgr %r0,%r4 # subcode cghi %r3,0 je .diag la %r4,load_psw-.base(%r13) # load psw-address into the register diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 225ab2d0a4c6..ab19ddb09d65 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -58,7 +58,7 @@ #include <asm/smp.h> #include <asm/mmu_context.h> #include <asm/cpcmd.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/nmi.h> #include <asm/irq.h> #include <asm/page.h> @@ -74,6 +74,7 @@ #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/mem_detect.h> +#include <asm/maccess.h> #include <asm/uv.h> #include <asm/asm-offsets.h> #include "entry.h" @@ -395,6 +396,7 @@ void __init arch_call_rest_init(void) { unsigned long stack; + smp_reinit_ipl_cpu(); stack = stack_alloc(); if (!stack) panic("Couldn't allocate kernel stack"); @@ -411,8 +413,9 @@ void __init arch_call_rest_init(void) static void __init setup_lowcore_dat_off(void) { unsigned long int_psw_mask = PSW_KERNEL_BITS; + struct lowcore *abs_lc, *lc; unsigned long mcck_stack; - struct lowcore *lc; + unsigned long flags; if (IS_ENABLED(CONFIG_KASAN)) int_psw_mask |= PSW_MASK_DAT; @@ -445,7 +448,7 @@ static void __init setup_lowcore_dat_off(void) lc->lpp = LPP_MAGIC; lc->machine_flags = S390_lowcore.machine_flags; lc->preempt_count = S390_lowcore.preempt_count; - nmi_alloc_boot_cpu(lc); + nmi_alloc_mcesa_early(&lc->mcesad); lc->sys_enter_timer = S390_lowcore.sys_enter_timer; lc->exit_timer = S390_lowcore.exit_timer; lc->user_timer = S390_lowcore.user_timer; @@ -474,46 +477,53 @@ static void __init setup_lowcore_dat_off(void) lc->restart_data = 0; lc->restart_source = -1U; + abs_lc = get_abs_lowcore(&flags); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc, flags); + mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); if (!mcck_stack) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, THREAD_SIZE, THREAD_SIZE); lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET; - /* Setup absolute zero lowcore */ - mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack); - mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn); - mem_assign_absolute(S390_lowcore.restart_data, lc->restart_data); - mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source); - mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw); - lc->spinlock_lockval = arch_spin_lockval(0); lc->spinlock_index = 0; arch_spin_lock_setup(0); - lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; - set_prefix((u32)(unsigned long) lc); + set_prefix(__pa(lc)); lowcore_ptr[0] = lc; } static void __init setup_lowcore_dat_on(void) { - struct lowcore *lc = lowcore_ptr[0]; + struct lowcore *abs_lc; + unsigned long flags; __ctl_clear_bit(0, 28); S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; - __ctl_store(S390_lowcore.cregs_save_area, 0, 15); __ctl_set_bit(0, 28); - mem_assign_absolute(S390_lowcore.restart_flags, RESTART_FLAG_CTLREGS); - mem_assign_absolute(S390_lowcore.program_new_psw, lc->program_new_psw); - memcpy_absolute(&S390_lowcore.cregs_save_area, lc->cregs_save_area, - sizeof(S390_lowcore.cregs_save_area)); + __ctl_store(S390_lowcore.cregs_save_area, 0, 15); + if (abs_lowcore_map(0, lowcore_ptr[0], true)) + panic("Couldn't setup absolute lowcore"); + abs_lowcore_mapped = true; + abs_lc = get_abs_lowcore(&flags); + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + abs_lc->program_new_psw = S390_lowcore.program_new_psw; + memcpy(abs_lc->cregs_save_area, S390_lowcore.cregs_save_area, + sizeof(abs_lc->cregs_save_area)); + put_abs_lowcore(abs_lc, flags); } static struct resource code_resource = { @@ -800,6 +810,8 @@ static void __init check_initrd(void) static void __init reserve_kernel(void) { memblock_reserve(0, STARTUP_NORMAL_OFFSET); + memblock_reserve(OLDMEM_BASE, sizeof(unsigned long)); + memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long)); memblock_reserve(__amode31_base, __eamode31 - __samode31); memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); memblock_reserve(__pa(_stext), _end - _stext); @@ -873,6 +885,9 @@ static void __init setup_randomness(void) if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free(vmms, PAGE_SIZE); + + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + static_branch_enable(&s390_arch_random_available); } /* @@ -1014,10 +1029,10 @@ void __init setup_arch(char **cmdline_p) reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP /* - * Be aware that smp_save_dump_cpus() triggers a system reset. + * Be aware that smp_save_dump_secondary_cpus() triggers a system reset. * Therefore CPU and device initialization should be done afterwards. */ - smp_save_dump_cpus(); + smp_save_dump_secondary_cpus(); #endif setup_resources(); @@ -1036,12 +1051,15 @@ void __init setup_arch(char **cmdline_p) * Create kernel page tables and switch to virtual addressing. */ paging_init(); - + memcpy_real_init(); /* * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. */ setup_lowcore_dat_on(); +#ifdef CONFIG_CRASH_DUMP + smp_save_dump_ipl_cpu(); +#endif /* Setup default console */ conmode_default(); diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 307f5d99514d..38258f817048 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -25,7 +25,6 @@ #include <linux/tty.h> #include <linux/personality.h> #include <linux/binfmts.h> -#include <linux/tracehook.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <asm/ucontext.h> @@ -141,7 +140,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) { _sigregs user_sregs; - /* Alwys make any pending restarted system call return -EINTR */ + /* Always make any pending restarted system call return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) @@ -453,7 +452,7 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset, * stack-frames in one go after that. */ -void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) +void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; sigset_t *oldset = sigmask_to_save(); @@ -466,7 +465,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) current->thread.system_call = test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0; - if (has_signal && get_signal(&ksig)) { + if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ if (current->thread.system_call) { regs->int_code = current->thread.system_call; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 78a8ea6fd582..0031325ce4bc 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -5,7 +5,6 @@ * Copyright IBM Corp. 1999, 2012 * Author(s): Denis Joseph Barrow, * Martin Schwidefsky <schwidefsky@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, * * based on other smp stuff by * (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net> @@ -46,7 +45,7 @@ #include <asm/irq.h> #include <asm/tlbflush.h> #include <asm/vtimer.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/sclp.h> #include <asm/debug.h> #include <asm/os_info.h> @@ -56,6 +55,7 @@ #include <asm/stacktrace.h> #include <asm/topology.h> #include <asm/vdso.h> +#include <asm/maccess.h> #include "entry.h" enum { @@ -208,16 +208,19 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; - lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; - if (nmi_alloc_per_cpu(lc)) + if (nmi_alloc_mcesa(&lc->mcesad)) goto out; + if (abs_lowcore_map(cpu, lc, true)) + goto out_mcesa; lowcore_ptr[cpu] = lc; - pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc); + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc)); return 0; +out_mcesa: + nmi_free_mcesa(&lc->mcesad); out: stack_free(mcck_stack); stack_free(async_stack); @@ -239,7 +242,8 @@ static void pcpu_free_lowcore(struct pcpu *pcpu) mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); lowcore_ptr[cpu] = NULL; - nmi_free_per_cpu(lc); + abs_lowcore_unmap(cpu); + nmi_free_mcesa(&lc->mcesad); stack_free(async_stack); stack_free(mcck_stack); free_pages(nodat_stack, THREAD_SIZE_ORDER); @@ -317,9 +321,12 @@ static void pcpu_delegate(struct pcpu *pcpu, pcpu_delegate_fn *func, void *data, unsigned long stack) { - struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; - unsigned int source_cpu = stap(); + struct lowcore *lc, *abs_lc; + unsigned int source_cpu; + unsigned long flags; + lc = lowcore_ptr[pcpu - pcpu_devices]; + source_cpu = stap(); __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); if (pcpu->address == source_cpu) { call_on_stack(2, stack, void, __pcpu_delegate, @@ -328,10 +335,19 @@ static void pcpu_delegate(struct pcpu *pcpu, /* Stop target cpu (if func returns this stops the current cpu). */ pcpu_sigp_retry(pcpu, SIGP_STOP, 0); /* Restart func on the target cpu and stop the current cpu. */ - mem_assign_absolute(lc->restart_stack, stack); - mem_assign_absolute(lc->restart_fn, (unsigned long) func); - mem_assign_absolute(lc->restart_data, (unsigned long) data); - mem_assign_absolute(lc->restart_source, source_cpu); + if (lc) { + lc->restart_stack = stack; + lc->restart_fn = (unsigned long)func; + lc->restart_data = (unsigned long)data; + lc->restart_source = source_cpu; + } else { + abs_lc = get_abs_lowcore(&flags); + abs_lc->restart_stack = stack; + abs_lc->restart_fn = (unsigned long)func; + abs_lc->restart_data = (unsigned long)data; + abs_lc->restart_source = source_cpu; + put_abs_lowcore(abs_lc, flags); + } __bpon(); asm volatile( "0: sigp 0,%0,%2 # sigp restart to target cpu\n" @@ -572,39 +588,31 @@ static void smp_ctl_bit_callback(void *info) } static DEFINE_SPINLOCK(ctl_lock); -static unsigned long ctlreg; -/* - * Set a bit in a control register of all cpus - */ -void smp_ctl_set_bit(int cr, int bit) -{ - struct ec_creg_mask_parms parms = { 1UL << bit, -1UL, cr }; - - spin_lock(&ctl_lock); - memcpy_absolute(&ctlreg, &S390_lowcore.cregs_save_area[cr], sizeof(ctlreg)); - __set_bit(bit, &ctlreg); - memcpy_absolute(&S390_lowcore.cregs_save_area[cr], &ctlreg, sizeof(ctlreg)); - spin_unlock(&ctl_lock); - on_each_cpu(smp_ctl_bit_callback, &parms, 1); -} -EXPORT_SYMBOL(smp_ctl_set_bit); - -/* - * Clear a bit in a control register of all cpus - */ -void smp_ctl_clear_bit(int cr, int bit) +void smp_ctl_set_clear_bit(int cr, int bit, bool set) { - struct ec_creg_mask_parms parms = { 0, ~(1UL << bit), cr }; - + struct ec_creg_mask_parms parms = { .cr = cr, }; + struct lowcore *abs_lc; + unsigned long flags; + u64 ctlreg; + + if (set) { + parms.orval = 1UL << bit; + parms.andval = -1UL; + } else { + parms.orval = 0; + parms.andval = ~(1UL << bit); + } spin_lock(&ctl_lock); - memcpy_absolute(&ctlreg, &S390_lowcore.cregs_save_area[cr], sizeof(ctlreg)); - __clear_bit(bit, &ctlreg); - memcpy_absolute(&S390_lowcore.cregs_save_area[cr], &ctlreg, sizeof(ctlreg)); + abs_lc = get_abs_lowcore(&flags); + ctlreg = abs_lc->cregs_save_area[cr]; + ctlreg = (ctlreg & parms.andval) | parms.orval; + abs_lc->cregs_save_area[cr] = ctlreg; + put_abs_lowcore(abs_lc, flags); spin_unlock(&ctl_lock); on_each_cpu(smp_ctl_bit_callback, &parms, 1); } -EXPORT_SYMBOL(smp_ctl_clear_bit); +EXPORT_SYMBOL(smp_ctl_set_clear_bit); #ifdef CONFIG_CRASH_DUMP @@ -622,7 +630,7 @@ int smp_store_status(int cpu) return -EIO; if (!MACHINE_HAS_VX && !MACHINE_HAS_GS) return 0; - pa = __pa(lc->mcesad & MCESA_ORIGIN_MASK); + pa = lc->mcesad & MCESA_ORIGIN_MASK; if (MACHINE_HAS_GS) pa |= lc->mcesad & MCESA_LC_MASK; if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, @@ -657,42 +665,39 @@ int smp_store_status(int cpu) * This case does not exist for s390 anymore, setup_arch explicitly * deactivates the elfcorehdr= kernel parameter */ -static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr, - bool is_boot_cpu, unsigned long page) +static bool dump_available(void) { - __vector128 *vxrs = (__vector128 *) page; - - if (is_boot_cpu) - vxrs = boot_cpu_vector_save_area; - else - __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, page); - save_area_add_vxrs(sa, vxrs); + return oldmem_data.start || is_ipl_type_dump(); } -static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr, - bool is_boot_cpu, unsigned long page) +void __init smp_save_dump_ipl_cpu(void) { - void *regs = (void *) page; + struct save_area *sa; + void *regs; - if (is_boot_cpu) - copy_oldmem_kernel(regs, (void *) __LC_FPREGS_SAVE_AREA, 512); - else - __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, page); + if (!dump_available()) + return; + sa = save_area_alloc(true); + regs = memblock_alloc(512, 8); + if (!sa || !regs) + panic("could not allocate memory for boot CPU save area\n"); + copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); + memblock_free(regs, 512); + if (MACHINE_HAS_VX) + save_area_add_vxrs(sa, boot_cpu_vector_save_area); } -void __init smp_save_dump_cpus(void) +void __init smp_save_dump_secondary_cpus(void) { int addr, boot_cpu_addr, max_cpu_addr; struct save_area *sa; - unsigned long page; - bool is_boot_cpu; + void *page; - if (!(oldmem_data.start || is_ipl_type_dump())) - /* No previous system present, normal boot. */ + if (!dump_available()) return; /* Allocate a page as dumping area for the store status sigps */ - page = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 1UL << 31); + page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!page) panic("ERROR: Failed to allocate %lx bytes below %lx\n", PAGE_SIZE, 1UL << 31); @@ -702,28 +707,22 @@ void __init smp_save_dump_cpus(void) boot_cpu_addr = stap(); max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev; for (addr = 0; addr <= max_cpu_addr; addr++) { + if (addr == boot_cpu_addr) + continue; if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) == SIGP_CC_NOT_OPERATIONAL) continue; - is_boot_cpu = (addr == boot_cpu_addr); - /* Allocate save area */ - sa = save_area_alloc(is_boot_cpu); + sa = save_area_alloc(false); if (!sa) panic("could not allocate memory for save area\n"); - if (MACHINE_HAS_VX) - /* Get the vector registers */ - smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page); - /* - * For a zfcp/nvme dump OLDMEM_BASE == NULL and the registers - * of the boot CPU are stored in the HSA. To retrieve - * these registers an SCLP request is required which is - * done by drivers/s390/char/zcore.c:init_cpu_info() - */ - if (!is_boot_cpu || oldmem_data.start) - /* Get the CPU registers */ - smp_save_cpu_regs(sa, addr, is_boot_cpu, page); + __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); + save_area_add_regs(sa, page); + if (MACHINE_HAS_VX) { + __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page)); + save_area_add_vxrs(sa, page); + } } - memblock_phys_free(page, PAGE_SIZE); + memblock_free(page, PAGE_SIZE); diag_amode31_ops.diag308_reset(); pcpu_set_smt(0); } @@ -880,7 +879,7 @@ void __init smp_detect_cpus(void) /* Add CPUs present at boot */ __smp_rescan_cpus(info, true); - memblock_phys_free((unsigned long)info, sizeof(*info)); + memblock_free(info, sizeof(*info)); } /* @@ -1257,7 +1256,7 @@ static __always_inline void set_new_lowcore(struct lowcore *lc) src.odd = sizeof(S390_lowcore); dst.even = (unsigned long) lc; dst.odd = sizeof(*lc); - pfx = (unsigned long) lc; + pfx = __pa(lc); asm volatile( " mvcl %[dst],%[src]\n" @@ -1267,18 +1266,19 @@ static __always_inline void set_new_lowcore(struct lowcore *lc) : "memory", "cc"); } -static int __init smp_reinit_ipl_cpu(void) +int __init smp_reinit_ipl_cpu(void) { unsigned long async_stack, nodat_stack, mcck_stack; struct lowcore *lc, *lc_ipl; - unsigned long flags; + unsigned long flags, cr0; + u64 mcesad; lc_ipl = lowcore_ptr[0]; lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); async_stack = stack_alloc(); mcck_stack = stack_alloc(); - if (!lc || !nodat_stack || !async_stack || !mcck_stack) + if (!lc || !nodat_stack || !async_stack || !mcck_stack || nmi_alloc_mcesa(&mcesad)) panic("Couldn't allocate memory"); local_irq_save(flags); @@ -1287,14 +1287,19 @@ static int __init smp_reinit_ipl_cpu(void) S390_lowcore.nodat_stack = nodat_stack + STACK_INIT_OFFSET; S390_lowcore.async_stack = async_stack + STACK_INIT_OFFSET; S390_lowcore.mcck_stack = mcck_stack + STACK_INIT_OFFSET; + __ctl_store(cr0, 0, 0); + __ctl_clear_bit(0, 28); /* disable lowcore protection */ + S390_lowcore.mcesad = mcesad; + __ctl_load(cr0, 0, 0); + if (abs_lowcore_map(0, lc, false)) + panic("Couldn't remap absolute lowcore"); lowcore_ptr[0] = lc; local_mcck_enable(); local_irq_restore(flags); free_pages(lc_ipl->async_stack - STACK_INIT_OFFSET, THREAD_SIZE_ORDER); - memblock_free_late(lc_ipl->mcck_stack - STACK_INIT_OFFSET, THREAD_SIZE); - memblock_free_late((unsigned long) lc_ipl, sizeof(*lc_ipl)); + memblock_free_late(__pa(lc_ipl->mcck_stack - STACK_INIT_OFFSET), THREAD_SIZE); + memblock_free_late(__pa(lc_ipl), sizeof(*lc_ipl)); return 0; } -early_initcall(smp_reinit_ipl_cpu); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index b7bb1981e9ee..7ee455e8e3d5 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -3,7 +3,6 @@ * Stack trace management functions * * Copyright IBM Corp. 2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/stacktrace.h> diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile index b98f25029b8e..fb85e797946d 100644 --- a/arch/s390/kernel/syscalls/Makefile +++ b/arch/s390/kernel/syscalls/Makefile @@ -21,8 +21,7 @@ uapi: $(uapi-hdrs-y) # Create output directory if not already present -_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ - $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') +$(shell mkdir -p $(uapi) $(kapi)) filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $< diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index ed9c5c2eafad..799147658dee 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index ef3f2659876c..b5e364358ce4 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -14,6 +14,7 @@ #include <linux/delay.h> #include <linux/export.h> #include <linux/slab.h> +#include <asm/asm-extable.h> #include <asm/ebcdic.h> #include <asm/debug.h> #include <asm/sysinfo.h> diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S index 868e4a604110..2c8b14cc5556 100644 --- a/arch/s390/kernel/text_amode31.S +++ b/arch/s390/kernel/text_amode31.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <asm/asm-extable.h> #include <asm/errno.h> #include <asm/sigp.h> diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 326cb8f75f58..6b7b6d5e3632 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -364,7 +364,7 @@ static inline int check_sync_clock(void) * Apply clock delta to the global data structures. * This is called once on the CPU that performed the clock sync. */ -static void clock_sync_global(unsigned long delta) +static void clock_sync_global(long delta) { unsigned long now, adj; struct ptff_qto qto; @@ -400,7 +400,7 @@ static void clock_sync_global(unsigned long delta) * Apply clock delta to the per-CPU data structures of this CPU. * This is called for each online CPU after the call to clock_sync_global. */ -static void clock_sync_local(unsigned long delta) +static void clock_sync_local(long delta) { /* Add the delta to the clock comparator. */ if (S390_lowcore.clock_comparator != clock_comparator_max) { @@ -424,7 +424,7 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long clock_delta; + long clock_delta; }; /* @@ -544,7 +544,7 @@ static int stpinfo_valid(void) static int stp_sync_clock(void *data) { struct clock_sync_data *sync = data; - u64 clock_delta, flags; + long clock_delta, flags; static int first; int rc; @@ -554,9 +554,7 @@ static int stp_sync_clock(void *data) while (atomic_read(&sync->cpus) != 0) cpu_relax(); rc = 0; - if (stp_info.todoff[0] || stp_info.todoff[1] || - stp_info.todoff[2] || stp_info.todoff[3] || - stp_info.tmd != 2) { + if (stp_info.todoff || stp_info.tmd != 2) { flags = vdso_update_begin(); rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 58f8291950cb..c6eecd4a5302 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2007, 2011 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #define KMSG_COMPONENT "cpu" diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 2b780786fc68..1d2aa448d103 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -27,6 +27,7 @@ #include <linux/uaccess.h> #include <linux/cpu.h> #include <linux/entry-common.h> +#include <asm/asm-extable.h> #include <asm/fpu/api.h> #include <asm/vtime.h> #include "entry.h" @@ -53,9 +54,7 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) force_sig_fault(si_signo, si_code, get_trap_ip(regs)); report_user_fault(regs, si_signo, 0); } else { - const struct exception_table_entry *fixup; - fixup = s390_search_extables(regs->psw.addr); - if (!fixup || !ex_handle(fixup, regs)) + if (!fixup_exception(regs)) die(regs, str); } } @@ -142,10 +141,10 @@ static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) do_trap(regs, SIGFPE, si_code, "floating point exception"); } -static void translation_exception(struct pt_regs *regs) +static void translation_specification_exception(struct pt_regs *regs) { /* May never happen. */ - panic("Translation exception"); + panic("Translation-Specification Exception"); } static void illegal_op(struct pt_regs *regs) @@ -244,16 +243,12 @@ static void space_switch_exception(struct pt_regs *regs) static void monitor_event_exception(struct pt_regs *regs) { - const struct exception_table_entry *fixup; - if (user_mode(regs)) return; switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) { case BUG_TRAP_TYPE_NONE: - fixup = s390_search_extables(regs->psw.addr); - if (fixup) - ex_handle(fixup, regs); + fixup_exception(regs); break; case BUG_TRAP_TYPE_WARN: break; @@ -291,7 +286,6 @@ static void __init test_monitor_call(void) void __init trap_init(void) { - sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table); local_mcck_enable(); test_monitor_call(); } @@ -303,7 +297,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs) unsigned int trapnr; irqentry_state_t state; - regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc; + regs->int_code = S390_lowcore.pgm_int_code; regs->int_parm_long = S390_lowcore.trans_exc_code; state = irqentry_enter(regs); @@ -328,7 +322,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs) set_thread_flag(TIF_PER_TRAP); ev->address = S390_lowcore.per_address; - ev->cause = *(u16 *)&S390_lowcore.per_code; + ev->cause = S390_lowcore.per_code_combined; ev->paid = S390_lowcore.per_access_id; } else { /* PER event in kernel is kprobes */ @@ -374,7 +368,7 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = { [0x0f] = hfp_divide_exception, [0x10] = do_dat_exception, [0x11] = do_dat_exception, - [0x12] = translation_exception, + [0x12] = translation_specification_exception, [0x13] = special_op_exception, [0x14] = default_trap_handler, [0x15] = operand_exception, diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index 707fd99f6734..0ece156fdd7c 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -64,8 +64,8 @@ bool unwind_next_frame(struct unwind_state *state) ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = false; regs = NULL; - if (!__kernel_text_address(ip)) { - /* skip bogus %r14 */ + /* skip bogus %r14 or if is the same as regs->psw.addr */ + if (!__kernel_text_address(ip) || state->ip == unwind_recover_ret_addr(state, ip)) { state->regs = NULL; return unwind_next_frame(state); } @@ -103,13 +103,11 @@ bool unwind_next_frame(struct unwind_state *state) if (sp & 0x7) goto out_err; - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *) sp); - /* Update unwind state */ state->sp = sp; - state->ip = ip; state->regs = regs; state->reliable = reliable; + state->ip = unwind_recover_ret_addr(state, ip); return true; out_err: @@ -161,12 +159,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, ip = READ_ONCE_NOCHECK(sf->gprs[8]); } - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL); - /* Update unwind state */ state->sp = sp; - state->ip = ip; state->reliable = true; + state->ip = unwind_recover_ret_addr(state, ip); if (!first_frame) return; diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index bd3ef121c379..b88345ef8bd9 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -177,9 +177,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(*(ptr)) input; \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)ptr & mask) \ + if ((u64 __force)ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (get_user(input, ptr)) \ __rc = EMU_ADDRESSING; \ @@ -194,9 +192,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(ptr) __ptr = (ptr); \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)__ptr & mask) \ + if ((u64 __force)__ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (put_user(*(input), __ptr)) \ __rc = EMU_ADDRESSING; \ @@ -213,9 +209,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(*(ptr)) input; \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)ptr & mask) \ + if ((u64 __force)ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (get_user(input, ptr)) \ __rc = EMU_ADDRESSING; \ @@ -327,10 +321,6 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) break; case 0xc6: switch (insn->opc1) { - case 0x02: /* pfdrl */ - if (!test_facility(34)) - rc = EMU_ILLEGAL_OP; - break; case 0x04: /* cghrl */ rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64); break; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 386d4e42b8d3..f9810d2a267c 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -30,7 +30,7 @@ int __bootdata_preserved(prot_virt_host); EXPORT_SYMBOL(prot_virt_host); EXPORT_SYMBOL(uv_info); -static int __init uv_init(unsigned long stor_base, unsigned long stor_len) +static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len) { struct uv_cb_init uvcb = { .header.cmd = UVC_CMD_INIT_UV, @@ -49,12 +49,12 @@ static int __init uv_init(unsigned long stor_base, unsigned long stor_len) void __init setup_uv(void) { - unsigned long uv_stor_base; + void *uv_stor_base; if (!is_prot_virt_host()) return; - uv_stor_base = (unsigned long)memblock_alloc_try_nid( + uv_stor_base = memblock_alloc_try_nid( uv_info.uv_base_stor_len, SZ_1M, SZ_2G, MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); if (!uv_stor_base) { @@ -63,8 +63,8 @@ void __init setup_uv(void) goto fail; } - if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) { - memblock_phys_free(uv_stor_base, uv_info.uv_base_stor_len); + if (uv_init(__pa(uv_stor_base), uv_info.uv_base_stor_len)) { + memblock_free(uv_stor_base, uv_info.uv_base_stor_len); goto fail; } @@ -234,6 +234,32 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + /* * Requests the Ultravisor to make a page accessible to a guest. * If it's brought in the first time, it will be cleared. If @@ -277,6 +303,8 @@ again: lock_page(page); ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); + if (should_export_before_import(uvcb, gmap->mm)) + uv_convert_from_secure(page_to_phys(page)); rc = make_secure_pte(ptep, uaddr, page, uvcb); pte_unmap_unlock(ptep, ptelock); unlock_page(page); @@ -334,6 +362,61 @@ int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) } EXPORT_SYMBOL_GPL(gmap_convert_to_secure); +/** + * gmap_destroy_page - Destroy a guest page. + * @gmap: the gmap of the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + */ +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) +{ + struct vm_area_struct *vma; + unsigned long uaddr; + struct page *page; + int rc; + + rc = -EFAULT; + mmap_read_lock(gmap->mm); + + uaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(uaddr)) + goto out; + vma = vma_lookup(gmap->mm, uaddr); + if (!vma) + goto out; + /* + * Huge pages should not be able to become secure + */ + if (is_vm_hugetlb_page(vma)) + goto out; + + rc = 0; + /* we take an extra reference here */ + page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) + goto out; + rc = uv_destroy_owned_page(page_to_phys(page)); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_owned_from_secure(page_to_phys(page)); + put_page(page); +out: + mmap_read_unlock(gmap->mm); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_destroy_page); + /* * To be called with the page locked or with an extra reference! This will * prevent gmap_make_secure from touching the page concurrently. Having 2 @@ -392,6 +475,54 @@ static ssize_t uv_query_facilities(struct kobject *kobj, static struct kobj_attribute uv_query_facilities_attr = __ATTR(facilities, 0444, uv_query_facilities, NULL); +static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_se_hdr_ver_attr = + __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL); + +static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf); +} + +static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr = + __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL); + +static ssize_t uv_query_dump_cpu_len(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", + uv_info.guest_cpu_stor_len); +} + +static struct kobj_attribute uv_query_dump_cpu_len_attr = + __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL); + +static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", + uv_info.conf_dump_storage_state_len); +} + +static struct kobj_attribute uv_query_dump_storage_state_len_attr = + __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL); + +static ssize_t uv_query_dump_finalize_len(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", + uv_info.conf_dump_finalize_len); +} + +static struct kobj_attribute uv_query_dump_finalize_len_attr = + __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL); + static ssize_t uv_query_feature_indications(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -431,12 +562,37 @@ static ssize_t uv_query_max_guest_addr(struct kobject *kobj, static struct kobj_attribute uv_query_max_guest_addr_attr = __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); +static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_req_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = + __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); + +static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_pflags); +} + +static struct kobj_attribute uv_query_supp_att_pflags_attr = + __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); + static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, &uv_query_feature_indications_attr.attr, &uv_query_max_guest_cpus_attr.attr, &uv_query_max_guest_vms_attr.attr, &uv_query_max_guest_addr_attr.attr, + &uv_query_supp_se_hdr_ver_attr.attr, + &uv_query_supp_se_hdr_pcf_attr.attr, + &uv_query_dump_storage_state_len_attr.attr, + &uv_query_dump_finalize_len_attr.attr, + &uv_query_dump_cpu_len_attr.attr, + &uv_query_supp_att_req_hdr_ver_attr.attr, + &uv_query_supp_att_pflags_attr.attr, NULL, }; diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 99694260cac9..3105ca5bd470 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/smp.h> #include <linux/time_namespace.h> +#include <linux/random.h> #include <vdso/datapage.h> #include <asm/vdso.h> @@ -68,10 +69,11 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma) int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (!vma_is_special_mapping(vma, &vvar_mapping)) @@ -160,10 +162,9 @@ int vdso_getcpu_init(void) } early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) { - unsigned long vdso_text_len, vdso_mapping_len; - unsigned long vvar_start, vdso_text_start; + unsigned long vvar_start, vdso_text_start, vdso_text_len; struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -180,8 +181,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vdso_text_len = vdso64_end - vdso64_start; vdso_mapping = &vdso64_mapping; } - vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; - vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); + vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0); rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) goto out; @@ -210,6 +210,52 @@ out: return rc; } +static unsigned long vdso_addr(unsigned long start, unsigned long len) +{ + unsigned long addr, end, offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= VDSO_BASE) + end = VDSO_BASE; + end -= len; + + if (end > start) { + offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + return addr; +} + +unsigned long vdso_size(void) +{ + unsigned long size = VVAR_NR_PAGES * PAGE_SIZE; + + if (is_compat_task()) + size += vdso32_end - vdso32_start; + else + size += vdso64_end - vdso64_start; + return PAGE_ALIGN(size); +} + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + unsigned long addr = VDSO_BASE; + unsigned long size = vdso_size(); + + if (current->flags & PF_RANDOMIZE) + addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size); + return map_vdso(addr, size); +} + static struct page ** __init vdso_setup_pages(void *start, void *end) { int pages = (end - start) >> PAGE_SHIFT; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 42c43521878f..5ea3830af0cc 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -49,7 +49,6 @@ SECTIONS SOFTIRQENTRY_TEXT FTRACE_HOTPATCH_TRAMPOLINES_TEXT *(.text.*_indirect_*) - *(.fixup) *(.gnu.warning) . = ALIGN(PAGE_SIZE); _etext = .; /* End of text section */ @@ -132,6 +131,7 @@ SECTIONS /* * Table with the patch locations to undo expolines */ + . = ALIGN(4); .nospec_call_table : { __nospec_call_start = . ; *(.s390_indirect*) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index f216a1b2f825..9436f3053b88 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -128,13 +128,12 @@ static int do_account_vtime(struct task_struct *tsk) timer = S390_lowcore.last_update_timer; clock = S390_lowcore.last_update_clock; - /* Use STORE CLOCK by default, STORE CLOCK FAST if available. */ - alternative_io("stpt %0\n .insn s,0xb2050000,%1\n", - "stpt %0\n .insn s,0xb27c0000,%1\n", - 25, - ASM_OUTPUT2("=Q" (S390_lowcore.last_update_timer), - "=Q" (S390_lowcore.last_update_clock)), - ASM_NO_INPUT_CLOBBER("cc")); + asm volatile( + " stpt %0\n" /* Store current cpu timer value */ + " stckf %1" /* Store current tod clock value */ + : "=Q" (S390_lowcore.last_update_timer), + "=Q" (S390_lowcore.last_update_clock) + : : "cc"); clock = S390_lowcore.last_update_clock - clock; timer -= S390_lowcore.last_update_timer; |